tabula-extractor 0.6.6-java → 0.7.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
@@ -0,0 +1,300 @@
|
|
1
|
+
module Tabula
|
2
|
+
class Ruling < java.awt.geom.Line2D::Float
|
3
|
+
|
4
|
+
attr_accessor :stroking_color
|
5
|
+
|
6
|
+
# Build a ruling from a top/left origin plus a width and a height.
# The superclass constructor receives the two endpoints as
# (left, top, left + width, top + height); +stroking_color+ is optional
# and stored untouched.
def initialize(top, left, width, height, stroking_color=nil)
  x2 = left + width
  y2 = top + height
  super(left, top, x2, y2)
  self.stroking_color = stroking_color
end
|
10
|
+
|
11
|
+
alias :top :getY1
|
12
|
+
alias :left :getX1
|
13
|
+
alias :bottom :getY2
|
14
|
+
alias :right :getX2
|
15
|
+
|
16
|
+
# Replace the first endpoint's y coordinate with +v+; the other three
# coordinates are re-sent unchanged. setLine is invoked through java_send
# with an explicit four-float signature.
def top=(v)
  float_signature = [Java::float] * 4
  java_send(:setLine, float_signature, left, v, right, bottom)
end
|
19
|
+
|
20
|
+
# Replace the first endpoint's x coordinate with +v+; the other three
# coordinates are re-sent unchanged. setLine is invoked through java_send
# with an explicit four-float signature.
def left=(v)
  float_signature = [Java::float] * 4
  java_send(:setLine, float_signature, v, top, right, bottom)
end
|
23
|
+
|
24
|
+
# Replace the second endpoint's y coordinate with +v+; the other three
# coordinates are re-sent unchanged. setLine is invoked through java_send
# with an explicit four-float signature.
def bottom=(v)
  float_signature = [Java::float] * 4
  java_send(:setLine, float_signature, left, top, right, v)
end
|
27
|
+
|
28
|
+
# Replace the second endpoint's x coordinate with +v+; the other three
# coordinates are re-sent unchanged. setLine is invoked through java_send
# with an explicit four-float signature.
def right=(v)
  float_signature = [Java::float] * 4
  java_send(:setLine, float_signature, left, top, v, bottom)
end
|
31
|
+
|
32
|
+
# Signed horizontal extent: +right+ minus +left+ (negative when the
# second endpoint lies to the left of the first).
def width
  far_x = right
  far_x - left
end
|
35
|
+
|
36
|
+
# Signed vertical extent: +bottom+ minus +top+ (negative when the
# second endpoint lies above the first).
def height
  far_y = bottom
  far_y - top
end
|
39
|
+
|
40
|
+
|
41
|
+
# attributes that make sense only for non-oblique lines
|
42
|
+
# these are used to have a single collapse method (in page, currently)
|
43
|
+
# Coordinate on the axis the ruling does not run along:
# +left+ for a vertical ruling, +top+ otherwise.
# Raises NoMethodError for oblique rulings.
def position
  if oblique?
    raise NoMethodError, "Oblique line #{self.inspect} has no #position method."
  end
  return left if vertical?
  top
end
|
47
|
+
# First coordinate along the ruling's own axis:
# +top+ for a vertical ruling, +left+ otherwise.
# Raises NoMethodError for oblique rulings.
def start
  if oblique?
    raise NoMethodError, "Oblique line #{self.inspect} has no #start method."
  end
  return top if vertical?
  left
end
|
51
|
+
# Second coordinate along the ruling's own axis:
# +bottom+ for a vertical ruling, +right+ otherwise.
# Raises NoMethodError for oblique rulings.
def end
  if oblique?
    raise NoMethodError, "Oblique line #{self.inspect} has no #end method."
  end
  return bottom if vertical?
  right
end
|
55
|
+
# Move the ruling sideways: both x coordinates of a vertical ruling, or
# both y coordinates of any other non-oblique ruling, are set to +coord+.
# Raises NoMethodError for oblique rulings.
def position=(coord)
  raise NoMethodError, "Oblique line #{self.inspect} has no #position= method." if oblique?
  unless vertical?
    self.top = coord
    return self.bottom = coord
  end
  self.left = coord
  self.right = coord
end
|
65
|
+
# Set the first coordinate along the ruling's own axis:
# +top+ for a vertical ruling, +left+ otherwise.
# Raises NoMethodError for oblique rulings.
def start=(coord)
  raise NoMethodError, "Oblique line #{self.inspect} has no #start= method." if oblique?
  return self.top = coord if vertical?
  self.left = coord
end
|
73
|
+
# Set the second coordinate along the ruling's own axis:
# +bottom+ for a vertical ruling, +right+ otherwise.
# Raises NoMethodError for oblique rulings.
def end=(coord)
  raise NoMethodError, "Oblique line #{self.inspect} has no #end= method." if oblique?
  return self.bottom = coord if vertical?
  self.right = coord
end
|
81
|
+
|
82
|
+
#ok wtf are you doing, Jeremy?
|
83
|
+
# some PDFs (garment factory audits, precise link TK) make tables by drawing lines that
|
84
|
+
# very nearly intersect each other, but not quite. E.g. a horizontal line spans the table at a Y val of 100
|
85
|
+
# and each vertical line (i.e. column separating ruling line) starts at 101 or 102.
|
86
|
+
# this is very annoying. so we check if those lines nearly overlap by expanding each pair
|
87
|
+
# by 2 pixels in each direction (so the vertical lines' top becomes 99 or 100, and then the expanded versions overlap)
|
88
|
+
|
89
|
+
PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2
|
90
|
+
COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1
|
91
|
+
|
92
|
+
# if the lines we're comparing are colinear or parallel, we expand them by only 1 pixel,
|
93
|
+
# because the expansions are additive
|
94
|
+
# (e.g. two vertical lines, at x = 100, with one having y2 of 98 and the other having y1 of 102 would
|
95
|
+
# erroneously be said to nearlyIntersect if they were each expanded by 2 (since they'd both terminate at 100).
|
96
|
+
# The COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT is only 1 so the total expansion is 2.
|
97
|
+
# A total expansion amount of 2 is empirically verified to work sometimes. It's not a magic number from any
|
98
|
+
# source other than a little bit of experience.)
|
99
|
+
|
100
|
+
# True when the two rulings intersect outright, or when they intersect
# after both have been lengthened with #expand. Perpendicular pairs are
# lengthened by PERPENDICULAR_PIXEL_EXPAND_AMOUNT, every other pair by
# COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT.
# #expand raises NoMethodError for oblique rulings, so this method does
# too whenever the expansion step is reached with one.
def nearlyIntersects?(another)
  return true if intersectsLine(another)

  amount = if perpendicular_to?(another)
    PERPENDICULAR_PIXEL_EXPAND_AMOUNT
  else
    COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT
  end
  expand(amount).intersectsLine(another.expand(amount))
end
|
109
|
+
|
110
|
+
##
# intersect this Ruling with a java.awt.geom.Rectangle2D
#
# The ruling's bounding box is intersected with +area+ and the receiver's
# endpoints are overwritten in place with that rectangle's
# (x, y) and (x + width, y + height) corners. Returns +self+.
def intersect(area)
  clipped = getBounds2D.createIntersection(area)
  x1 = clipped.getX
  y1 = clipped.getY
  java_send(:setLine, [Java::float] * 4, x1, y1, x1 + clipped.getWidth, y1 + clipped.getHeight)
  self
end
|
117
|
+
|
118
|
+
# Return a new Ruling lengthened by +amt+ at both ends along its own axis
# (start moved back by +amt+, end moved forward by +amt+). The receiver is
# left untouched. The copy is built from top/left/width/height only, so it
# does not carry the receiver's stroking_color.
# Raises NoMethodError for oblique rulings.
def expand(amt)
  raise NoMethodError, "Oblique line #{self.inspect} has no #expand method." if oblique?
  Ruling.new(top, left, width, height).tap do |grown|
    grown.start -= amt
    grown.end += amt
  end
end
|
125
|
+
|
126
|
+
|
127
|
+
# Euclidean distance between the ruling's two endpoints.
def length
  dx = self.right - self.left
  dy = self.bottom - self.top
  Math.sqrt(dx ** 2 + dy ** 2)
end
|
130
|
+
|
131
|
+
# True when both endpoints share exactly the same x coordinate.
def vertical?
  x_start = left
  x_start == right
end
|
134
|
+
|
135
|
+
# True when both endpoints share exactly the same y coordinate.
def horizontal?
  y_start = top
  y_start == bottom
end
|
138
|
+
|
139
|
+
# True when the ruling is neither vertical nor horizontal.
def oblique?
  !vertical? && !horizontal?
end
|
142
|
+
|
143
|
+
# True when one of the two rulings is vertical and the other horizontal.
#
# Oblique rulings are never reported as perpendicular. Comparing the two
# predicates alone would answer true for e.g. a horizontal ruling paired
# with an oblique one (false == false), so oblique operands are rejected
# first. Results for non-oblique rulings are unchanged.
def perpendicular_to?(other)
  return false if oblique? || other.oblique?
  vertical? == other.horizontal?
end
|
146
|
+
|
147
|
+
# Serialize the ruling as the JSON array [left, top, right, bottom].
#
# Any arguments (the JSON generator passes its state object) are optional
# and are forwarded to Array#to_json, so both +ruling.to_json+ and
# +JSON.generate+ work.
def to_json(*args)
  [left, top, right, bottom].to_json(*args)
end
|
150
|
+
|
151
|
+
# True when +point+ (anything responding to #x and #y) lies inside the
# inclusive box left..right / top..bottom. Only a box test is performed,
# so the answer is always false when left > right or top > bottom.
def colinear?(point)
  within_x = point.x >= left && point.x <= right
  within_x && point.y >= top && point.y <= bottom
end
|
155
|
+
|
156
|
+
##
# calculate the intersection point between +self+ and other Ruling
#
# Both rulings are first lengthened by PERPENDICULAR_PIXEL_EXPAND_AMOUNT
# (see #expand, which raises NoMethodError for oblique rulings) so that
# rulings that stop just short of each other still produce a point.
#
# Returns a java.awt.geom.Point2D::Float, or nil when the expanded rulings
# do not intersect, when they are parallel (zero denominator), or when the
# computed coordinates are not finite.
def intersection_point(other)
  # algo taken from http://mathworld.wolfram.com/Line-LineIntersection.html

  # self and other should always be perpendicular, since one should be horizontal and one should be vertical
  self_l = self.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)
  other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)

  return nil unless self_l.intersectsLine(other_l)

  x1 = self_l.getX1; y1 = self_l.getY1
  x2 = self_l.getX2; y2 = self_l.getY2
  x3 = other_l.getX1; y3 = other_l.getY1
  x4 = other_l.getX2; y4 = other_l.getY2

  det = lambda { |a, b, c, d| a * d - b * c }

  # Shared denominator; zero means the two lines are parallel or colinear,
  # in which case there is no single intersection point.
  denominator = det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
  return nil if denominator.zero?

  self_det = det.call(x1, y1, x2, y2)
  other_det = det.call(x3, y3, x4, y4)

  int_x = det.call(self_det, x1 - x2, other_det, x3 - x4) / denominator
  int_y = det.call(self_det, y1 - y2, other_det, y3 - y4) / denominator

  # Reject NaN and infinite coordinates rather than building a bogus point.
  return nil unless int_x.finite? && int_y.finite?

  java.awt.geom.Point2D::Float.new(int_x, int_y)
end
|
186
|
+
|
187
|
+
##
# Find all intersection points between two list of +Ruling+
# (+horizontals+ and +verticals+)
#
# Returns a Hash keyed by intersection point; each value is an Array of
# [horizontal, vertical] pairs, both lengthened by
# PERPENDICULAR_PIXEL_EXPAND_AMOUNT, that meet at that point.
# TODO: this is O(n^2) - optimize.
def self.find_intersections(horizontals, verticals)
  crossings = {}
  horizontals.each do |h|
    verticals.each do |v|
      point = h.intersection_point(v)
      next if point.nil?
      # TODO: stupid hack for FLA pdfs where lines appear to intersect, but don't.
      (crossings[point] ||= []) << [h.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), v.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)]
    end
  end
  crossings
end
|
202
|
+
|
203
|
+
##
# crop an enumerable of +Ruling+ to an +area+
#
# Rulings that do not intersect +area+ are left out. Every remaining
# ruling is cloned and the clone is clipped with #intersect, so the
# rulings passed in are not modified. Returns an Array.
def self.crop_rulings_to_area(rulings, area)
  touching = rulings.select { |ruling| ruling.intersects(area) }
  touching.map { |ruling| ruling.clone.intersect(area) }
end
|
213
|
+
|
214
|
+
# TODO do we really need this one anymore?
#
# Merge fragmented horizontal and vertical rulings. Oblique rulings are
# left out of the result.
#
# 1. Horizontal rulings sharing the same +top+ are sorted by +left+;
#    consecutive ones whose gap is below 7 are replaced by one ruling
#    spanning the run.
# 2. The resulting horizontals are sorted by +top+; a ruling closer than
#    +max_distance+ to the next one is merged with it (placed midway,
#    leftmost +left+, widest width).
# 3. Vertical rulings sharing the same +left+ get the same treatment as
#    in step 1, along the y axis.
#
# Returns the horizontals followed by the verticals.
#
# NOTE(review): in steps 1 and 3, when several rulings share a coordinate,
# only runs of "close" rulings are kept; rulings in such a group that are
# not close to a neighbour are dropped. Left as-is — confirm whether that
# is intended.
def self.clean_rulings(rulings, max_distance=4)

  # merge horizontal and vertical lines
  # TODO this should be iterative

  horiz = rulings.select { |r| r.horizontal? }
    .group_by(&:top)
    .values.reduce([]) do |memo, rs|

      rs = rs.sort_by(&:left)
      if rs.size > 1
        memo +=
          rs.each_cons(2)
          .chunk { |p| p[1].left - p[0].right < 7 }
          .select { |c| c[0] }
          .map { |group|
            group = group.last.flatten.uniq
            Tabula::Ruling.new(group[0].top,
                               group[0].left,
                               group[-1].right - group[0].left,
                               0)
          }
      else
        memo << rs.first
      end
      memo
    end
    .sort_by(&:top)

  h = []
  skip = false
  last_index = horiz.size - 1
  horiz.each_with_index do |current, i|

    # the previous iteration merged this ruling into its predecessor;
    # this must be checked before the last-element case so a merged
    # last ruling is not emitted a second time
    if skip
      skip = false
      next
    end

    if i == last_index
      h << current
      next
    end

    following = horiz[i + 1]
    d = (following.top - current.top).abs

    h << if d < max_distance # THRESHOLD DISTANCE between horizontal lines
           skip = true
           Tabula::Ruling.new(current.top + d / 2, [current.left, following.left].min, [following.width.abs, current.width.abs].max, 0)
         else
           current
         end
  end
  horiz = h

  vert = rulings.select { |r| r.vertical? }
    .group_by(&:left)
    .values
    .reduce([]) do |memo, rs|

      rs = rs.sort_by(&:top)

      if rs.size > 1
        # Here be dragons:
        # merge consecutive segments of lines that are close enough
        memo +=
          rs.each_cons(2)
          .chunk { |p| p[1].top - p[0].bottom < 7 }
          .select { |c| c[0] }
          .map { |group|
            group = group.last.flatten.uniq
            Tabula::Ruling.new(group[0].top,
                               group[0].left,
                               0,
                               group[-1].bottom - group[0].top)
          }
      else
        memo << rs.first
      end
      memo
    end.sort_by(&:left)

  horiz + vert
end
|
299
|
+
end
|
300
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Tabula
|
2
|
+
# a counterpart of Table, to be sure.
|
3
|
+
# not sure yet what their relationship ought to be.
|
4
|
+
class Spreadsheet < ZoneEntity
|
5
|
+
include Tabula::HasCells
|
6
|
+
attr_accessor :cells, :vertical_ruling_lines, :horizontal_ruling_lines, :cells_resolved
|
7
|
+
|
8
|
+
# Set up the spreadsheet's zone (top/left/width/height go to the
# superclass) and remember the page it sits on, its cells and its vertical
# and horizontal ruling lines. @cells_resolved is left unset, so the first
# call to #fill_in_cells! does the text lookup.
def initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) #, lines)
  super(top, left, width, height)
  @page, @cells = page, cells
  @vertical_ruling_lines, @horizontal_ruling_lines = vertical_ruling_lines, horizontal_ruling_lines
end
|
15
|
+
|
16
|
+
# All ruling lines of the spreadsheet as a new Array: the vertical ones
# followed by the horizontal ones.
def ruling_lines
  verticals = @vertical_ruling_lines
  verticals + @horizontal_ruling_lines
end
|
19
|
+
|
20
|
+
# Replace the spreadsheet's ruling lines with the members of +lines+ that
# intersect this spreadsheet, split into vertical and horizontal ones.
# Oblique lines are discarded.
#
# The intersection test is done against the spreadsheet itself; the
# previous code referred to an undefined local +spr+ and raised NameError.
# NOTE(review): assumes the superclass provides #intersectsLine (as
# java.awt.geom.Rectangle2D does) — confirm against ZoneEntity.
def ruling_lines=(lines)
  @vertical_ruling_lines = lines.select { |vl| vl.vertical? && intersectsLine(vl) }
  @horizontal_ruling_lines = lines.select { |hl| hl.horizontal? && intersectsLine(hl) }
end
|
24
|
+
|
25
|
+
# Look up the text of every cell on the page, once. The first call marks
# the spreadsheet as resolved and assigns @page.get_cell_text(cell) to each
# cell's text_elements; later calls return immediately.
def fill_in_cells!
  return if @cells_resolved

  @cells_resolved = true
  cells.each { |cell| cell.text_elements = @page.get_cell_text(cell) }
end
|
33
|
+
|
34
|
+
# call `rows` with `evaluate_cells` as `false` to defer filling in the text in
# each cell, which can be computationally intensive.
#
# Returns an Array of rows: cells grouped by +top+ (ascending), each row
# sorted by +left+.
#
# When there are more than two rows and the first row has fewer distinct
# +left+ values than the second, a zero-sized placeholder Cell
# (placeholder = true) is added to the first row for every +left+ of the
# second row that the first row lacks (empty corners, like in
# 01001523B_China.pdf).
#TODO: support placeholders for "empty" cells in rows other than row 1, and in #cols
def rows(evaluate_cells=true)
  fill_in_cells! if evaluate_cells

  grid = cells.map(&:top).uniq.sort.map do |row_top|
    cells.select { |c| c.top == row_top }.sort_by(&:left)
  end
  return grid unless grid.size > 2

  header = grid[0]
  header_lefts = header.map(&:left)
  second_lefts = grid[1].map(&:left)
  if header_lefts.uniq.size < second_lefts.uniq.size
    (second_lefts - header_lefts).each do |gap_left|
      filler = Cell.new(header[0].top, gap_left, 0, 0)
      filler.placeholder = true
      header << filler
    end
  end
  header.sort_by!(&:left)
  grid
end
|
62
|
+
|
63
|
+
# call `cols` with `evaluate_cells` as `false` to defer filling in the text in
# each cell, which can be computationally intensive.
#
# Returns an Array of columns: cells grouped by +left+ (ascending), each
# column sorted by +top+. No placeholder cells are added here.
def cols(evaluate_cells=true)
  fill_in_cells! if evaluate_cells

  column_lefts = cells.map { |c| c.left }.uniq.sort
  column_lefts.map do |x|
    cells.select { |c| c.left == x }.sort_by { |c| c.top }
  end
end
|
74
|
+
|
75
|
+
# The spreadsheet as an Array of rows, each an Array of cell texts.
# Cell text is resolved first.
def to_a
  fill_in_cells!
  rows.map do |row_cells|
    row_cells.map { |cell| cell.text }
  end
end
|
79
|
+
|
80
|
+
# Render #rows through Tabula::Writers.CSV into an in-memory buffer and
# return the buffer's contents as a String.
def to_csv
  StringIO.new.tap { |buffer| Tabula::Writers.CSV(rows, buffer) }.string
end
|
85
|
+
|
86
|
+
# Render #rows through Tabula::Writers.TSV into an in-memory buffer and
# return the buffer's contents as a String.
def to_tsv
  StringIO.new.tap { |buffer| Tabula::Writers.TSV(rows, buffer) }.string
end
|
91
|
+
end
|
92
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module Tabula
|
2
|
+
class Table
|
3
|
+
attr_reader :lines
|
4
|
+
# Create a table with +line_count+ empty Line objects and remember the
# given +separators+.
def initialize(line_count, separators)
  @separators = separators
  @lines = (0...line_count).map { Line.new }
end
|
8
|
+
|
9
|
+
# Put +text_element+ into row +i+, column +j+.
#
# Missing rows up to and including +i+ are created as empty Line objects,
# so that no nil gaps are left in @lines (a nil row makes #rpad!, #rows
# and #cols fail). If the slot already holds an element, the new one is
# merged into it with #merge!; otherwise it is stored as-is.
def add_text_element(text_element, i, j)
  @lines << Line.new while @lines.size <= i

  slot = @lines[i].text_elements
  if slot[j]
    slot[j].merge!(text_element)
  else
    slot[j] = text_element
  end
end
|
19
|
+
|
20
|
+
# Right-pad every line with empty-text TextElements until all lines have
# as many text elements as the longest one. Modifies the lines in place.
def rpad!
  widest = lines.map { |l| l.text_elements.size }.max
  lines.each do |line|
    (widest - line.text_elements.size).times do
      line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
    end
  end
end
|
29
|
+
|
30
|
+
# The table's text elements column by column: lines are right-padded to
# equal length first, then the row-major matrix is transposed.
def cols
  rpad!
  by_row = lines.map { |line| line.text_elements }
  by_row.transpose
end
|
34
|
+
|
35
|
+
# The table's text elements row by row, after right-padding all lines to
# equal length.
def rows
  rpad!
  lines.map { |line| line.text_elements }
end
|
39
|
+
|
40
|
+
# create a new Table object from an array of arrays, representing a list of rows in a spreadsheet
# probably only used for testing
#
# Every cell value becomes the text of a TextElement whose other
# attributes are nil; the table is created with an empty separator list.
def self.new_from_array(array_of_rows)
  table = Table.new(array_of_rows.size, [])
  table.lines.zip(array_of_rows) do |line, row|
    line.text_elements = row.map { |cell| TextElement.new(nil, nil, nil, nil, nil, nil, cell, nil) }
  end
  table
end
|
49
|
+
|
50
|
+
#for equality testing, return @lines stripped of leading columns of empty strings
#TODO: write a method to strip all totally-empty columns (or not?)
#
# The number of columns removed is the smallest count of leading empty
# (nil or empty-text) elements over all lines. A line with no non-empty
# element counts with its full length, so fully empty lines no longer make
# the computation raise. Lines are modified in place and @lines is
# returned. When @lines contains nil, it is returned untouched.
def lstrip_lines
  return @lines if @lines.include?(nil)

  leading_empty_counts = @lines.map do |line|
    empties = line.text_elements.map { |t| t.nil? || t.text.empty? }
    empties.index(false) || empties.size
  end
  strip = leading_empty_counts.min
  # nothing to do for an empty table or when some line starts with content
  return @lines if strip.nil? || strip == 0

  @lines.each { |line| line.text_elements = line.text_elements[strip..-1] }
  @lines
end
|
66
|
+
# Store the result of #lstrip_lines back into @lines.
def lstrip_lines!
  stripped = lstrip_lines
  @lines = stripped
end
|
69
|
+
|
70
|
+
#used for testing, ignores separator locations (they'll sometimes be nil/empty)
#
# Compares two tables line by line. Note that the comparison is not
# side-effect free: both tables have their leading empty columns stripped
# (see #lstrip_lines) and their line lists padded with nil to each other's
# length, and the results are written back into both objects' @lines.
# NOTE(review): relies on an Array#rpad(value, size) extension that is not
# defined in this file — presumably it comes from the gem's core
# extensions; confirm.
def ==(other)
  @lines = lstrip_lines
  other.instance_variable_set(:@lines, other.lstrip_lines)
  @lines = lines.rpad(nil, other.lines.size)
  other.instance_variable_set(:@lines, other.lines.rpad(nil, lines.size))

  lines.zip(other.lines).all? { |mine, theirs| mine == theirs }
end
|
80
|
+
end
|
81
|
+
end
|