tabula-extractor 0.6.6-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
@@ -0,0 +1,300 @@
|
|
1
|
+
module Tabula
  # A ruling (a straight line drawn on a PDF page), stored as a
  # java.awt.geom.Line2D::Float whose first endpoint is (left, top) and whose
  # second endpoint is (right, bottom).
  #
  # Most helpers only make sense for vertical or horizontal rulings and raise
  # NoMethodError for oblique ones.
  class Ruling < java.awt.geom.Line2D::Float

    # Some PDFs build tables from lines that very nearly intersect each other,
    # but not quite: e.g. a horizontal line spans the table at y = 100 while
    # each vertical column separator starts at y = 101 or 102. To treat such
    # lines as intersecting, both rulings are lengthened at each end before
    # the intersection test.
    #
    # Perpendicular rulings are each expanded by 2 pixels.
    PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2

    # Colinear or parallel rulings are each expanded by only 1 pixel, because
    # the expansions are additive: two vertical lines at x = 100, one ending
    # at y = 98 and the other starting at y = 102, would wrongly be reported
    # as nearly intersecting if each were expanded by 2 (both would reach
    # y = 100). A total expansion of 2 is an empirical value, not derived
    # from any specification.
    COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1

    # Consecutive segments sharing a position are merged by +clean_rulings+
    # when the gap between them is smaller than this value.
    SEGMENT_MERGE_MAX_GAP = 7

    attr_accessor :stroking_color

    # top, left:: coordinates of the first endpoint
    # width, height:: offsets from the first endpoint to the second one
    # stroking_color:: optional, stored as-is
    def initialize(top, left, width, height, stroking_color=nil)
      super(left, top, left+width, top+height)
      self.stroking_color = stroking_color
    end

    alias :top :getY1
    alias :left :getX1
    alias :bottom :getY2
    alias :right :getX2

    def top=(v)
      assign_endpoints(left, v, right, bottom)
    end

    def left=(v)
      assign_endpoints(v, top, right, bottom)
    end

    def bottom=(v)
      assign_endpoints(left, top, right, v)
    end

    def right=(v)
      assign_endpoints(left, top, v, bottom)
    end

    def width
      right - left
    end

    def height
      bottom - top
    end

    # The following accessors only make sense for non-oblique lines. They let
    # callers handle vertical and horizontal rulings with a single code path:
    # +position+ is the coordinate shared by both endpoints, +start+ and +end+
    # are the extremes along the ruling's direction.
    #
    # Each raises NoMethodError when the ruling is oblique.
    def position
      reject_oblique!('position')
      vertical? ? left : top
    end

    def start
      reject_oblique!('start')
      vertical? ? top : left
    end

    def end
      reject_oblique!('end')
      vertical? ? bottom : right
    end

    def position=(coord)
      reject_oblique!('position=')
      if vertical?
        self.left = coord
        self.right = coord
      else
        self.top = coord
        self.bottom = coord
      end
    end

    def start=(coord)
      reject_oblique!('start=')
      if vertical?
        self.top = coord
      else
        self.left = coord
      end
    end

    def end=(coord)
      reject_oblique!('end=')
      if vertical?
        self.bottom = coord
      else
        self.right = coord
      end
    end

    # True when this ruling intersects +another+, either exactly or after both
    # have been lengthened by the expansion amount matching their relative
    # orientation (see the constants above).
    #
    # Raises NoMethodError (from +expand+) when the rulings do not intersect
    # exactly and either of them is oblique.
    def nearlyIntersects?(another)
      return true if intersectsLine(another)

      amount = if perpendicular_to?(another)
                 PERPENDICULAR_PIXEL_EXPAND_AMOUNT
               else
                 COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT
               end
      expand(amount).intersectsLine(another.expand(amount))
    end

    ##
    # intersect this Ruling with a java.awt.geom.Rectangle2D
    #
    # Mutates the receiver: its endpoints become two opposite corners of the
    # intersection between its bounding box and +area+. Returns +self+.
    def intersect(area)
      i = getBounds2D.createIntersection(area)
      assign_endpoints(i.getX, i.getY, i.getX + i.getWidth, i.getY + i.getHeight)
      self
    end

    # Returns a new Ruling lengthened by +amt+ at both ends; the receiver is
    # left untouched. The copy does not carry over +stroking_color+.
    # Raises NoMethodError for oblique rulings.
    def expand(amt)
      reject_oblique!('expand')
      r = Ruling.new(top, left, width, height)
      r.start = r.start - amt
      r.end = r.end + amt
      r
    end

    # Euclidean distance between the two endpoints.
    def length
      Math.hypot(right - left, bottom - top)
    end

    def vertical?
      left == right
    end

    def horizontal?
      top == bottom
    end

    def oblique?
      !(vertical? || horizontal?)
    end

    # True when one ruling is vertical and the other horizontal.
    #
    # Oblique rulings are never reported as perpendicular: the bare
    # `vertical? == other.horizontal?` comparison would otherwise be true
    # whenever both predicates are false (e.g. for two oblique lines).
    def perpendicular_to?(other)
      return false if oblique? || other.oblique?
      vertical? == other.horizontal?
    end

    # Serializes the ruling as the JSON array [left, top, right, bottom].
    # Accepts (and forwards) the optional arguments of the usual to_json
    # protocol, so it can be called with or without a generator state.
    def to_json(*args)
      [left, top, right, bottom].to_json(*args)
    end

    # True when +point+ (anything responding to +x+ and +y+) lies inside the
    # left..right / top..bottom range of this ruling, bounds included.
    def colinear?(point)
      point.x >= left && point.x <= right &&
        point.y >= top && point.y <= bottom
    end

    ##
    # calculate the intersection point between +self+ and other Ruling
    #
    # Both rulings are first expanded by PERPENDICULAR_PIXEL_EXPAND_AMOUNT, so
    # rulings that only nearly touch still yield a point. Returns a
    # java.awt.geom.Point2D::Float, or nil when the expanded rulings do not
    # intersect or are parallel (no single intersection point exists).
    def intersection_point(other)
      # algo taken from http://mathworld.wolfram.com/Line-LineIntersection.html

      # self and other are expected to be perpendicular (one horizontal, one
      # vertical); +expand+ raises for oblique rulings.
      self_l = expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)
      other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)

      return nil unless self_l.intersectsLine(other_l)

      x1 = self_l.getX1; y1 = self_l.getY1
      x2 = self_l.getX2; y2 = self_l.getY2
      x3 = other_l.getX1; y3 = other_l.getY1
      x4 = other_l.getX2; y4 = other_l.getY2

      det = lambda { |a, b, c, d| a * d - b * c }

      # A zero denominator means the two lines are parallel or colinear;
      # dividing by it would produce NaN or infinite coordinates.
      denominator = det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
      return nil if denominator.zero?

      self_det = det.call(x1, y1, x2, y2)
      other_det = det.call(x3, y3, x4, y4)

      int_x = det.call(self_det, x1 - x2, other_det, x3 - x4) / denominator
      int_y = det.call(self_det, y1 - y2, other_det, y3 - y4) / denominator

      return nil if int_x.nan? || int_y.nan?

      java.awt.geom.Point2D::Float.new(int_x, int_y)
    end

    ##
    # Find all intersection points between two list of +Ruling+
    # (+horizontals+ and +verticals+)
    #
    # Returns a Hash mapping each intersection point to an array of
    # [horizontal, vertical] pairs; the rulings stored in the pairs are the
    # expanded copies, not the originals.
    # TODO: this is O(n^2) - optimize.
    def self.find_intersections(horizontals, verticals)
      horizontals.product(verticals).each_with_object({}) do |(h, v), found|
        ip = h.intersection_point(v)
        next if ip.nil?

        # TODO: hack for PDFs where lines appear to intersect, but don't:
        # keep the expanded rulings so they really do reach the point.
        (found[ip] ||= []) << [h.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
                               v.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)]
      end
    end

    ##
    # crop an enumerable of +Ruling+ to an +area+
    #
    # Rulings that do not intersect +area+ are left out. The ones that do are
    # cloned before being cropped, so the input rulings are not modified.
    def self.crop_rulings_to_area(rulings, area)
      rulings.select { |r| r.intersects(area) }
             .map { |r| r.clone.intersect(area) }
    end

    # Merges rulings that are close to each other and returns the resulting
    # horizontal rulings followed by the vertical ones. Oblique rulings are
    # discarded.
    #
    # 1. Segments sharing the same position are joined when the gap between
    #    consecutive segments is below SEGMENT_MERGE_MAX_GAP.
    # 2. Consecutive horizontal rulings whose +top+ values differ by less than
    #    +max_distance+ are replaced by a single ruling halfway between them.
    #
    # NOTE(review): in step 1, when several segments share a position, the
    # ones that are not close to a neighbour are dropped rather than kept
    # (only the "close" chunks survive the select). This is the historical
    # behaviour and is preserved here; confirm whether it is intended.
    #
    # TODO do we really need this one anymore?
    # TODO this should be iterative
    def self.clean_rulings(rulings, max_distance=4)
      horiz = rulings.select { |r| r.horizontal? }
                     .group_by(&:top)
                     .values
                     .flat_map { |rs|
                       merge_close_segments(rs.sort_by(&:left), :left, :right) { |first, last|
                         Tabula::Ruling.new(first.top, first.left, last.right - first.left, 0)
                       }
                     }
                     .sort_by(&:top)

      merged = []
      skip = false
      horiz.each_with_index do |current, i|
        # The previous iteration merged this ruling into its predecessor.
        # This must be checked before the "last element" case, otherwise a
        # merged last ruling would be emitted a second time.
        if skip
          skip = false
          next
        end

        if i == horiz.size - 1
          merged << current
          next
        end

        following = horiz[i + 1]
        d = (following.top - current.top).abs

        if d < max_distance # THRESHOLD DISTANCE between horizontal lines
          skip = true
          merged << Tabula::Ruling.new(current.top + d / 2,
                                       [current.left, following.left].min,
                                       [following.width.abs, current.width.abs].max,
                                       0)
        else
          merged << current
        end
      end

      vert = rulings.select { |r| r.vertical? }
                    .group_by(&:left)
                    .values
                    .flat_map { |rs|
                      merge_close_segments(rs.sort_by(&:top), :top, :bottom) { |first, last|
                        Tabula::Ruling.new(first.top, first.left, 0, last.bottom - first.top)
                      }
                    }
                    .sort_by(&:left)

      merged + vert
    end

    # Joins runs of consecutive segments that are close enough to each other.
    #
    # sorted:: rulings sharing a position, sorted by +start_attr+
    # start_attr, end_attr:: names of the accessors giving where a segment
    #                        begins and ends along its direction
    #
    # Yields the first and last ruling of every run and collects the block's
    # results. A single segment is returned untouched.
    def self.merge_close_segments(sorted, start_attr, end_attr)
      return [sorted.first] unless sorted.size > 1

      sorted.each_cons(2)
            .chunk { |pair| pair[1].send(start_attr) - pair[0].send(end_attr) < SEGMENT_MERGE_MAX_GAP }
            .select { |chunk| chunk[0] }
            .map { |chunk|
              group = chunk.last.flatten.uniq
              yield group.first, group.last
            }
    end
    private_class_method :merge_close_segments

    private

    # Replaces both endpoints at once through Line2D's float-based setLine
    # overload (selected explicitly with java_send).
    def assign_endpoints(x1, y1, x2, y2)
      java_send :setLine, [Java::float, Java::float, Java::float, Java::float], x1, y1, x2, y2
    end

    # Raises NoMethodError, naming +method_name+, when the ruling is oblique.
    def reject_oblique!(method_name)
      raise NoMethodError, "Oblique line #{self.inspect} has no ##{method_name} method." if oblique?
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Tabula
  # A table whose cells are delimited by ruling lines; a counterpart of Table.
  # The relationship between the two classes is not settled yet.
  #
  # Cell text is resolved lazily: the cells only get their text elements the
  # first time +fill_in_cells!+ runs.
  class Spreadsheet < ZoneEntity
    include Tabula::HasCells
    attr_accessor :cells, :vertical_ruling_lines, :horizontal_ruling_lines, :cells_resolved

    # top, left, width, height:: forwarded to the ZoneEntity constructor
    # page:: object used to look up the text of each cell (must respond to
    #        +get_cell_text+)
    # cells:: the cells making up this spreadsheet
    # vertical_ruling_lines, horizontal_ruling_lines:: the rulings delimiting
    #                                                  the cells
    def initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) #, lines)
      super(top, left, width, height)
      @cells = cells
      @page = page
      @vertical_ruling_lines = vertical_ruling_lines
      @horizontal_ruling_lines = horizontal_ruling_lines
    end

    # All ruling lines, vertical ones first.
    def ruling_lines
      @vertical_ruling_lines + @horizontal_ruling_lines
    end

    # Keeps, out of +lines+, the vertical and horizontal rulings that
    # intersect this spreadsheet's area.
    #
    # The intersection is tested against the spreadsheet itself (the previous
    # code referenced an undefined local, +spr+, and raised NameError).
    # NOTE(review): assumes ZoneEntity provides +intersectsLine+ (as a
    # java.awt.geom.Rectangle2D would) - confirm.
    def ruling_lines=(lines)
      @vertical_ruling_lines = lines.select { |vl| vl.vertical? && self.intersectsLine(vl) }
      @horizontal_ruling_lines = lines.select { |hl| hl.horizontal? && self.intersectsLine(hl) }
    end

    # Fetches the text elements of every cell from the page. Only does the
    # work once; later calls return immediately.
    def fill_in_cells!
      return if @cells_resolved

      @cells_resolved = true
      cells.each do |cell|
        cell.text_elements = @page.get_cell_text(cell)
      end
    end

    # Returns the cells as an array of rows, ordered top to bottom, each row
    # ordered left to right.
    #
    # call `rows` with `evaluate_cells` as `false` to defer filling in the text in
    # each cell, which can be computationally intensive.
    def rows(evaluate_cells=true)
      fill_in_cells! if evaluate_cells

      array_of_rows = cells.group_by(&:top)
                           .sort_by { |top, _| top }
                           .map { |_, row_cells| row_cells.sort_by(&:left) }

      # Insert placeholders for "empty corners" in the first row (like in
      # 01001523B_China.pdf): wherever the second row has a cell starting at
      # a left coordinate missing from the first row, add an empty cell there.
      # TODO: support placeholders for "empty" cells in rows other than row 1, and in #cols
      # NOTE(review): this only runs for more than two rows; a two-row
      # spreadsheet gets no placeholders - confirm this is intended.
      if array_of_rows.size > 2
        first_row_lefts = array_of_rows[0].map(&:left)
        second_row_lefts = array_of_rows[1].map(&:left)

        if first_row_lefts.uniq.size < second_row_lefts.uniq.size
          (second_row_lefts - first_row_lefts).each do |missing_left|
            placeholder_cell = Cell.new(array_of_rows[0][0].top, missing_left, 0, 0)
            placeholder_cell.placeholder = true
            array_of_rows[0] << placeholder_cell
          end
        end
        array_of_rows[0].sort_by!(&:left)
      end

      array_of_rows
    end

    # Returns the cells as an array of columns, ordered left to right, each
    # column ordered top to bottom.
    #
    # call `cols` with `evaluate_cells` as `false` to defer filling in the text in
    # each cell, which can be computationally intensive.
    def cols(evaluate_cells=true)
      fill_in_cells! if evaluate_cells

      cells.group_by(&:left)
           .sort_by { |left, _| left }
           .map { |_, col_cells| col_cells.sort_by(&:top) }
    end

    # Rows as arrays of cell text (+rows+ resolves the cell text first).
    def to_a
      rows.map { |row_cells| row_cells.map(&:text) }
    end

    # The rows, written by Tabula::Writers.CSV, as a String.
    def to_csv
      out = StringIO.new
      Tabula::Writers.CSV(rows, out)
      out.string
    end

    # The rows, written by Tabula::Writers.TSV, as a String.
    def to_tsv
      out = StringIO.new
      Tabula::Writers.TSV(rows, out)
      out.string
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module Tabula
  # A table stored as a list of Line objects, each holding the text elements
  # of one row.
  class Table
    attr_reader :lines

    # line_count:: number of (initially empty) lines to create
    # separators:: stored as-is; not used by the methods of this class
    def initialize(line_count, separators)
      @separators = separators
      @lines = (0...line_count).map { Line.new }
    end

    # Puts +text_element+ at row +i+, column +j+. If that position is already
    # occupied, the new element is merged into the existing one.
    #
    # When +i+ is past the last line, every missing line up to +i+ is created,
    # so that no nil hole is left in +lines+ (holes would make +rows+ and
    # +cols+ fail).
    def add_text_element(text_element, i, j)
      @lines << Line.new while @lines.size <= i

      row = @lines[i].text_elements
      if row[j]
        row[j].merge!(text_element)
      else
        row[j] = text_element
      end
    end

    # Appends empty text elements to the shorter lines so that every line has
    # as many elements as the longest one. Modifies the lines in place.
    def rpad!
      widest = lines.map { |l| l.text_elements.size }.max
      lines.each do |line|
        (widest - line.text_elements.size).times do
          line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
        end
      end
    end

    # The text elements grouped by column (pads the lines first).
    def cols
      rpad!
      lines.map(&:text_elements).transpose
    end

    # The text elements grouped by row (pads the lines first).
    def rows
      rpad!
      lines.map(&:text_elements)
    end

    # create a new Table object from an array of arrays, representing a list of rows in a spreadsheet
    # probably only used for testing
    def self.new_from_array(array_of_rows)
      t = Table.new(array_of_rows.size, [])
      array_of_rows.each_with_index do |row, index|
        t.lines[index].text_elements = row.map { |cell| TextElement.new(nil, nil, nil, nil, nil, nil, cell, nil) }
      end
      t
    end

    # for equality testing, return @lines stripped of leading columns of empty strings
    #
    # The number of stripped columns is the smallest count of leading empty
    # (nil or empty-text) elements among the lines that contain some text.
    # Lines without any text do not count towards that minimum; if no line
    # has text, or if +lines+ contains nil, nothing is stripped.
    #
    # Despite its name this modifies the text elements of each line in place.
    # TODO: write a method to strip all totally-empty columns (or not?)
    def lstrip_lines
      return @lines if @lines.include?(nil)

      # Index of the first non-empty element of each line (nil when a line
      # has none, hence the compact).
      leading_empty_counts = @lines.map { |line|
        line.text_elements.index { |t| !(t.nil? || t.text.empty?) }
      }.compact

      strip_count = leading_empty_counts.min
      return @lines if strip_count.nil? || strip_count.zero?

      @lines.each do |line|
        # Slicing past the end of a short, all-empty line returns nil.
        line.text_elements = line.text_elements[strip_count..-1] || []
      end
      @lines
    end

    def lstrip_lines!
      @lines = lstrip_lines
    end

    # used for testing, ignores separator locations (they'll sometimes be nil/empty)
    #
    # Returns false when +other+ is not a Table. Otherwise compares the two
    # tables line by line after stripping leading empty columns.
    #
    # NOTE: this modifies both tables - their lines are left-stripped and the
    # shorter list of lines is padded with nil.
    def ==(other)
      return false unless other.is_a?(Table)

      @lines = lstrip_lines
      other.instance_variable_set(:@lines, other.lstrip_lines)
      @lines = lines.rpad(nil, other.lines.size)
      other.instance_variable_set(:@lines, other.lines.rpad(nil, lines.size))

      lines.zip(other.lines).all? { |mine, yours| mine == yours }
    end
  end
end
|