tabula-extractor 0.7.2-java → 0.7.4-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +4 -8
- data/bin/tabula +3 -3
- data/lib/tabula.rb +9 -5
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/cell.rb +6 -4
- data/lib/tabula/entities/has_cells.rb +22 -78
- data/lib/tabula/entities/line.rb +52 -6
- data/lib/tabula/entities/page.rb +43 -50
- data/lib/tabula/entities/ruling.rb +83 -105
- data/lib/tabula/entities/spreadsheet.rb +74 -11
- data/lib/tabula/entities/table.rb +55 -37
- data/lib/tabula/entities/tabular.rb +42 -0
- data/lib/tabula/entities/text_chunk.rb +55 -52
- data/lib/tabula/entities/text_element.rb +129 -62
- data/lib/tabula/entities/zone_entity.rb +15 -6
- data/lib/tabula/extraction.rb +114 -49
- data/lib/tabula/line_segment_detector.rb +0 -5
- data/lib/tabula/table_extractor.rb +32 -37
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -5
- metadata +13 -95
- data/ext/COPYING +0 -661
- data/ext/Makefile.OSX +0 -18
- data/ext/Makefile.defaults +0 -9
- data/ext/Makefile.linux32 +0 -11
- data/ext/Makefile.linux64 +0 -12
- data/ext/Makefile.mingw +0 -10
- data/ext/Makefile.mingw64 +0 -10
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +0 -3
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +0 -2270
- data/ext/lsd.h +0 -283
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/argentina_diputados_voting_record.pdf +0 -0
- data/test/data/bo_page24.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +0 -88
- data/test/data/gre.pdf +0 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +0 -21
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/tabla_subsidios.pdf +0 -0
- data/test/data/vertical_rulings_bug.pdf +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +0 -50
- data/test/test_bin_tabula.sh +0 -7
- data/test/tests.rb +0 -603
@@ -37,9 +37,11 @@ module Tabula
|
|
37
37
|
bottom - top
|
38
38
|
end
|
39
39
|
|
40
|
-
|
41
40
|
# attributes that make sense only for non-oblique lines
|
42
41
|
# these are used to have a single collapse method (in page, currently)
|
42
|
+
|
43
|
+
##
|
44
|
+
# `x` (left) coordinate if line vertical, `y` (top) if horizontal
|
43
45
|
def position
|
44
46
|
raise NoMethodError, "Oblique line #{self.inspect} has no #position method." if oblique?
|
45
47
|
vertical? ? left : top
|
@@ -153,6 +155,10 @@ module Tabula
|
|
153
155
|
point.y >= top && point.y <= bottom
|
154
156
|
end
|
155
157
|
|
158
|
+
def ==(other)
|
159
|
+
return self.getX1 == other.getX1 && self.getY1 == other.getY1 && self.getX2 == other.getX2 && self.getY2 == other.getY2
|
160
|
+
end
|
161
|
+
|
156
162
|
##
|
157
163
|
# calculate the intersection point between +self+ and other Ruling
|
158
164
|
def intersection_point(other)
|
@@ -164,40 +170,71 @@ module Tabula
|
|
164
170
|
|
165
171
|
return nil if !self_l.intersectsLine(other_l)
|
166
172
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
+
horizontal, vertical = if self_l.horizontal? && other_l.vertical?
|
174
|
+
[self_l, other]
|
175
|
+
elsif self_l.vertical? && other_l.horizontal?
|
176
|
+
[other_l, self_l]
|
177
|
+
else
|
178
|
+
raise ArgumentError, "must be orthogonal, horizontal and vertical"
|
179
|
+
end
|
173
180
|
|
174
|
-
int_x = det.call(det.call(x1, y1, x2, y2), x1 - x2, det.call(x3, y3, x4, y4), x3 - x4) /
|
175
|
-
det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
|
176
181
|
|
177
|
-
|
178
|
-
det.call(x3, y3, x4, y4), y3 - y4) /
|
179
|
-
det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
|
180
|
-
|
181
|
-
return nil if int_x.nan? || int_y.nan? # TODO is this right?
|
182
|
+
java.awt.geom.Point2D::Float.new(vertical.getX1, horizontal.getY1)
|
182
183
|
|
184
|
+
end
|
183
185
|
|
184
|
-
|
186
|
+
class HSegmentComparator
|
187
|
+
include java.util.Comparator
|
188
|
+
def compare(o1, o2)
|
189
|
+
o1.top <=> o2.top
|
190
|
+
end
|
185
191
|
end
|
186
192
|
|
187
193
|
##
|
188
|
-
#
|
189
|
-
#
|
190
|
-
# TODO: this is O(n^2) - optimize.
|
194
|
+
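# sweep-line implementation of find_intersections: events are sorted by position (O(n log n)), then each vertical is tested against the currently open horizontals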
|
195
|
+
# based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
191
196
|
def self.find_intersections(horizontals, verticals)
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
197
|
+
tree = java.util.TreeMap.new(HSegmentComparator.new)
|
198
|
+
sort_obj = Struct.new(:type, :pos, :obj)
|
199
|
+
|
200
|
+
(horizontals + verticals)
|
201
|
+
.flat_map { |r|
|
202
|
+
r.vertical? ? sort_obj.new(:v, r.left, r) : [sort_obj.new(:hl, r.left, r),
|
203
|
+
sort_obj.new(:hr, r.right, r)]
|
204
|
+
}
|
205
|
+
.sort { |a,b|
|
206
|
+
if a.pos == b.pos
|
207
|
+
if a.type == :v && b.type == :hl
|
208
|
+
1
|
209
|
+
elsif a.type == :v && b.type == :hr
|
210
|
+
-1
|
211
|
+
elsif a.type == :hl && b.type == :v
|
212
|
+
-1
|
213
|
+
elsif a.type == :hr && b.type == :v
|
214
|
+
1
|
215
|
+
else
|
216
|
+
a.pos <=> b.pos
|
217
|
+
end
|
218
|
+
else
|
219
|
+
a.pos <=> b.pos
|
220
|
+
end
|
221
|
+
}
|
222
|
+
.inject({}) { |memo, e|
|
223
|
+
case e.type
|
224
|
+
when :v
|
225
|
+
tree.each { |h,_|
|
226
|
+
i = h.intersection_point(e.obj)
|
227
|
+
next memo if i.nil?
|
228
|
+
memo[i] = [h.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
|
229
|
+
e.obj.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)]
|
230
|
+
}
|
231
|
+
when :hr
|
232
|
+
tree.remove(e.obj)
|
233
|
+
when :hl
|
234
|
+
tree[e.obj] = 1
|
235
|
+
end
|
236
|
+
memo
|
237
|
+
}
|
201
238
|
end
|
202
239
|
|
203
240
|
##
|
@@ -211,90 +248,31 @@ module Tabula
|
|
211
248
|
end
|
212
249
|
end
|
213
250
|
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
skip = false
|
221
|
-
|
222
|
-
horiz = rulings.select { |r| r.horizontal? }
|
223
|
-
.group_by(&:top)
|
224
|
-
.values.reduce([]) do |memo, rs|
|
225
|
-
|
226
|
-
rs = rs.sort_by(&:left)
|
227
|
-
if rs.size > 1
|
228
|
-
memo +=
|
229
|
-
rs.each_cons(2)
|
230
|
-
.chunk { |p| p[1].left - p[0].right < 7 }
|
231
|
-
.select { |c| c[0] }
|
232
|
-
.map { |group|
|
233
|
-
group = group.last.flatten.uniq
|
234
|
-
Tabula::Ruling.new(group[0].top,
|
235
|
-
group[0].left,
|
236
|
-
group[-1].right - group[0].left,
|
237
|
-
0)
|
238
|
-
}
|
239
|
-
Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
|
240
|
-
else
|
241
|
-
memo << rs.first
|
242
|
-
end
|
243
|
-
memo
|
251
|
+
def self.collapse_oriented_rulings(lines)
|
252
|
+
# lines must all share one orientation (i.e. all horizontal or all vertical)
|
253
|
+
|
254
|
+
if lines.empty?
|
255
|
+
return []
|
244
256
|
end
|
245
|
-
.sort_by(&:top)
|
246
257
|
|
247
|
-
|
248
|
-
horiz.size.times do |i|
|
258
|
+
lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
|
249
259
|
|
250
|
-
|
251
|
-
|
252
|
-
break
|
253
|
-
end
|
260
|
+
lines = lines.inject([lines.shift]) do |memo, next_line|
|
261
|
+
last = memo.last
|
254
262
|
|
255
|
-
if
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
else
|
265
|
-
horiz[i]
|
266
|
-
end
|
267
|
-
end
|
268
|
-
horiz = h
|
269
|
-
|
270
|
-
vert = rulings.select { |r| r.vertical? }
|
271
|
-
.group_by(&:left)
|
272
|
-
.values
|
273
|
-
.reduce([]) do |memo, rs|
|
274
|
-
|
275
|
-
rs = rs.sort_by(&:top)
|
276
|
-
|
277
|
-
if rs.size > 1
|
278
|
-
# Here be dragons:
|
279
|
-
# merge consecutive segments of lines that are close enough
|
280
|
-
memo +=
|
281
|
-
rs.each_cons(2)
|
282
|
-
.chunk { |p| p[1].top - p[0].bottom < 7 }
|
283
|
-
.select { |c| c[0] }
|
284
|
-
.map { |group|
|
285
|
-
group = group.last.flatten.uniq
|
286
|
-
Tabula::Ruling.new(group[0].top,
|
287
|
-
group[0].left,
|
288
|
-
0,
|
289
|
-
group[-1].bottom - group[0].top)
|
290
|
-
}
|
263
|
+
# if the current line is collinear with the next one and they are "close enough": expand the current line
|
264
|
+
if next_line.position == last.position && last.nearlyIntersects?(next_line)
|
265
|
+
memo.last.start = next_line.start < last.start ? next_line.start : last.start
|
266
|
+
memo.last.end = next_line.end < last.end ? last.end : next_line.end
|
267
|
+
memo
|
268
|
+
# if next line has no length, ignore it
|
269
|
+
elsif next_line.length == 0
|
270
|
+
memo
|
271
|
+
# otherwise, add it to the returned collection
|
291
272
|
else
|
292
|
-
memo <<
|
273
|
+
memo << next_line
|
293
274
|
end
|
294
|
-
|
295
|
-
end.sort_by(&:left)
|
296
|
-
|
297
|
-
return horiz += vert
|
275
|
+
end
|
298
276
|
end
|
299
277
|
end
|
300
278
|
end
|
@@ -5,7 +5,7 @@ module Tabula
|
|
5
5
|
# both should implement `cells`, `rows`, `cols`, `extraction_method`
|
6
6
|
|
7
7
|
class Spreadsheet < ZoneEntity
|
8
|
-
include Tabula::
|
8
|
+
include Tabula::Tabular
|
9
9
|
attr_accessor :cells, :vertical_ruling_lines, :horizontal_ruling_lines, :cells_resolved
|
10
10
|
attr_reader :extraction_method, :page
|
11
11
|
|
@@ -18,6 +18,10 @@ module Tabula
|
|
18
18
|
@extraction_method = "spreadsheet"
|
19
19
|
end
|
20
20
|
|
21
|
+
def self.empty(page)
|
22
|
+
Spreadsheet.new(0, 0, 0, 0, page, [], nil, nil)
|
23
|
+
end
|
24
|
+
|
21
25
|
def ruling_lines
|
22
26
|
@vertical_ruling_lines + @horizontal_ruling_lines
|
23
27
|
end
|
@@ -27,15 +31,6 @@ module Tabula
|
|
27
31
|
@horizontal_ruling_lines = lines.select{|hl| hl.horizontal? && spr.intersectsLine(hl) }
|
28
32
|
end
|
29
33
|
|
30
|
-
def fill_in_cells!
|
31
|
-
unless @cells_resolved
|
32
|
-
@cells_resolved = true
|
33
|
-
cells.each do |cell|
|
34
|
-
cell.text_elements = @page.get_cell_text(cell)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
34
|
# call `rows` with `evaluate_cells` as `false` to defer filling in the text in
|
40
35
|
# each cell, which can be computationally intensive.
|
41
36
|
def rows(evaluate_cells=true)
|
@@ -77,6 +72,64 @@ module Tabula
|
|
77
72
|
end
|
78
73
|
end
|
79
74
|
|
75
|
+
#######################################################
|
76
|
+
# Chapter 2 of Spreadsheet extraction, Spanning Cells #
|
77
|
+
#######################################################
|
78
|
+
#if c is a "spanning cell", that is
|
79
|
+
# if there are N>0 vertical lines strictly between this cell's left and right
|
80
|
+
#insert N placeholder cells after it with zero size (but same top)
|
81
|
+
def add_spanning_cells!
|
82
|
+
#rounding: because Cell.new_from_points, used in #find_cells above, has
|
83
|
+
# a float precision error where, for instance, a cell whose x2 coord is
|
84
|
+
# supposed to be 160.137451171875 comes out as 160.13745498657227 because
|
85
|
+
# of minus. :(
|
86
|
+
vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq #already sorted
|
87
|
+
horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted
|
88
|
+
|
89
|
+
cells.each do |c|
|
90
|
+
vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
|
91
|
+
horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }
|
92
|
+
|
93
|
+
unless vertical_rulings_spanned_over.empty?
|
94
|
+
c.spanning = true
|
95
|
+
vertical_rulings_spanned_over.each do |spanned_over_line_loc|
|
96
|
+
placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
|
97
|
+
placeholder.placeholder = true
|
98
|
+
cells << placeholder
|
99
|
+
end
|
100
|
+
end
|
101
|
+
unless horizontal_rulings_spanned_over.empty?
|
102
|
+
c.spanning = true
|
103
|
+
horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
|
104
|
+
placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
|
105
|
+
placeholder.placeholder = true
|
106
|
+
cells << placeholder
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
#if there's a spanning cell that spans over both rows and columns, then it has "double placeholder" cells
|
111
|
+
# e.g. -------------------
|
112
|
+
# | C | C | C | C | (this is some pretty sweet ASCII art, eh?)
|
113
|
+
# |-----------------|
|
114
|
+
# | C | C | C | C |
|
115
|
+
# |-----------------|
|
116
|
+
# | C | SC P | C | where SC is the "spanning cell" that holds all the text within its bounds
|
117
|
+
# |---- + ----| P is a "placeholder" cell with either zero width or zero height
|
118
|
+
# | C | P DP | C | DP is a "double placeholder" cell with zero width and zero height
|
119
|
+
# |---- + ----| C is an ordinary cell.
|
120
|
+
# | C | P DP | C |
|
121
|
+
# |-----------------|
|
122
|
+
|
123
|
+
unless (double_placeholders = vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over)).empty?
|
124
|
+
double_placeholders.each do |vert_spanned_over, horiz_spanned_over|
|
125
|
+
placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
|
126
|
+
placeholder.placeholder = true
|
127
|
+
cells << placeholder
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
80
133
|
def to_a
|
81
134
|
fill_in_cells!
|
82
135
|
rows.map{ |row_cells| row_cells.map(&:text) }
|
@@ -103,8 +156,18 @@ module Tabula
|
|
103
156
|
end
|
104
157
|
|
105
158
|
def +(other)
|
106
|
-
raise ArgumentError unless other.page == @page
|
159
|
+
raise ArgumentError, "Data can only be added if it's from the same PDF page" unless other.page == @page
|
107
160
|
Spreadsheet.new(nil, nil, nil, nil, @page, @cells + other.cells, nil, nil )
|
108
161
|
end
|
162
|
+
|
163
|
+
protected
|
164
|
+
def fill_in_cells!
|
165
|
+
unless @cells_resolved
|
166
|
+
@cells_resolved = true
|
167
|
+
cells.each do |cell|
|
168
|
+
cell.text_elements = @page.get_cell_text(cell)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
109
172
|
end
|
110
173
|
end
|
@@ -1,7 +1,8 @@
|
|
1
1
|
module Tabula
|
2
2
|
class Table
|
3
|
+
include Tabula::Tabular
|
3
4
|
attr_reader :extraction_method
|
4
|
-
|
5
|
+
|
5
6
|
def initialize(line_count, separators)
|
6
7
|
@separators = separators
|
7
8
|
@lines = (0...line_count).inject([]) { |m| m << Line.new }
|
@@ -19,27 +20,24 @@ module Tabula
|
|
19
20
|
end
|
20
21
|
end
|
21
22
|
|
22
|
-
def rpad!
|
23
|
-
max = lines.map{|l| l.text_elements.size}.max
|
24
|
-
lines.each do |line|
|
25
|
-
needed = max - line.text_elements.size
|
26
|
-
needed.times do
|
27
|
-
line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
31
23
|
|
32
24
|
def cols
|
33
25
|
rows.transpose
|
34
26
|
end
|
35
27
|
|
28
|
+
# TODO: this is awful, refactor
|
36
29
|
def rows
|
37
|
-
|
38
|
-
|
30
|
+
rpad!
|
31
|
+
lstrip_lines!
|
32
|
+
li = lines.map do |l|
|
39
33
|
l.text_elements.map! do |te|
|
40
34
|
te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
41
35
|
end
|
42
|
-
end.
|
36
|
+
end.select do
|
37
|
+
|l| !l.all? { |te| te.text.empty? }
|
38
|
+
end.sort_by do |l|
|
39
|
+
l.map { |te| te.top || 0 }.max
|
40
|
+
end
|
43
41
|
end
|
44
42
|
|
45
43
|
# create a new Table object from an array of arrays, representing a list of rows in a spreadsheet
|
@@ -47,41 +45,26 @@ module Tabula
|
|
47
45
|
def self.new_from_array(array_of_rows)
|
48
46
|
t = Table.new(array_of_rows.size, [])
|
49
47
|
@extraction_method = "testing"
|
48
|
+
tlines = []
|
50
49
|
array_of_rows.each_with_index do |row, index|
|
51
|
-
|
50
|
+
l = Line.new
|
51
|
+
l.text_elements = row.each_with_index.map{|cell, inner_index| TextElement.new(index, inner_index, 1, 1, nil, nil, cell, nil)}
|
52
|
+
tlines << l
|
52
53
|
end
|
53
|
-
t.
|
54
|
+
t.instance_variable_set(:@lines, tlines)
|
55
|
+
t.send(:rpad!)
|
54
56
|
t
|
55
57
|
end
|
56
58
|
|
57
|
-
#for equality testing, return @lines stripped of leading columns of empty strings
|
58
|
-
#TODO: write a method to strip all totally-empty columns (or not?)
|
59
|
-
def lstrip_lines
|
60
|
-
return @lines if @lines.include?(nil)
|
61
|
-
min_leading_empty_strings = Float::INFINITY
|
62
|
-
@lines.each do |line|
|
63
|
-
empties = line.text_elements.map{|t| t.nil? || t.text.empty? }
|
64
|
-
min_leading_empty_strings = [min_leading_empty_strings, empties.index(false)].min
|
65
|
-
end
|
66
|
-
if min_leading_empty_strings == 0
|
67
|
-
@lines
|
68
|
-
else
|
69
|
-
@lines.each{|line| line.text_elements = line.text_elements[min_leading_empty_strings..-1]}
|
70
|
-
@lines
|
71
|
-
end
|
72
|
-
end
|
73
|
-
def lstrip_lines!
|
74
|
-
@lines = self.lstrip_lines
|
75
|
-
end
|
76
59
|
|
77
60
|
#used for testing, ignores separator locations (they'll sometimes be nil/empty)
|
78
61
|
def ==(other)
|
79
62
|
self.instance_variable_set(:@lines, self.lstrip_lines)
|
80
63
|
other.instance_variable_set(:@lines, other.lstrip_lines)
|
81
|
-
self.instance_variable_set(:@lines, self.lines.rpad(
|
82
|
-
other.instance_variable_set(:@lines, other.lines.rpad(
|
64
|
+
self.instance_variable_set(:@lines, self.lines.rpad(Line.new, other.lines.size))
|
65
|
+
other.instance_variable_set(:@lines, other.lines.rpad(Line.new, self.lines.size))
|
83
66
|
|
84
|
-
self.
|
67
|
+
self.rows.zip(other.rows).all? { |my, yours| my == yours }
|
85
68
|
|
86
69
|
end
|
87
70
|
|
@@ -89,6 +72,7 @@ module Tabula
|
|
89
72
|
{
|
90
73
|
'json_class' => self.class.name,
|
91
74
|
'extraction_method' => @extraction_method,
|
75
|
+
'vertical_separators' => @separators,
|
92
76
|
'data' => rows,
|
93
77
|
}.to_json(*a)
|
94
78
|
end
|
@@ -104,5 +88,39 @@ module Tabula
|
|
104
88
|
Tabula::Writers.TSV(rows, out)
|
105
89
|
out.string
|
106
90
|
end
|
91
|
+
|
92
|
+
protected
|
93
|
+
def rpad!
|
94
|
+
max = lines.map{|l| l.text_elements.size}.max
|
95
|
+
lines.each do |line|
|
96
|
+
needed = max - line.text_elements.size
|
97
|
+
needed.times do
|
98
|
+
line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
#for equality testing, return @lines stripped of leading columns of empty strings
|
104
|
+
#TODO: write a method to strip all totally-empty columns (or not?)
|
105
|
+
def lstrip_lines
|
106
|
+
min_leading_empty_strings = Float::INFINITY
|
107
|
+
@lines.each do |line|
|
108
|
+
empties = line.text_elements.map{|t| t.nil? || t.text.empty? }
|
109
|
+
min_leading_empty_strings = [min_leading_empty_strings,
|
110
|
+
empties.index(false) || 0].min
|
111
|
+
end
|
112
|
+
if min_leading_empty_strings == 0
|
113
|
+
@lines
|
114
|
+
else
|
115
|
+
@lines.each{|line| line.text_elements = line.text_elements[min_leading_empty_strings..-1]}
|
116
|
+
@lines
|
117
|
+
end
|
118
|
+
end
|
119
|
+
def lstrip_lines!
|
120
|
+
@lines = self.lstrip_lines
|
121
|
+
end
|
122
|
+
|
123
|
+
attr_accessor :lines
|
124
|
+
|
107
125
|
end
|
108
126
|
end
|