tabula-extractor 0.7.2-java → 0.7.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +4 -8
- data/bin/tabula +3 -3
- data/lib/tabula.rb +9 -5
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/cell.rb +6 -4
- data/lib/tabula/entities/has_cells.rb +22 -78
- data/lib/tabula/entities/line.rb +52 -6
- data/lib/tabula/entities/page.rb +43 -50
- data/lib/tabula/entities/ruling.rb +83 -105
- data/lib/tabula/entities/spreadsheet.rb +74 -11
- data/lib/tabula/entities/table.rb +55 -37
- data/lib/tabula/entities/tabular.rb +42 -0
- data/lib/tabula/entities/text_chunk.rb +55 -52
- data/lib/tabula/entities/text_element.rb +129 -62
- data/lib/tabula/entities/zone_entity.rb +15 -6
- data/lib/tabula/extraction.rb +114 -49
- data/lib/tabula/line_segment_detector.rb +0 -5
- data/lib/tabula/table_extractor.rb +32 -37
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -5
- metadata +13 -95
- data/ext/COPYING +0 -661
- data/ext/Makefile.OSX +0 -18
- data/ext/Makefile.defaults +0 -9
- data/ext/Makefile.linux32 +0 -11
- data/ext/Makefile.linux64 +0 -12
- data/ext/Makefile.mingw +0 -10
- data/ext/Makefile.mingw64 +0 -10
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +0 -3
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +0 -2270
- data/ext/lsd.h +0 -283
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/argentina_diputados_voting_record.pdf +0 -0
- data/test/data/bo_page24.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +0 -88
- data/test/data/gre.pdf +0 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +0 -21
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/tabla_subsidios.pdf +0 -0
- data/test/data/vertical_rulings_bug.pdf +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +0 -50
- data/test/test_bin_tabula.sh +0 -7
- data/test/tests.rb +0 -603
@@ -37,9 +37,11 @@ module Tabula
|
|
37
37
|
bottom - top
|
38
38
|
end
|
39
39
|
|
40
|
-
|
41
40
|
# attributes that make sense only for non-oblique lines
|
42
41
|
# these are used to have a single collapse method (in page, currently)
|
42
|
+
|
43
|
+
##
|
44
|
+
# `x` (left) coordinate if line vertical, `y` (top) if horizontal
|
43
45
|
def position
|
44
46
|
raise NoMethodError, "Oblique line #{self.inspect} has no #position method." if oblique?
|
45
47
|
vertical? ? left : top
|
@@ -153,6 +155,10 @@ module Tabula
|
|
153
155
|
point.y >= top && point.y <= bottom
|
154
156
|
end
|
155
157
|
|
158
|
+
def ==(other)
|
159
|
+
return self.getX1 == other.getX1 && self.getY1 == other.getY1 && self.getX2 == other.getX2 && self.getY2 == other.getY2
|
160
|
+
end
|
161
|
+
|
156
162
|
##
|
157
163
|
# calculate the intersection point between +self+ and other Ruling
|
158
164
|
def intersection_point(other)
|
@@ -164,40 +170,71 @@ module Tabula
|
|
164
170
|
|
165
171
|
return nil if !self_l.intersectsLine(other_l)
|
166
172
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
+
horizontal, vertical = if self_l.horizontal? && other_l.vertical?
|
174
|
+
[self_l, other]
|
175
|
+
elsif self_l.vertical? && other_l.horizontal?
|
176
|
+
[other_l, self_l]
|
177
|
+
else
|
178
|
+
raise ArgumentError, "must be orthogonal, horizontal and vertical"
|
179
|
+
end
|
173
180
|
|
174
|
-
int_x = det.call(det.call(x1, y1, x2, y2), x1 - x2, det.call(x3, y3, x4, y4), x3 - x4) /
|
175
|
-
det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
|
176
181
|
|
177
|
-
|
178
|
-
det.call(x3, y3, x4, y4), y3 - y4) /
|
179
|
-
det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
|
180
|
-
|
181
|
-
return nil if int_x.nan? || int_y.nan? # TODO is this right?
|
182
|
+
java.awt.geom.Point2D::Float.new(vertical.getX1, horizontal.getY1)
|
182
183
|
|
184
|
+
end
|
183
185
|
|
184
|
-
|
186
|
+
class HSegmentComparator
|
187
|
+
include java.util.Comparator
|
188
|
+
def compare(o1, o2)
|
189
|
+
o1.top <=> o2.top
|
190
|
+
end
|
185
191
|
end
|
186
192
|
|
187
193
|
##
|
188
|
-
#
|
189
|
-
#
|
190
|
-
# TODO: this is O(n^2) - optimize.
|
194
|
+
# sweep-line implementation of find_intersections
|
195
|
+
# based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
191
196
|
def self.find_intersections(horizontals, verticals)
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
197
|
+
tree = java.util.TreeMap.new(HSegmentComparator.new)
|
198
|
+
sort_obj = Struct.new(:type, :pos, :obj)
|
199
|
+
|
200
|
+
(horizontals + verticals)
|
201
|
+
.flat_map { |r|
|
202
|
+
r.vertical? ? sort_obj.new(:v, r.left, r) : [sort_obj.new(:hl, r.left, r),
|
203
|
+
sort_obj.new(:hr, r.right, r)]
|
204
|
+
}
|
205
|
+
.sort { |a,b|
|
206
|
+
if a.pos == b.pos
|
207
|
+
if a.type == :v && b.type == :hl
|
208
|
+
1
|
209
|
+
elsif a.type == :v && b.type == :hr
|
210
|
+
-1
|
211
|
+
elsif a.type == :hl && b.type == :v
|
212
|
+
-1
|
213
|
+
elsif a.type == :hr && b.type == :v
|
214
|
+
1
|
215
|
+
else
|
216
|
+
a.pos <=> b.pos
|
217
|
+
end
|
218
|
+
else
|
219
|
+
a.pos <=> b.pos
|
220
|
+
end
|
221
|
+
}
|
222
|
+
.inject({}) { |memo, e|
|
223
|
+
case e.type
|
224
|
+
when :v
|
225
|
+
tree.each { |h,_|
|
226
|
+
i = h.intersection_point(e.obj)
|
227
|
+
next memo if i.nil?
|
228
|
+
memo[i] = [h.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
|
229
|
+
e.obj.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)]
|
230
|
+
}
|
231
|
+
when :hr
|
232
|
+
tree.remove(e.obj)
|
233
|
+
when :hl
|
234
|
+
tree[e.obj] = 1
|
235
|
+
end
|
236
|
+
memo
|
237
|
+
}
|
201
238
|
end
|
202
239
|
|
203
240
|
##
|
@@ -211,90 +248,31 @@ module Tabula
|
|
211
248
|
end
|
212
249
|
end
|
213
250
|
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
skip = false
|
221
|
-
|
222
|
-
horiz = rulings.select { |r| r.horizontal? }
|
223
|
-
.group_by(&:top)
|
224
|
-
.values.reduce([]) do |memo, rs|
|
225
|
-
|
226
|
-
rs = rs.sort_by(&:left)
|
227
|
-
if rs.size > 1
|
228
|
-
memo +=
|
229
|
-
rs.each_cons(2)
|
230
|
-
.chunk { |p| p[1].left - p[0].right < 7 }
|
231
|
-
.select { |c| c[0] }
|
232
|
-
.map { |group|
|
233
|
-
group = group.last.flatten.uniq
|
234
|
-
Tabula::Ruling.new(group[0].top,
|
235
|
-
group[0].left,
|
236
|
-
group[-1].right - group[0].left,
|
237
|
-
0)
|
238
|
-
}
|
239
|
-
Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
|
240
|
-
else
|
241
|
-
memo << rs.first
|
242
|
-
end
|
243
|
-
memo
|
251
|
+
def self.collapse_oriented_rulings(lines)
|
252
|
+
# lines must all be of one orientation (i.e. horizontal, vertical)
|
253
|
+
|
254
|
+
if lines.empty?
|
255
|
+
return []
|
244
256
|
end
|
245
|
-
.sort_by(&:top)
|
246
257
|
|
247
|
-
|
248
|
-
horiz.size.times do |i|
|
258
|
+
lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
|
249
259
|
|
250
|
-
|
251
|
-
|
252
|
-
break
|
253
|
-
end
|
260
|
+
lines = lines.inject([lines.shift]) do |memo, next_line|
|
261
|
+
last = memo.last
|
254
262
|
|
255
|
-
if
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
else
|
265
|
-
horiz[i]
|
266
|
-
end
|
267
|
-
end
|
268
|
-
horiz = h
|
269
|
-
|
270
|
-
vert = rulings.select { |r| r.vertical? }
|
271
|
-
.group_by(&:left)
|
272
|
-
.values
|
273
|
-
.reduce([]) do |memo, rs|
|
274
|
-
|
275
|
-
rs = rs.sort_by(&:top)
|
276
|
-
|
277
|
-
if rs.size > 1
|
278
|
-
# Here be dragons:
|
279
|
-
# merge consecutive segments of lines that are close enough
|
280
|
-
memo +=
|
281
|
-
rs.each_cons(2)
|
282
|
-
.chunk { |p| p[1].top - p[0].bottom < 7 }
|
283
|
-
.select { |c| c[0] }
|
284
|
-
.map { |group|
|
285
|
-
group = group.last.flatten.uniq
|
286
|
-
Tabula::Ruling.new(group[0].top,
|
287
|
-
group[0].left,
|
288
|
-
0,
|
289
|
-
group[-1].bottom - group[0].top)
|
290
|
-
}
|
263
|
+
# if the current line is collinear with the next one and they are "close enough": expand the current line
|
264
|
+
if next_line.position == last.position && last.nearlyIntersects?(next_line)
|
265
|
+
memo.last.start = next_line.start < last.start ? next_line.start : last.start
|
266
|
+
memo.last.end = next_line.end < last.end ? last.end : next_line.end
|
267
|
+
memo
|
268
|
+
# if next line has no length, ignore it
|
269
|
+
elsif next_line.length == 0
|
270
|
+
memo
|
271
|
+
# otherwise, add it to the returned collection
|
291
272
|
else
|
292
|
-
memo <<
|
273
|
+
memo << next_line
|
293
274
|
end
|
294
|
-
|
295
|
-
end.sort_by(&:left)
|
296
|
-
|
297
|
-
return horiz += vert
|
275
|
+
end
|
298
276
|
end
|
299
277
|
end
|
300
278
|
end
|
@@ -5,7 +5,7 @@ module Tabula
|
|
5
5
|
# they both should implement `cells`, `rows`, `cols`, `extraction_method`
|
6
6
|
|
7
7
|
class Spreadsheet < ZoneEntity
|
8
|
-
include Tabula::
|
8
|
+
include Tabula::Tabular
|
9
9
|
attr_accessor :cells, :vertical_ruling_lines, :horizontal_ruling_lines, :cells_resolved
|
10
10
|
attr_reader :extraction_method, :page
|
11
11
|
|
@@ -18,6 +18,10 @@ module Tabula
|
|
18
18
|
@extraction_method = "spreadsheet"
|
19
19
|
end
|
20
20
|
|
21
|
+
def self.empty(page)
|
22
|
+
Spreadsheet.new(0, 0, 0, 0, page, [], nil, nil)
|
23
|
+
end
|
24
|
+
|
21
25
|
def ruling_lines
|
22
26
|
@vertical_ruling_lines + @horizontal_ruling_lines
|
23
27
|
end
|
@@ -27,15 +31,6 @@ module Tabula
|
|
27
31
|
@horizontal_ruling_lines = lines.select{|hl| hl.horizontal? && spr.intersectsLine(hl) }
|
28
32
|
end
|
29
33
|
|
30
|
-
def fill_in_cells!
|
31
|
-
unless @cells_resolved
|
32
|
-
@cells_resolved = true
|
33
|
-
cells.each do |cell|
|
34
|
-
cell.text_elements = @page.get_cell_text(cell)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
34
|
# call `rows` with `evaluate_cells` as `false` to defer filling in the text in
|
40
35
|
# each cell, which can be computationally intensive.
|
41
36
|
def rows(evaluate_cells=true)
|
@@ -77,6 +72,64 @@ module Tabula
|
|
77
72
|
end
|
78
73
|
end
|
79
74
|
|
75
|
+
#######################################################
|
76
|
+
# Chapter 2 of Spreadsheet extraction, Spanning Cells #
|
77
|
+
#######################################################
|
78
|
+
#if c is a "spanning cell", that is
|
79
|
+
# if there are N>0 vertical lines strictly between this cell's left and right
|
80
|
+
#insert N placeholder cells after it with zero size (but same top)
|
81
|
+
def add_spanning_cells!
|
82
|
+
#rounding: because Cell.new_from_points, used in #find_cells above, has
|
83
|
+
# a float precision error where, for instance, a cell whose x2 coord is
|
84
|
+
# supposed to be 160.137451171875 comes out as 160.13745498657227 because
|
85
|
+
# of minus. :(
|
86
|
+
vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq #already sorted
|
87
|
+
horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted
|
88
|
+
|
89
|
+
cells.each do |c|
|
90
|
+
vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
|
91
|
+
horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }
|
92
|
+
|
93
|
+
unless vertical_rulings_spanned_over.empty?
|
94
|
+
c.spanning = true
|
95
|
+
vertical_rulings_spanned_over.each do |spanned_over_line_loc|
|
96
|
+
placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
|
97
|
+
placeholder.placeholder = true
|
98
|
+
cells << placeholder
|
99
|
+
end
|
100
|
+
end
|
101
|
+
unless horizontal_rulings_spanned_over.empty?
|
102
|
+
c.spanning = true
|
103
|
+
horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
|
104
|
+
placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
|
105
|
+
placeholder.placeholder = true
|
106
|
+
cells << placeholder
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
#if there's a spanning cell that spans over both rows and columns, then it has "double placeholder" cells
|
111
|
+
# e.g. -------------------
|
112
|
+
# | C | C | C | C | (this is some pretty sweet ASCII art, eh?)
|
113
|
+
# |-----------------|
|
114
|
+
# | C | C | C | C |
|
115
|
+
# |-----------------|
|
116
|
+
# | C | SC P | C | where SC is the "spanning cell" that holds all the text within its bounds
|
117
|
+
# |---- + ----| P is a "placeholder" cell with either zero width or zero height
|
118
|
+
# | C | P DP | C | DP is a "double placeholder" cell with zero width and zero height
|
119
|
+
# |---- + ----| C is an ordinary cell.
|
120
|
+
# | C | P DP | C |
|
121
|
+
# |-----------------|
|
122
|
+
|
123
|
+
unless (double_placeholders = vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over)).empty?
|
124
|
+
double_placeholders.each do |vert_spanned_over, horiz_spanned_over|
|
125
|
+
placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
|
126
|
+
placeholder.placeholder = true
|
127
|
+
cells << placeholder
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
80
133
|
def to_a
|
81
134
|
fill_in_cells!
|
82
135
|
rows.map{ |row_cells| row_cells.map(&:text) }
|
@@ -103,8 +156,18 @@ module Tabula
|
|
103
156
|
end
|
104
157
|
|
105
158
|
def +(other)
|
106
|
-
raise ArgumentError unless other.page == @page
|
159
|
+
raise ArgumentError, "Data can only be added if it's from the same PDF page" unless other.page == @page
|
107
160
|
Spreadsheet.new(nil, nil, nil, nil, @page, @cells + other.cells, nil, nil )
|
108
161
|
end
|
162
|
+
|
163
|
+
protected
|
164
|
+
def fill_in_cells!
|
165
|
+
unless @cells_resolved
|
166
|
+
@cells_resolved = true
|
167
|
+
cells.each do |cell|
|
168
|
+
cell.text_elements = @page.get_cell_text(cell)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
109
172
|
end
|
110
173
|
end
|
@@ -1,7 +1,8 @@
|
|
1
1
|
module Tabula
|
2
2
|
class Table
|
3
|
+
include Tabula::Tabular
|
3
4
|
attr_reader :extraction_method
|
4
|
-
|
5
|
+
|
5
6
|
def initialize(line_count, separators)
|
6
7
|
@separators = separators
|
7
8
|
@lines = (0...line_count).inject([]) { |m| m << Line.new }
|
@@ -19,27 +20,24 @@ module Tabula
|
|
19
20
|
end
|
20
21
|
end
|
21
22
|
|
22
|
-
def rpad!
|
23
|
-
max = lines.map{|l| l.text_elements.size}.max
|
24
|
-
lines.each do |line|
|
25
|
-
needed = max - line.text_elements.size
|
26
|
-
needed.times do
|
27
|
-
line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
31
23
|
|
32
24
|
def cols
|
33
25
|
rows.transpose
|
34
26
|
end
|
35
27
|
|
28
|
+
# TODO: this is awful, refactor
|
36
29
|
def rows
|
37
|
-
|
38
|
-
|
30
|
+
rpad!
|
31
|
+
lstrip_lines!
|
32
|
+
li = lines.map do |l|
|
39
33
|
l.text_elements.map! do |te|
|
40
34
|
te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
41
35
|
end
|
42
|
-
end.
|
36
|
+
end.select do
|
37
|
+
|l| !l.all? { |te| te.text.empty? }
|
38
|
+
end.sort_by do |l|
|
39
|
+
l.map { |te| te.top || 0 }.max
|
40
|
+
end
|
43
41
|
end
|
44
42
|
|
45
43
|
# create a new Table object from an array of arrays, representing a list of rows in a spreadsheet
|
@@ -47,41 +45,26 @@ module Tabula
|
|
47
45
|
def self.new_from_array(array_of_rows)
|
48
46
|
t = Table.new(array_of_rows.size, [])
|
49
47
|
@extraction_method = "testing"
|
48
|
+
tlines = []
|
50
49
|
array_of_rows.each_with_index do |row, index|
|
51
|
-
|
50
|
+
l = Line.new
|
51
|
+
l.text_elements = row.each_with_index.map{|cell, inner_index| TextElement.new(index, inner_index, 1, 1, nil, nil, cell, nil)}
|
52
|
+
tlines << l
|
52
53
|
end
|
53
|
-
t.
|
54
|
+
t.instance_variable_set(:@lines, tlines)
|
55
|
+
t.send(:rpad!)
|
54
56
|
t
|
55
57
|
end
|
56
58
|
|
57
|
-
#for equality testing, return @lines stripped of leading columns of empty strings
|
58
|
-
#TODO: write a method to strip all totally-empty columns (or not?)
|
59
|
-
def lstrip_lines
|
60
|
-
return @lines if @lines.include?(nil)
|
61
|
-
min_leading_empty_strings = Float::INFINITY
|
62
|
-
@lines.each do |line|
|
63
|
-
empties = line.text_elements.map{|t| t.nil? || t.text.empty? }
|
64
|
-
min_leading_empty_strings = [min_leading_empty_strings, empties.index(false)].min
|
65
|
-
end
|
66
|
-
if min_leading_empty_strings == 0
|
67
|
-
@lines
|
68
|
-
else
|
69
|
-
@lines.each{|line| line.text_elements = line.text_elements[min_leading_empty_strings..-1]}
|
70
|
-
@lines
|
71
|
-
end
|
72
|
-
end
|
73
|
-
def lstrip_lines!
|
74
|
-
@lines = self.lstrip_lines
|
75
|
-
end
|
76
59
|
|
77
60
|
#used for testing, ignores separator locations (they'll sometimes be nil/empty)
|
78
61
|
def ==(other)
|
79
62
|
self.instance_variable_set(:@lines, self.lstrip_lines)
|
80
63
|
other.instance_variable_set(:@lines, other.lstrip_lines)
|
81
|
-
self.instance_variable_set(:@lines, self.lines.rpad(
|
82
|
-
other.instance_variable_set(:@lines, other.lines.rpad(
|
64
|
+
self.instance_variable_set(:@lines, self.lines.rpad(Line.new, other.lines.size))
|
65
|
+
other.instance_variable_set(:@lines, other.lines.rpad(Line.new, self.lines.size))
|
83
66
|
|
84
|
-
self.
|
67
|
+
self.rows.zip(other.rows).all? { |my, yours| my == yours }
|
85
68
|
|
86
69
|
end
|
87
70
|
|
@@ -89,6 +72,7 @@ module Tabula
|
|
89
72
|
{
|
90
73
|
'json_class' => self.class.name,
|
91
74
|
'extraction_method' => @extraction_method,
|
75
|
+
'vertical_separators' => @separators,
|
92
76
|
'data' => rows,
|
93
77
|
}.to_json(*a)
|
94
78
|
end
|
@@ -104,5 +88,39 @@ module Tabula
|
|
104
88
|
Tabula::Writers.TSV(rows, out)
|
105
89
|
out.string
|
106
90
|
end
|
91
|
+
|
92
|
+
protected
|
93
|
+
def rpad!
|
94
|
+
max = lines.map{|l| l.text_elements.size}.max
|
95
|
+
lines.each do |line|
|
96
|
+
needed = max - line.text_elements.size
|
97
|
+
needed.times do
|
98
|
+
line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
#for equality testing, return @lines stripped of leading columns of empty strings
|
104
|
+
#TODO: write a method to strip all totally-empty columns (or not?)
|
105
|
+
def lstrip_lines
|
106
|
+
min_leading_empty_strings = Float::INFINITY
|
107
|
+
@lines.each do |line|
|
108
|
+
empties = line.text_elements.map{|t| t.nil? || t.text.empty? }
|
109
|
+
min_leading_empty_strings = [min_leading_empty_strings,
|
110
|
+
empties.index(false) || 0].min
|
111
|
+
end
|
112
|
+
if min_leading_empty_strings == 0
|
113
|
+
@lines
|
114
|
+
else
|
115
|
+
@lines.each{|line| line.text_elements = line.text_elements[min_leading_empty_strings..-1]}
|
116
|
+
@lines
|
117
|
+
end
|
118
|
+
end
|
119
|
+
def lstrip_lines!
|
120
|
+
@lines = self.lstrip_lines
|
121
|
+
end
|
122
|
+
|
123
|
+
attr_accessor :lines
|
124
|
+
|
107
125
|
end
|
108
126
|
end
|