tabula-extractor 0.7.2-java → 0.7.4-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +4 -8
  4. data/bin/tabula +3 -3
  5. data/lib/tabula.rb +9 -5
  6. data/lib/tabula/entities.rb +1 -0
  7. data/lib/tabula/entities/cell.rb +6 -4
  8. data/lib/tabula/entities/has_cells.rb +22 -78
  9. data/lib/tabula/entities/line.rb +52 -6
  10. data/lib/tabula/entities/page.rb +43 -50
  11. data/lib/tabula/entities/ruling.rb +83 -105
  12. data/lib/tabula/entities/spreadsheet.rb +74 -11
  13. data/lib/tabula/entities/table.rb +55 -37
  14. data/lib/tabula/entities/tabular.rb +42 -0
  15. data/lib/tabula/entities/text_chunk.rb +55 -52
  16. data/lib/tabula/entities/text_element.rb +129 -62
  17. data/lib/tabula/entities/zone_entity.rb +15 -6
  18. data/lib/tabula/extraction.rb +114 -49
  19. data/lib/tabula/line_segment_detector.rb +0 -5
  20. data/lib/tabula/table_extractor.rb +32 -37
  21. data/lib/tabula/version.rb +1 -1
  22. data/tabula-extractor.gemspec +2 -5
  23. metadata +13 -95
  24. data/ext/COPYING +0 -661
  25. data/ext/Makefile.OSX +0 -18
  26. data/ext/Makefile.defaults +0 -9
  27. data/ext/Makefile.linux32 +0 -11
  28. data/ext/Makefile.linux64 +0 -12
  29. data/ext/Makefile.mingw +0 -10
  30. data/ext/Makefile.mingw64 +0 -10
  31. data/ext/liblsd-linux32.so +0 -0
  32. data/ext/liblsd-linux64.so +0 -0
  33. data/ext/liblsd.def +0 -3
  34. data/ext/liblsd.dll +0 -0
  35. data/ext/liblsd.dylib +0 -0
  36. data/ext/liblsd64.dll +0 -0
  37. data/ext/lsd.c +0 -2270
  38. data/ext/lsd.h +0 -283
  39. data/test/data/47008204D_USA.page4.pdf +0 -0
  40. data/test/data/560015757GV_China.page1.pdf +0 -0
  41. data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
  42. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  43. data/test/data/S2MNCEbirdisland.pdf +0 -0
  44. data/test/data/argentina_diputados_voting_record.pdf +0 -0
  45. data/test/data/bo_page24.pdf +0 -0
  46. data/test/data/campaign_donors.pdf +0 -0
  47. data/test/data/frx_2012_disclosure.pdf +0 -0
  48. data/test/data/frx_2012_disclosure.tsv +0 -88
  49. data/test/data/gre.pdf +0 -0
  50. data/test/data/no_tables.pdf +0 -0
  51. data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
  52. data/test/data/puertos1.pdf +0 -0
  53. data/test/data/spanning_cells.csv +0 -21
  54. data/test/data/spanning_cells.pdf +0 -0
  55. data/test/data/strongschools.pdf +0 -0
  56. data/test/data/sydney_disclosure_contract.pdf +0 -0
  57. data/test/data/tabla_subsidios.pdf +0 -0
  58. data/test/data/vertical_rulings_bug.pdf +0 -0
  59. data/test/data/vietnam3.pdf +0 -0
  60. data/test/data/wc2012.pdf +0 -0
  61. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  62. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  63. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  64. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  65. data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
  66. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  67. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  68. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  69. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  70. data/test/heuristic.rb +0 -50
  71. data/test/test_bin_tabula.sh +0 -7
  72. data/test/tests.rb +0 -603
@@ -37,9 +37,11 @@ module Tabula
37
37
  bottom - top
38
38
  end
39
39
 
40
-
41
40
  # attributes that make sense only for non-oblique lines
42
41
  # these are used to have a single collapse method (in page, currently)
42
+
43
+ ##
44
+ # `x` (left) coordinate if line vertical, `y` (top) if horizontal
43
45
  def position
44
46
  raise NoMethodError, "Oblique line #{self.inspect} has no #position method." if oblique?
45
47
  vertical? ? left : top
@@ -153,6 +155,10 @@ module Tabula
153
155
  point.y >= top && point.y <= bottom
154
156
  end
155
157
 
158
+ def ==(other)
159
+ return self.getX1 == other.getX1 && self.getY1 == other.getY1 && self.getX2 == other.getX2 && self.getY2 == other.getY2
160
+ end
161
+
156
162
  ##
157
163
  # calculate the intersection point between +self+ and other Ruling
158
164
  def intersection_point(other)
@@ -164,40 +170,71 @@ module Tabula
164
170
 
165
171
  return nil if !self_l.intersectsLine(other_l)
166
172
 
167
- x1 = self_l.getX1; y1 = self_l.getY1
168
- x2 = self_l.getX2; y2 = self_l.getY2
169
- x3 = other_l.getX1; y3 = other_l.getY1
170
- x4 = other_l.getX2; y4 = other_l.getY2
171
-
172
- det = lambda { |a,b,c,d| a * d - b * c }
173
+ horizontal, vertical = if self_l.horizontal? && other_l.vertical?
174
+ [self_l, other]
175
+ elsif self_l.vertical? && other_l.horizontal?
176
+ [other_l, self_l]
177
+ else
178
+ raise ArgumentError, "must be orthogonal, horizontal and vertical"
179
+ end
173
180
 
174
- int_x = det.call(det.call(x1, y1, x2, y2), x1 - x2, det.call(x3, y3, x4, y4), x3 - x4) /
175
- det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
176
181
 
177
- int_y = det.call(det.call(x1, y1, x2, y2), y1 - y2,
178
- det.call(x3, y3, x4, y4), y3 - y4) /
179
- det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
180
-
181
- return nil if int_x.nan? || int_y.nan? # TODO is this right?
182
+ java.awt.geom.Point2D::Float.new(vertical.getX1, horizontal.getY1)
182
183
 
184
+ end
183
185
 
184
- java.awt.geom.Point2D::Float.new(int_x, int_y)
186
+ class HSegmentComparator
187
+ include java.util.Comparator
188
+ def compare(o1, o2)
189
+ o1.top <=> o2.top
190
+ end
185
191
  end
186
192
 
187
193
  ##
188
- # Find all intersection points between two list of +Ruling+
189
- # (+horizontals+ and +verticals+)
190
- # TODO: this is O(n^2) - optimize.
194
+ # sweep-line implementation of find_intersections (events are sorted, then scanned left to right)
195
+ # based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
191
196
  def self.find_intersections(horizontals, verticals)
192
- horizontals.product(verticals).inject({}) do |memo, (h, v)|
193
- ip = h.intersection_point(v)
194
- unless ip.nil?
195
- memo[ip] ||= []
196
- # TODO: stupid hack for FLA pdfs where lines appear to intersect, but don't.
197
- memo[ip] << [h.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), v.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)]
198
- end
199
- memo
200
- end
197
+ tree = java.util.TreeMap.new(HSegmentComparator.new)
198
+ sort_obj = Struct.new(:type, :pos, :obj)
199
+
200
+ (horizontals + verticals)
201
+ .flat_map { |r|
202
+ r.vertical? ? sort_obj.new(:v, r.left, r) : [sort_obj.new(:hl, r.left, r),
203
+ sort_obj.new(:hr, r.right, r)]
204
+ }
205
+ .sort { |a,b|
206
+ if a.pos == b.pos
207
+ if a.type == :v && b.type == :hl
208
+ 1
209
+ elsif a.type == :v && b.type == :hr
210
+ -1
211
+ elsif a.type == :hl && b.type == :v
212
+ -1
213
+ elsif a.type == :hr && b.type == :v
214
+ 1
215
+ else
216
+ a.pos <=> b.pos
217
+ end
218
+ else
219
+ a.pos <=> b.pos
220
+ end
221
+ }
222
+ .inject({}) { |memo, e|
223
+ case e.type
224
+ when :v
225
+ tree.each { |h,_|
226
+ i = h.intersection_point(e.obj)
227
+ next memo if i.nil?
228
+ memo[i] = [h.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
229
+ e.obj.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)]
230
+ }
231
+ when :hr
232
+ tree.remove(e.obj)
233
+ when :hl
234
+ tree[e.obj] = 1
235
+ end
236
+ memo
237
+ }
201
238
  end
202
239
 
203
240
  ##
@@ -211,90 +248,31 @@ module Tabula
211
248
  end
212
249
  end
213
250
 
214
- # TODO do we really need this one anymore?
215
- def self.clean_rulings(rulings, max_distance=4)
216
-
217
- # merge horizontal and vertical lines
218
- # TODO this should be iterative
219
-
220
- skip = false
221
-
222
- horiz = rulings.select { |r| r.horizontal? }
223
- .group_by(&:top)
224
- .values.reduce([]) do |memo, rs|
225
-
226
- rs = rs.sort_by(&:left)
227
- if rs.size > 1
228
- memo +=
229
- rs.each_cons(2)
230
- .chunk { |p| p[1].left - p[0].right < 7 }
231
- .select { |c| c[0] }
232
- .map { |group|
233
- group = group.last.flatten.uniq
234
- Tabula::Ruling.new(group[0].top,
235
- group[0].left,
236
- group[-1].right - group[0].left,
237
- 0)
238
- }
239
- Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
240
- else
241
- memo << rs.first
242
- end
243
- memo
251
+ def self.collapse_oriented_rulings(lines)
252
+ # lines must all be of one orientation (i.e. horizontal, vertical)
253
+
254
+ if lines.empty?
255
+ return []
244
256
  end
245
- .sort_by(&:top)
246
257
 
247
- h = []
248
- horiz.size.times do |i|
258
+ lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
249
259
 
250
- if i == horiz.size - 1
251
- h << horiz[-1]
252
- break
253
- end
260
+ lines = lines.inject([lines.shift]) do |memo, next_line|
261
+ last = memo.last
254
262
 
255
- if skip
256
- skip = false;
257
- next
258
- end
259
- d = (horiz[i+1].top - horiz[i].top).abs
260
-
261
- h << if d < max_distance # THRESHOLD DISTANCE between horizontal lines
262
- skip = true
263
- Tabula::Ruling.new(horiz[i].top + d / 2, [horiz[i].left, horiz[i+1].left].min, [horiz[i+1].width.abs, horiz[i].width.abs].max, 0)
264
- else
265
- horiz[i]
266
- end
267
- end
268
- horiz = h
269
-
270
- vert = rulings.select { |r| r.vertical? }
271
- .group_by(&:left)
272
- .values
273
- .reduce([]) do |memo, rs|
274
-
275
- rs = rs.sort_by(&:top)
276
-
277
- if rs.size > 1
278
- # Here be dragons:
279
- # merge consecutive segments of lines that are close enough
280
- memo +=
281
- rs.each_cons(2)
282
- .chunk { |p| p[1].top - p[0].bottom < 7 }
283
- .select { |c| c[0] }
284
- .map { |group|
285
- group = group.last.flatten.uniq
286
- Tabula::Ruling.new(group[0].top,
287
- group[0].left,
288
- 0,
289
- group[-1].bottom - group[0].top)
290
- }
263
+ # if current line colinear with next, and are "close enough": expand current line
264
+ if next_line.position == last.position && last.nearlyIntersects?(next_line)
265
+ memo.last.start = next_line.start < last.start ? next_line.start : last.start
266
+ memo.last.end = next_line.end < last.end ? last.end : next_line.end
267
+ memo
268
+ # if next line has no length, ignore it
269
+ elsif next_line.length == 0
270
+ memo
271
+ # otherwise, add it to the returned collection
291
272
  else
292
- memo << rs.first
273
+ memo << next_line
293
274
  end
294
- memo
295
- end.sort_by(&:left)
296
-
297
- return horiz += vert
275
+ end
298
276
  end
299
277
  end
300
278
  end
@@ -5,7 +5,7 @@ module Tabula
5
5
  # both should implement `cells`, `rows`, `cols`, `extraction_method`
6
6
 
7
7
  class Spreadsheet < ZoneEntity
8
- include Tabula::HasCells
8
+ include Tabula::Tabular
9
9
  attr_accessor :cells, :vertical_ruling_lines, :horizontal_ruling_lines, :cells_resolved
10
10
  attr_reader :extraction_method, :page
11
11
 
@@ -18,6 +18,10 @@ module Tabula
18
18
  @extraction_method = "spreadsheet"
19
19
  end
20
20
 
21
+ def self.empty(page)
22
+ Spreadsheet.new(0, 0, 0, 0, page, [], nil, nil)
23
+ end
24
+
21
25
  def ruling_lines
22
26
  @vertical_ruling_lines + @horizontal_ruling_lines
23
27
  end
@@ -27,15 +31,6 @@ module Tabula
27
31
  @horizontal_ruling_lines = lines.select{|hl| hl.horizontal? && spr.intersectsLine(hl) }
28
32
  end
29
33
 
30
- def fill_in_cells!
31
- unless @cells_resolved
32
- @cells_resolved = true
33
- cells.each do |cell|
34
- cell.text_elements = @page.get_cell_text(cell)
35
- end
36
- end
37
- end
38
-
39
34
  # call `rows` with `evaluate_cells` as `false` to defer filling in the text in
40
35
  # each cell, which can be computationally intensive.
41
36
  def rows(evaluate_cells=true)
@@ -77,6 +72,64 @@ module Tabula
77
72
  end
78
73
  end
79
74
 
75
+ #######################################################
76
+ # Chapter 2 of Spreadsheet extraction, Spanning Cells #
77
+ #######################################################
78
+ #if c is a "spanning cell", that is
79
+ # if there are N>0 vertical lines strictly between this cell's left and right
80
+ #insert N placeholder cells after it with zero size (but same top)
81
+ def add_spanning_cells!
82
+ #rounding: because Cell.new_from_points, used in #find_cells above, has
83
+ # a float precision error where, for instance, a cell whose x2 coord is
84
+ # supposed to be 160.137451171875 comes out as 160.13745498657227 because
85
+ # of minus. :(
86
+ vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq #already sorted
87
+ horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted
88
+
89
+ cells.each do |c|
90
+ vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
91
+ horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }
92
+
93
+ unless vertical_rulings_spanned_over.empty?
94
+ c.spanning = true
95
+ vertical_rulings_spanned_over.each do |spanned_over_line_loc|
96
+ placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
97
+ placeholder.placeholder = true
98
+ cells << placeholder
99
+ end
100
+ end
101
+ unless horizontal_rulings_spanned_over.empty?
102
+ c.spanning = true
103
+ horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
104
+ placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
105
+ placeholder.placeholder = true
106
+ cells << placeholder
107
+ end
108
+ end
109
+
110
+ #if there's a spanning cell that spans over both rows and columns, then it has "double placeholder" cells
111
+ # e.g. -------------------
112
+ # | C | C | C | C | (this is some pretty sweet ASCII art, eh?)
113
+ # |-----------------|
114
+ # | C | C | C | C |
115
+ # |-----------------|
116
+ # | C | SC P | C | where SC is the "spanning cell" that holds all the text within its bounds
117
+ # |---- + ----| P is a "placeholder" cell with either zero width or zero height
118
+ # | C | P DP | C | DP is a "double placeholder" cell with zero width and zero height
119
+ # |---- + ----| C is an ordinary cell.
120
+ # | C | P DP | C |
121
+ # |-----------------|
122
+
123
+ unless (double_placeholders = vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over)).empty?
124
+ double_placeholders.each do |vert_spanned_over, horiz_spanned_over|
125
+ placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
126
+ placeholder.placeholder = true
127
+ cells << placeholder
128
+ end
129
+ end
130
+ end
131
+ end
132
+
80
133
  def to_a
81
134
  fill_in_cells!
82
135
  rows.map{ |row_cells| row_cells.map(&:text) }
@@ -103,8 +156,18 @@ module Tabula
103
156
  end
104
157
 
105
158
  def +(other)
106
- raise ArgumentError unless other.page == @page
159
+ raise ArgumentError, "Data can only be added if it's from the same PDF page" unless other.page == @page
107
160
  Spreadsheet.new(nil, nil, nil, nil, @page, @cells + other.cells, nil, nil )
108
161
  end
162
+
163
+ protected
164
+ def fill_in_cells!
165
+ unless @cells_resolved
166
+ @cells_resolved = true
167
+ cells.each do |cell|
168
+ cell.text_elements = @page.get_cell_text(cell)
169
+ end
170
+ end
171
+ end
109
172
  end
110
173
  end
@@ -1,7 +1,8 @@
1
1
  module Tabula
2
2
  class Table
3
+ include Tabula::Tabular
3
4
  attr_reader :extraction_method
4
- attr_accessor :lines
5
+
5
6
  def initialize(line_count, separators)
6
7
  @separators = separators
7
8
  @lines = (0...line_count).inject([]) { |m| m << Line.new }
@@ -19,27 +20,24 @@ module Tabula
19
20
  end
20
21
  end
21
22
 
22
- def rpad!
23
- max = lines.map{|l| l.text_elements.size}.max
24
- lines.each do |line|
25
- needed = max - line.text_elements.size
26
- needed.times do
27
- line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
28
- end
29
- end
30
- end
31
23
 
32
24
  def cols
33
25
  rows.transpose
34
26
  end
35
27
 
28
+ # TODO: this is awful, refactor
36
29
  def rows
37
- self.rpad!
38
- lines.map do |l|
30
+ rpad!
31
+ lstrip_lines!
32
+ li = lines.map do |l|
39
33
  l.text_elements.map! do |te|
40
34
  te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
41
35
  end
42
- end.sort_by { |l| l.map { |te| te.top || 0 }.max }
36
+ end.select do
37
+ |l| !l.all? { |te| te.text.empty? }
38
+ end.sort_by do |l|
39
+ l.map { |te| te.top || 0 }.max
40
+ end
43
41
  end
44
42
 
45
43
  # create a new Table object from an array of arrays, representing a list of rows in a spreadsheet
@@ -47,41 +45,26 @@ module Tabula
47
45
  def self.new_from_array(array_of_rows)
48
46
  t = Table.new(array_of_rows.size, [])
49
47
  @extraction_method = "testing"
48
+ tlines = []
50
49
  array_of_rows.each_with_index do |row, index|
51
- t.lines[index].text_elements = row.each_with_index.map{|cell, inner_index| TextElement.new(index, inner_index, 1, 1, nil, nil, cell, nil)}
50
+ l = Line.new
51
+ l.text_elements = row.each_with_index.map{|cell, inner_index| TextElement.new(index, inner_index, 1, 1, nil, nil, cell, nil)}
52
+ tlines << l
52
53
  end
53
- t.rpad!
54
+ t.instance_variable_set(:@lines, tlines)
55
+ t.send(:rpad!)
54
56
  t
55
57
  end
56
58
 
57
- #for equality testing, return @lines stripped of leading columns of empty strings
58
- #TODO: write a method to strip all totally-empty columns (or not?)
59
- def lstrip_lines
60
- return @lines if @lines.include?(nil)
61
- min_leading_empty_strings = Float::INFINITY
62
- @lines.each do |line|
63
- empties = line.text_elements.map{|t| t.nil? || t.text.empty? }
64
- min_leading_empty_strings = [min_leading_empty_strings, empties.index(false)].min
65
- end
66
- if min_leading_empty_strings == 0
67
- @lines
68
- else
69
- @lines.each{|line| line.text_elements = line.text_elements[min_leading_empty_strings..-1]}
70
- @lines
71
- end
72
- end
73
- def lstrip_lines!
74
- @lines = self.lstrip_lines
75
- end
76
59
 
77
60
  #used for testing, ignores separator locations (they'll sometimes be nil/empty)
78
61
  def ==(other)
79
62
  self.instance_variable_set(:@lines, self.lstrip_lines)
80
63
  other.instance_variable_set(:@lines, other.lstrip_lines)
81
- self.instance_variable_set(:@lines, self.lines.rpad(nil, other.lines.size))
82
- other.instance_variable_set(:@lines, other.lines.rpad(nil, self.lines.size))
64
+ self.instance_variable_set(:@lines, self.lines.rpad(Line.new, other.lines.size))
65
+ other.instance_variable_set(:@lines, other.lines.rpad(Line.new, self.lines.size))
83
66
 
84
- self.lines.zip(other.lines).all? { |my, yours| my == yours }
67
+ self.rows.zip(other.rows).all? { |my, yours| my == yours }
85
68
 
86
69
  end
87
70
 
@@ -89,6 +72,7 @@ module Tabula
89
72
  {
90
73
  'json_class' => self.class.name,
91
74
  'extraction_method' => @extraction_method,
75
+ 'vertical_separators' => @separators,
92
76
  'data' => rows,
93
77
  }.to_json(*a)
94
78
  end
@@ -104,5 +88,39 @@ module Tabula
104
88
  Tabula::Writers.TSV(rows, out)
105
89
  out.string
106
90
  end
91
+
92
+ protected
93
+ def rpad!
94
+ max = lines.map{|l| l.text_elements.size}.max
95
+ lines.each do |line|
96
+ needed = max - line.text_elements.size
97
+ needed.times do
98
+ line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
99
+ end
100
+ end
101
+ end
102
+
103
+ #for equality testing, return @lines stripped of leading columns of empty strings
104
+ #TODO: write a method to strip all totally-empty columns (or not?)
105
+ def lstrip_lines
106
+ min_leading_empty_strings = Float::INFINITY
107
+ @lines.each do |line|
108
+ empties = line.text_elements.map{|t| t.nil? || t.text.empty? }
109
+ min_leading_empty_strings = [min_leading_empty_strings,
110
+ empties.index(false) || 0].min
111
+ end
112
+ if min_leading_empty_strings == 0
113
+ @lines
114
+ else
115
+ @lines.each{|line| line.text_elements = line.text_elements[min_leading_empty_strings..-1]}
116
+ @lines
117
+ end
118
+ end
119
+ def lstrip_lines!
120
+ @lines = self.lstrip_lines
121
+ end
122
+
123
+ attr_accessor :lines
124
+
107
125
  end
108
126
  end