tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
@@ -0,0 +1,7 @@
1
module Tabula
  # A Page subclass that adds no state or behavior of its own; everything is
  # inherited from Page.
  class PageArea < Page; end
end
@@ -0,0 +1,300 @@
1
module Tabula
  # A ruling: a straight line segment drawn on a page (for instance a table
  # border), stored as a java.awt.geom.Line2D::Float that runs from
  # (left, top) to (right, bottom).
  #
  # The orientation-dependent helpers (#position, #start, #end, their writers
  # and #expand) only make sense for horizontal or vertical rulings; they
  # raise NoMethodError when called on an oblique one.
  class Ruling < java.awt.geom.Line2D::Float

    attr_accessor :stroking_color

    # Some PDFs build tables out of lines that very nearly intersect each
    # other, but not quite. E.g. a horizontal line spans the table at a Y value
    # of 100 and each vertical (column-separating) ruling starts at 101 or 102.
    # To cope with that, #nearlyIntersects? lengthens both rulings before
    # testing them: perpendicular pairs grow by 2 pixels at each end (so the
    # vertical lines' top becomes 99 or 100 and the expanded versions overlap).
    PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2

    # Colinear or parallel pairs are expanded by only 1 pixel because the
    # expansions are additive: two vertical lines at x = 100, one ending at
    # y = 98 and the other starting at y = 102, would erroneously be reported
    # as nearly intersecting if each grew by 2 (both would terminate at 100).
    # With 1 pixel each the total expansion is 2, a value that comes from
    # experience rather than from any formal source.
    COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1

    # top, left        - coordinates of the first endpoint
    # width, height    - offsets from the first endpoint to the second one
    # stroking_color   - optional color the ruling was stroked with
    def initialize(top, left, width, height, stroking_color=nil)
      super(left, top, left + width, top + height)
      self.stroking_color = stroking_color
    end

    alias :top :getY1
    alias :left :getX1
    alias :bottom :getY2
    alias :right :getX2

    # Each writer replaces a single coordinate and keeps the other three.
    def top=(v)
      reset_endpoints(left, v, right, bottom)
    end

    def left=(v)
      reset_endpoints(v, top, right, bottom)
    end

    def bottom=(v)
      reset_endpoints(left, top, right, v)
    end

    def right=(v)
      reset_endpoints(left, top, v, bottom)
    end

    def width
      right - left
    end

    def height
      bottom - top
    end

    # Attributes that make sense only for non-oblique lines.
    # They exist so that a single collapse method (in page, currently) can
    # handle horizontal and vertical rulings alike.

    # The coordinate shared by both endpoints: x for a vertical ruling,
    # y for a horizontal one.
    def position
      ensure_not_oblique!('position')
      vertical? ? left : top
    end

    # Where the ruling begins along its own axis.
    def start
      ensure_not_oblique!('start')
      vertical? ? top : left
    end

    # Where the ruling finishes along its own axis.
    def end
      ensure_not_oblique!('end')
      vertical? ? bottom : right
    end

    # Moves the whole ruling to +coord+ on the axis perpendicular to it.
    def position=(coord)
      ensure_not_oblique!('position=')
      if vertical?
        self.left = coord
        self.right = coord
      else
        self.top = coord
        self.bottom = coord
      end
    end

    def start=(coord)
      ensure_not_oblique!('start=')
      if vertical?
        self.top = coord
      else
        self.left = coord
      end
    end

    def end=(coord)
      ensure_not_oblique!('end=')
      if vertical?
        self.bottom = coord
      else
        self.right = coord
      end
    end

    # True when the two rulings intersect, or would intersect once both are
    # lengthened by the expansion amount that matches their relative
    # orientation (see the two *_PIXEL_EXPAND_AMOUNT constants).
    # Raises NoMethodError (from #expand) when the rulings do not intersect
    # outright and one of them is oblique.
    def nearlyIntersects?(another)
      return true if intersectsLine(another)

      amount = if perpendicular_to?(another)
                 PERPENDICULAR_PIXEL_EXPAND_AMOUNT
               else
                 COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT
               end
      expand(amount).intersectsLine(another.expand(amount))
    end

    ##
    # intersect this Ruling with a java.awt.geom.Rectangle2D
    # Mutates the receiver to the intersection of its bounding box with
    # +area+ and returns the receiver.
    def intersect(area)
      i = getBounds2D.createIntersection(area)
      reset_endpoints(i.getX, i.getY, i.getX + i.getWidth, i.getY + i.getHeight)
      self
    end

    # Returns a new Ruling lengthened by +amt+ at both ends; the receiver is
    # left untouched. The copy does not carry over stroking_color.
    def expand(amt)
      ensure_not_oblique!('expand')
      r = Ruling.new(top, left, width, height)
      r.start = r.start - amt
      r.end = r.end + amt
      r
    end

    # Euclidean distance between the two endpoints.
    def length
      Math.sqrt(width ** 2 + height ** 2)
    end

    def vertical?
      left == right
    end

    def horizontal?
      top == bottom
    end

    def oblique?
      !(vertical? || horizontal?)
    end

    # True when one ruling is vertical and the other horizontal.
    # Comparing `vertical? == other.horizontal?` alone is not enough: it is
    # also true (false == false) when oblique rulings are involved, so both
    # rulings are required to be axis-aligned first.
    def perpendicular_to?(other)
      return false if oblique? || other.oblique?
      vertical? == other.horizontal?
    end

    # Serializes the ruling as the JSON array [left, top, right, bottom].
    # Accepts (and forwards) whatever arguments the JSON generator passes, so
    # both `ruling.to_json` and nested generation work.
    def to_json(*args)
      [left, top, right, bottom].to_json(*args)
    end

    # True when +point+ lies inside the box spanned by the two endpoints
    # (which, for an axis-aligned ruling, means on the ruling itself).
    def colinear?(point)
      point.x >= left && point.x <= right &&
        point.y >= top && point.y <= bottom
    end

    ##
    # calculate the intersection point between +self+ and other Ruling
    # Returns a java.awt.geom.Point2D::Float, or nil when the (expanded)
    # rulings do not intersect.
    def intersection_point(other)
      # algo taken from http://mathworld.wolfram.com/Line-LineIntersection.html

      # self and other should always be perpendicular, since one should be
      # horizontal and one should be vertical
      self_l  = expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)
      other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)

      return nil unless self_l.intersectsLine(other_l)

      x1 = self_l.getX1;  y1 = self_l.getY1
      x2 = self_l.getX2;  y2 = self_l.getY2
      x3 = other_l.getX1; y3 = other_l.getY1
      x4 = other_l.getX2; y4 = other_l.getY2

      det = lambda { |a, b, c, d| a * d - b * c }

      det_self    = det.call(x1, y1, x2, y2)
      det_other   = det.call(x3, y3, x4, y4)
      # shared by both coordinates, so compute it once
      denominator = det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)

      int_x = det.call(det_self, x1 - x2, det_other, x3 - x4) / denominator
      int_y = det.call(det_self, y1 - y2, det_other, y3 - y4) / denominator

      # 0.0 / 0.0 (overlapping parallel rulings) yields NaN
      return nil if int_x.nan? || int_y.nan? # TODO is this right?

      java.awt.geom.Point2D::Float.new(int_x, int_y)
    end

    ##
    # Find all intersection points between two list of +Ruling+
    # (+horizontals+ and +verticals+)
    # Returns a Hash mapping each intersection point to the list of
    # [expanded horizontal, expanded vertical] pairs that meet there.
    # TODO: this is O(n^2) - optimize.
    def self.find_intersections(horizontals, verticals)
      horizontals.product(verticals).each_with_object({}) do |(h, v), memo|
        ip = h.intersection_point(v)
        next if ip.nil?
        # TODO: stupid hack for FLA pdfs where lines appear to intersect, but don't.
        (memo[ip] ||= []) << [h.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT),
                              v.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)]
      end
    end

    ##
    # crop an enumerable of +Ruling+ to an +area+
    # Rulings that do not touch +area+ are left out; the rest are returned as
    # cropped clones, so the input rulings are not modified.
    def self.crop_rulings_to_area(rulings, area)
      rulings.select { |r| r.intersects(area) }
             .map { |r| r.clone.intersect(area) }
    end

    # Merges rulings that look like fragments of the same line and returns the
    # cleaned horizontals followed by the cleaned verticals.
    #
    # rulings      - enumerable of Ruling; oblique ones are ignored
    # max_distance - two consecutive horizontal rulings whose tops are closer
    #                than this are replaced by a single ruling halfway between
    #
    # TODO do we really need this one anymore?
    def self.clean_rulings(rulings, max_distance=4)

      # merge horizontal and vertical lines
      # TODO this should be iterative

      # NOTE(review): in both passes below, when several rulings share a
      # coordinate only runs of segments less than 7 apart survive (merged);
      # a segment with no close neighbour is dropped. That looks
      # unintentional - confirm before relying on it.
      horiz = rulings.select { |r| r.horizontal? }
                     .group_by(&:top)
                     .values
                     .flat_map do |rs|
        rs = rs.sort_by(&:left)
        next [rs.first] unless rs.size > 1

        rs.each_cons(2)
          .chunk { |pair| pair[1].left - pair[0].right < 7 }
          .select { |close, _pairs| close }
          .map { |group|
            segments = group.last.flatten.uniq
            Tabula::Ruling.new(segments[0].top,
                               segments[0].left,
                               segments[-1].right - segments[0].left,
                               0)
          }
      end.sort_by(&:top)

      # collapse pairs of consecutive horizontals that are almost on top of
      # each other
      collapsed = []
      skip = false
      horiz.size.times do |i|
        # the ruling at i was already merged into its predecessor; this check
        # must come before the "last ruling" case or a merged final ruling
        # would be emitted a second time
        if skip
          skip = false
          next
        end

        if i == horiz.size - 1
          collapsed << horiz[-1]
          break
        end

        d = (horiz[i + 1].top - horiz[i].top).abs

        collapsed << if d < max_distance # THRESHOLD DISTANCE between horizontal lines
                       skip = true
                       Tabula::Ruling.new(horiz[i].top + d / 2,
                                          [horiz[i].left, horiz[i + 1].left].min,
                                          [horiz[i + 1].width.abs, horiz[i].width.abs].max,
                                          0)
                     else
                       horiz[i]
                     end
      end

      vert = rulings.select { |r| r.vertical? }
                    .group_by(&:left)
                    .values
                    .flat_map do |rs|
        rs = rs.sort_by(&:top)
        next [rs.first] unless rs.size > 1

        # Here be dragons:
        # merge consecutive segments of lines that are close enough
        rs.each_cons(2)
          .chunk { |pair| pair[1].top - pair[0].bottom < 7 }
          .select { |close, _pairs| close }
          .map { |group|
            segments = group.last.flatten.uniq
            Tabula::Ruling.new(segments[0].top,
                               segments[0].left,
                               0,
                               segments[-1].bottom - segments[0].top)
          }
      end.sort_by(&:left)

      collapsed + vert
    end

    private

    # Replaces both endpoints at once. java_send is given the explicit float
    # signature so the (float, float, float, float) overload of setLine is
    # the one invoked.
    def reset_endpoints(x1, y1, x2, y2)
      java_send :setLine, [Java::float, Java::float, Java::float, Java::float], x1, y1, x2, y2
    end

    # Raises NoMethodError naming +method_name+ when the ruling is oblique.
    def ensure_not_oblique!(method_name)
      raise NoMethodError, "Oblique line #{inspect} has no ##{method_name} method." if oblique?
    end
  end
end
@@ -0,0 +1,92 @@
1
module Tabula
  # a counterpart of Table, to be sure.
  # not sure yet what their relationship ought to be.
  #
  # A Spreadsheet is a zone of a page described by a set of cells plus the
  # vertical and horizontal ruling lines that delimit them. Cell text is
  # filled in lazily (see #fill_in_cells!).
  class Spreadsheet < ZoneEntity
    include Tabula::HasCells
    attr_accessor :cells, :vertical_ruling_lines, :horizontal_ruling_lines, :cells_resolved

    # top, left, width, height - geometry handed to ZoneEntity
    # page                     - object asked for each cell's text (#get_cell_text)
    # cells                    - the cells making up the spreadsheet
    # vertical_ruling_lines,
    # horizontal_ruling_lines  - the rulings of each orientation
    def initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) #, lines)
      super(top, left, width, height)
      @cells = cells
      @page = page
      @vertical_ruling_lines = vertical_ruling_lines
      @horizontal_ruling_lines = horizontal_ruling_lines
    end

    # All rulings: verticals first, then horizontals.
    def ruling_lines
      @vertical_ruling_lines + @horizontal_ruling_lines
    end

    # Splits +lines+ by orientation, keeping only those that intersect this
    # spreadsheet's own area.
    # NOTE(review): relies on ZoneEntity responding to intersectsLine (as
    # java.awt.geom.Rectangle2D does) - confirm against ZoneEntity.
    def ruling_lines=(lines)
      @vertical_ruling_lines = lines.select { |vl| vl.vertical? && intersectsLine(vl) }
      @horizontal_ruling_lines = lines.select { |hl| hl.horizontal? && intersectsLine(hl) }
    end

    # Asks the page for the text of every cell. Runs at most once per
    # spreadsheet; later calls are no-ops.
    def fill_in_cells!
      return if @cells_resolved

      @cells_resolved = true
      cells.each do |cell|
        cell.text_elements = @page.get_cell_text(cell)
      end
    end

    # call `rows` with `evaluate_cells` as `false` to defer filling in the text in
    # each cell, which can be computationally intensive.
    #
    # Returns an array of rows (top to bottom), each an array of cells sorted
    # left to right.
    def rows(evaluate_cells=true)
      fill_in_cells! if evaluate_cells

      tops = cells.map(&:top).uniq.sort
      array_of_rows = tops.map do |top|
        cells.select { |c| c.top == top }.sort_by(&:left)
      end

      # here, insert another kind of placeholder for empty corners
      # like in 01001523B_China.pdf
      # TODO: support placeholders for "empty" cells in rows other than row 1, and in #cols
      if array_of_rows.size > 2
        first_row_lefts  = array_of_rows[0].map(&:left)
        second_row_lefts = array_of_rows[1].map(&:left)
        # the first row has fewer distinct columns than the second one: pad it
        # with zero-sized placeholder cells where the second row has a cell
        if first_row_lefts.uniq.size < second_row_lefts.uniq.size
          (second_row_lefts - first_row_lefts).each do |missing_spot|
            missing_spot_placeholder = Cell.new(array_of_rows[0][0].top, missing_spot, 0, 0)
            missing_spot_placeholder.placeholder = true
            array_of_rows[0] << missing_spot_placeholder
          end
        end
        array_of_rows[0].sort_by!(&:left)
      end
      array_of_rows
    end

    # call `cols` with `evaluate_cells` as `false` to defer filling in the text in
    # each cell, which can be computationally intensive.
    #
    # Returns an array of columns (left to right), each an array of cells
    # sorted top to bottom.
    def cols(evaluate_cells=true)
      fill_in_cells! if evaluate_cells

      lefts = cells.map(&:left).uniq.sort
      lefts.map do |left|
        cells.select { |c| c.left == left }.sort_by(&:top)
      end
    end

    # The text of every cell, as an array of rows.
    # (#rows fills in the cells by default, so no separate call is needed.)
    def to_a
      rows.map { |row_cells| row_cells.map(&:text) }
    end

    # The rows rendered by Tabula::Writers.CSV, returned as a String.
    def to_csv
      out = StringIO.new
      Tabula::Writers.CSV(rows, out)
      out.string
    end

    # The rows rendered by Tabula::Writers.TSV, returned as a String.
    def to_tsv
      out = StringIO.new
      Tabula::Writers.TSV(rows, out)
      out.string
    end
  end
end
@@ -0,0 +1,81 @@
1
module Tabula
  # A table made of Line objects, each holding that row's text elements.
  class Table
    attr_reader :lines

    # line_count - number of (initially empty) lines to create
    # separators - stored as given; not otherwise used by this class
    def initialize(line_count, separators)
      @separators = separators
      @lines = line_count.times.map { Line.new }
    end

    # Puts +text_element+ at row +i+, column +j+. If that position is already
    # taken, the new element is merged into the existing one.
    def add_text_element(text_element, i, j)
      # grow the table up to row i with real lines, so that no nil holes are
      # left behind for #rows / #cols to trip over
      @lines << Line.new while @lines.size <= i

      row = @lines[i].text_elements
      if row[j]
        row[j].merge!(text_element)
      else
        row[j] = text_element
      end
    end

    # Right-pads every line with empty text elements, in place, until all
    # lines are as long as the longest one.
    def rpad!
      max = lines.map { |l| l.text_elements.size }.max || 0
      lines.each do |line|
        needed = max - line.text_elements.size
        needed.times do
          line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
        end
      end
    end

    # The table as an array of columns (pads the lines first).
    def cols
      rpad!
      lines.map(&:text_elements).transpose
    end

    # The table as an array of rows (pads the lines first).
    def rows
      rpad!
      lines.map(&:text_elements)
    end

    # create a new Table object from an array of arrays, representing a list of rows in a spreadsheet
    # probably only used for testing
    def self.new_from_array(array_of_rows)
      t = Table.new(array_of_rows.size, [])
      array_of_rows.each_with_index do |row, index|
        t.lines[index].text_elements = row.map { |cell| TextElement.new(nil, nil, nil, nil, nil, nil, cell, nil) }
      end
      t
    end

    # for equality testing, return @lines stripped of leading columns of empty strings
    # TODO: write a method to strip all totally-empty columns (or not?)
    #
    # Only the leading columns that are empty in every line are removed.
    # Lines with no non-empty element at all do not take part in deciding how
    # many columns that is. Despite the missing bang, the Line objects are
    # modified in place.
    def lstrip_lines
      return @lines if @lines.any?(&:nil?)

      leading_empties = @lines.map { |line|
        line.text_elements.index { |t| !(t.nil? || t.text.empty?) }
      }.compact.min

      # nothing to strip: no line has any content, or some line starts with content
      return @lines if leading_empties.nil? || leading_empties.zero?

      # drop (unlike a [n..-1] slice) yields [] rather than nil for short lines
      @lines.each { |line| line.text_elements = line.text_elements.drop(leading_empties) }
      @lines
    end

    def lstrip_lines!
      @lines = lstrip_lines
    end

    # used for testing, ignores separator locations (they'll sometimes be nil/empty)
    #
    # Side effects: both tables are left-stripped and their line lists are
    # padded with nil to the same length (via Array#rpad, which is not
    # defined in this file) before being compared line by line.
    def ==(other)
      return false unless other.is_a?(Table)

      lstrip_lines!
      other.lstrip_lines!
      @lines = lines.rpad(nil, other.lines.size)
      other.instance_variable_set(:@lines, other.lines.rpad(nil, lines.size))

      lines.zip(other.lines).all? { |my, yours| my == yours }
    end
  end
end