tabula-extractor 0.6.6-java → 0.7.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
@@ -0,0 +1,7 @@
1
+ module Tabula
2
+ class PageArea < Page
3
+
4
+
5
+ end
6
+
7
+ end
@@ -0,0 +1,300 @@
1
+ module Tabula
2
+ class Ruling < java.awt.geom.Line2D::Float
3
+
4
+ attr_accessor :stroking_color
5
+
6
+ def initialize(top, left, width, height, stroking_color=nil)
7
+ super(left, top, left+width, top+height)
8
+ self.stroking_color = stroking_color
9
+ end
10
+
11
+ alias :top :getY1
12
+ alias :left :getX1
13
+ alias :bottom :getY2
14
+ alias :right :getX2
15
+
16
+ def top=(v)
17
+ self.java_send :setLine, [Java::float, Java::float, Java::float, Java::float,], left, v, right, bottom
18
+ end
19
+
20
+ def left=(v)
21
+ self.java_send :setLine, [Java::float, Java::float, Java::float, Java::float,], v, top, right, bottom
22
+ end
23
+
24
+ def bottom=(v)
25
+ self.java_send :setLine, [Java::float, Java::float, Java::float, Java::float,], left, top, right, v
26
+ end
27
+
28
+ def right=(v)
29
+ self.java_send :setLine, [Java::float, Java::float, Java::float, Java::float,], left, top, v, bottom
30
+ end
31
+
32
+ def width
33
+ right - left
34
+ end
35
+
36
+ def height
37
+ bottom - top
38
+ end
39
+
40
+
41
+ # attributes that make sense only for non-oblique lines
42
+ # these are used to have a single collapse method (in page, currently)
43
+ def position
44
+ raise NoMethodError, "Oblique line #{self.inspect} has no #position method." if oblique?
45
+ vertical? ? left : top
46
+ end
47
+ def start
48
+ raise NoMethodError, "Oblique line #{self.inspect} has no #start method." if oblique?
49
+ vertical? ? top : left
50
+ end
51
+ def end
52
+ raise NoMethodError, "Oblique line #{self.inspect} has no #end method." if oblique?
53
+ vertical? ? bottom : right
54
+ end
55
+ def position=(coord)
56
+ raise NoMethodError, "Oblique line #{self.inspect} has no #position= method." if oblique?
57
+ if vertical?
58
+ self.left = coord
59
+ self.right = coord
60
+ else
61
+ self.top = coord
62
+ self.bottom = coord
63
+ end
64
+ end
65
+ def start=(coord)
66
+ raise NoMethodError, "Oblique line #{self.inspect} has no #start= method." if oblique?
67
+ if vertical?
68
+ self.top = coord
69
+ else
70
+ self.left = coord
71
+ end
72
+ end
73
+ def end=(coord)
74
+ raise NoMethodError, "Oblique line #{self.inspect} has no #end= method." if oblique?
75
+ if vertical?
76
+ self.bottom = coord
77
+ else
78
+ self.right = coord
79
+ end
80
+ end
81
+
82
+ # OK, so why do we expand rulings before testing for intersection? (a note from Jeremy)
83
+ # some PDFs (garment factory audits, precise link TK) make tables by drawing lines that
84
+ # very nearly intersect each other, but not quite. E.g. a horizontal line spans the table at a Y val of 100
85
+ # and each vertical line (i.e. column separating ruling line) starts at 101 or 102.
86
+ # this is very annoying. so we check if those lines nearly overlap by expanding each pair
87
+ # by 2 pixels in each direction (so the vertical lines' top becomes 99 or 100, and then the expanded versions overlap)
88
+
89
+ PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2
90
+ COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1
91
+
92
+ # if the lines we're comparing are colinear or parallel, we expand them by only 1 pixel,
93
+ # because the expansions are additive
94
+ # (e.g. two vertical lines, at x = 100, with one having y2 of 98 and the other having y1 of 102 would
95
+ # erroneously be said to nearlyIntersect if they were each expanded by 2 (since they'd both terminate at 100).
96
+ # The COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT is only 1 so the total expansion is 2.
97
+ # A total expansion amount of 2 is empirically verified to work sometimes. It's not a magic number from any
98
+ # source other than a little bit of experience.)
99
+
100
+ def nearlyIntersects?(another)
101
+ if self.intersectsLine(another)
102
+ true
103
+ elsif self.perpendicular_to?(another)
104
+ self.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT))
105
+ else
106
+ self.expand(COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT).intersectsLine(another.expand(COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT))
107
+ end
108
+ end
109
+
110
+ ##
111
+ # intersect this Ruling with a java.awt.geom.Rectangle2D
112
+ def intersect(area)
113
+ i = self.getBounds2D.createIntersection(area)
114
+ self.java_send :setLine, [Java::float, Java::float, Java::float, Java::float,], i.getX, i.getY, i.getX + i.getWidth, i.getY + i.getHeight
115
+ self
116
+ end
117
+
118
+ def expand(amt)
119
+ raise NoMethodError, "Oblique line #{self.inspect} has no #expand method." if oblique?
120
+ r = Ruling.new(self.top, self.left, self.width, self.height)
121
+ r.start = r.start - amt
122
+ r.end = r.end + amt
123
+ r
124
+ end
125
+
126
+
127
+ def length
128
+ Math.sqrt( (self.right - self.left).abs ** 2 + (self.bottom - self.top).abs ** 2 )
129
+ end
130
+
131
+ def vertical?
132
+ left == right
133
+ end
134
+
135
+ def horizontal?
136
+ top == bottom
137
+ end
138
+
139
+ def oblique?
140
+ !(vertical? || horizontal?)
141
+ end
142
+
143
+ def perpendicular_to?(other)
144
+ return self.vertical? == other.horizontal?
145
+ end
146
+
147
+ def to_json(arg)
148
+ [left, top, right, bottom].to_json
149
+ end
150
+
151
+ def colinear?(point)
152
+ point.x >= left && point.x <= right &&
153
+ point.y >= top && point.y <= bottom
154
+ end
155
+
156
+ ##
157
+ # calculate the intersection point between +self+ and other Ruling
158
+ def intersection_point(other)
159
+ # algo taken from http://mathworld.wolfram.com/Line-LineIntersection.html
160
+
161
+ #self and other should always be perpendicular, since one should be horizontal and one should be vertical
162
+ self_l = self.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)
163
+ other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)
164
+
165
+ return nil if !self_l.intersectsLine(other_l)
166
+
167
+ x1 = self_l.getX1; y1 = self_l.getY1
168
+ x2 = self_l.getX2; y2 = self_l.getY2
169
+ x3 = other_l.getX1; y3 = other_l.getY1
170
+ x4 = other_l.getX2; y4 = other_l.getY2
171
+
172
+ det = lambda { |a,b,c,d| a * d - b * c }
173
+
174
+ int_x = det.call(det.call(x1, y1, x2, y2), x1 - x2, det.call(x3, y3, x4, y4), x3 - x4) /
175
+ det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
176
+
177
+ int_y = det.call(det.call(x1, y1, x2, y2), y1 - y2,
178
+ det.call(x3, y3, x4, y4), y3 - y4) /
179
+ det.call(x1 - x2, y1 - y2, x3 - x4, y3 - y4)
180
+
181
+ return nil if int_x.nan? || int_y.nan? # TODO is this right?
182
+
183
+
184
+ java.awt.geom.Point2D::Float.new(int_x, int_y)
185
+ end
186
+
187
+ ##
188
+ # Find all intersection points between two lists of +Ruling+
189
+ # (+horizontals+ and +verticals+)
190
+ # TODO: this is O(n^2) - optimize.
191
+ def self.find_intersections(horizontals, verticals)
192
+ horizontals.product(verticals).inject({}) do |memo, (h, v)|
193
+ ip = h.intersection_point(v)
194
+ unless ip.nil?
195
+ memo[ip] ||= []
196
+ # TODO: stupid hack for FLA pdfs where lines appear to intersect, but don't.
197
+ memo[ip] << [h.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), v.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)]
198
+ end
199
+ memo
200
+ end
201
+ end
202
+
203
+ ##
204
+ # crop an enumerable of +Ruling+ to an +area+
205
+ def self.crop_rulings_to_area(rulings, area)
206
+ rulings.reduce([]) do |memo, r|
207
+ if r.intersects(area)
208
+ memo << r.clone.intersect(area)
209
+ end
210
+ memo
211
+ end
212
+ end
213
+
214
+ # TODO do we really need this one anymore?
215
+ def self.clean_rulings(rulings, max_distance=4)
216
+
217
+ # merge horizontal and vertical lines
218
+ # TODO this should be iterative
219
+
220
+ skip = false
221
+
222
+ horiz = rulings.select { |r| r.horizontal? }
223
+ .group_by(&:top)
224
+ .values.reduce([]) do |memo, rs|
225
+
226
+ rs = rs.sort_by(&:left)
227
+ if rs.size > 1
228
+ memo +=
229
+ rs.each_cons(2)
230
+ .chunk { |p| p[1].left - p[0].right < 7 }
231
+ .select { |c| c[0] }
232
+ .map { |group|
233
+ group = group.last.flatten.uniq
234
+ Tabula::Ruling.new(group[0].top,
235
+ group[0].left,
236
+ group[-1].right - group[0].left,
237
+ 0)
238
+ }
239
+ Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
240
+ else
241
+ memo << rs.first
242
+ end
243
+ memo
244
+ end
245
+ .sort_by(&:top)
246
+
247
+ h = []
248
+ horiz.size.times do |i|
249
+
250
+ if i == horiz.size - 1
251
+ h << horiz[-1]
252
+ break
253
+ end
254
+
255
+ if skip
256
+ skip = false;
257
+ next
258
+ end
259
+ d = (horiz[i+1].top - horiz[i].top).abs
260
+
261
+ h << if d < max_distance # THRESHOLD DISTANCE between horizontal lines
262
+ skip = true
263
+ Tabula::Ruling.new(horiz[i].top + d / 2, [horiz[i].left, horiz[i+1].left].min, [horiz[i+1].width.abs, horiz[i].width.abs].max, 0)
264
+ else
265
+ horiz[i]
266
+ end
267
+ end
268
+ horiz = h
269
+
270
+ vert = rulings.select { |r| r.vertical? }
271
+ .group_by(&:left)
272
+ .values
273
+ .reduce([]) do |memo, rs|
274
+
275
+ rs = rs.sort_by(&:top)
276
+
277
+ if rs.size > 1
278
+ # Here be dragons:
279
+ # merge consecutive segments of lines that are close enough
280
+ memo +=
281
+ rs.each_cons(2)
282
+ .chunk { |p| p[1].top - p[0].bottom < 7 }
283
+ .select { |c| c[0] }
284
+ .map { |group|
285
+ group = group.last.flatten.uniq
286
+ Tabula::Ruling.new(group[0].top,
287
+ group[0].left,
288
+ 0,
289
+ group[-1].bottom - group[0].top)
290
+ }
291
+ else
292
+ memo << rs.first
293
+ end
294
+ memo
295
+ end.sort_by(&:left)
296
+
297
+ return horiz += vert
298
+ end
299
+ end
300
+ end
@@ -0,0 +1,92 @@
1
+ module Tabula
2
+ # a counterpart of Table, to be sure.
3
+ # not sure yet what their relationship ought to be.
4
+ class Spreadsheet < ZoneEntity
5
+ include Tabula::HasCells
6
+ attr_accessor :cells, :vertical_ruling_lines, :horizontal_ruling_lines, :cells_resolved
7
+
8
+ def initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) #, lines)
9
+ super(top, left, width, height)
10
+ @cells = cells
11
+ @page = page
12
+ @vertical_ruling_lines = vertical_ruling_lines
13
+ @horizontal_ruling_lines = horizontal_ruling_lines
14
+ end
15
+
16
+ def ruling_lines
17
+ @vertical_ruling_lines + @horizontal_ruling_lines
18
+ end
19
+
20
+ def ruling_lines=(lines)
21
+ @vertical_ruling_lines = lines.select{|vl| vl.vertical? && spr.intersectsLine(vl) }
22
+ @horizontal_ruling_lines = lines.select{|hl| hl.horizontal? && spr.intersectsLine(hl) }
23
+ end
24
+
25
+ def fill_in_cells!
26
+ unless @cells_resolved
27
+ @cells_resolved = true
28
+ cells.each do |cell|
29
+ cell.text_elements = @page.get_cell_text(cell)
30
+ end
31
+ end
32
+ end
33
+
34
+ # call `rows` with `evaluate_cells` as `false` to defer filling in the text in
35
+ # each cell, which can be computationally intensive.
36
+ def rows(evaluate_cells=true)
37
+ if evaluate_cells
38
+ fill_in_cells!
39
+ end
40
+ tops = cells.map(&:top).uniq.sort
41
+ array_of_rows = tops.map do |top|
42
+ cells.select{|c| c.top == top }.sort_by(&:left)
43
+ end
44
+ #here, insert another kind of placeholder for empty corners
45
+ # like in 01001523B_China.pdf
46
+ #TODO: support placeholders for "empty" cells in rows other than row 1, and in #cols
47
+ # puts array_of_rows[0].inspect
48
+ if array_of_rows.size > 2
49
+ if array_of_rows[0].map(&:left).uniq.size < array_of_rows[1].map(&:left).uniq.size
50
+ missing_spots = array_of_rows[1].map(&:left) - array_of_rows[0].map(&:left)
51
+ # puts missing_spots.inspect
52
+ missing_spots.each do |missing_spot|
53
+ missing_spot_placeholder = Cell.new(array_of_rows[0][0].top, missing_spot, 0, 0)
54
+ missing_spot_placeholder.placeholder = true
55
+ array_of_rows[0] << missing_spot_placeholder
56
+ end
57
+ end
58
+ array_of_rows[0].sort_by!(&:left)
59
+ end
60
+ array_of_rows
61
+ end
62
+
63
+ # call `cols` with `evaluate_cells` as `false` to defer filling in the text in
64
+ # each cell, which can be computationally intensive.
65
+ def cols(evaluate_cells=true)
66
+ if evaluate_cells
67
+ fill_in_cells!
68
+ end
69
+ lefts = cells.map(&:left).uniq.sort
70
+ lefts.map do |left|
71
+ cells.select{|c| c.left == left }.sort_by(&:top)
72
+ end
73
+ end
74
+
75
+ def to_a
76
+ fill_in_cells!
77
+ rows.map{ |row_cells| row_cells.map(&:text) }
78
+ end
79
+
80
+ def to_csv
81
+ out = StringIO.new
82
+ Tabula::Writers.CSV(rows, out)
83
+ out.string
84
+ end
85
+
86
+ def to_tsv
87
+ out = StringIO.new
88
+ Tabula::Writers.TSV(rows, out)
89
+ out.string
90
+ end
91
+ end
92
+ end
@@ -0,0 +1,81 @@
1
+ module Tabula
2
+ class Table
3
+ attr_reader :lines
4
+ def initialize(line_count, separators)
5
+ @separators = separators
6
+ @lines = (0...line_count).inject([]) { |m| m << Line.new }
7
+ end
8
+
9
+ def add_text_element(text_element, i, j)
10
+ if @lines.size <= i
11
+ @lines[i] = Line.new
12
+ end
13
+ if @lines[i].text_elements[j]
14
+ @lines[i].text_elements[j].merge!(text_element)
15
+ else
16
+ @lines[i].text_elements[j] = text_element
17
+ end
18
+ end
19
+
20
+ def rpad!
21
+ max = lines.map{|l| l.text_elements.size}.max
22
+ lines.each do |line|
23
+ needed = max - line.text_elements.size
24
+ needed.times do
25
+ line.text_elements << TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
26
+ end
27
+ end
28
+ end
29
+
30
+ def cols
31
+ self.rpad!
32
+ lines.map(&:text_elements).transpose
33
+ end
34
+
35
+ def rows
36
+ self.rpad!
37
+ lines.map(&:text_elements)
38
+ end
39
+
40
+ # create a new Table object from an array of arrays, representing a list of rows in a spreadsheet
41
+ # probably only used for testing
42
+ def self.new_from_array(array_of_rows)
43
+ t = Table.new(array_of_rows.size, [])
44
+ array_of_rows.each_with_index do |row, index|
45
+ t.lines[index].text_elements = row.map{|cell| TextElement.new(nil, nil, nil, nil, nil, nil, cell, nil)}
46
+ end
47
+ t
48
+ end
49
+
50
+ #for equality testing, return @lines stripped of leading columns of empty strings
51
+ #TODO: write a method to strip all totally-empty columns (or not?)
52
+ def lstrip_lines
53
+ return @lines if @lines.include?(nil)
54
+ min_leading_empty_strings = Float::INFINITY
55
+ @lines.each do |line|
56
+ empties = line.text_elements.map{|t| t.nil? || t.text.empty? }
57
+ min_leading_empty_strings = [min_leading_empty_strings, empties.index(false)].min
58
+ end
59
+ if min_leading_empty_strings == 0
60
+ @lines
61
+ else
62
+ @lines.each{|line| line.text_elements = line.text_elements[min_leading_empty_strings..-1]}
63
+ @lines
64
+ end
65
+ end
66
+ def lstrip_lines!
67
+ @lines = self.lstrip_lines
68
+ end
69
+
70
+ #used for testing, ignores separator locations (they'll sometimes be nil/empty)
71
+ def ==(other)
72
+ self.instance_variable_set(:@lines, self.lstrip_lines)
73
+ other.instance_variable_set(:@lines, other.lstrip_lines)
74
+ self.instance_variable_set(:@lines, self.lines.rpad(nil, other.lines.size))
75
+ other.instance_variable_set(:@lines, other.lines.rpad(nil, self.lines.size))
76
+
77
+ self.lines.zip(other.lines).all? { |my, yours| my == yours }
78
+
79
+ end
80
+ end
81
+ end