tabula-extractor 0.5.1-java → 0.6.1-java

Sign up to get free protection for your applications and to get access to all the features.
data/bin/tabula CHANGED
@@ -34,6 +34,7 @@ EOS
34
34
 
35
35
  opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
36
36
  opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
37
+ opt :guess, 'Guess the portion of the page to analyze per page. Slow.'
37
38
  opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
38
39
  opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
39
40
  end
@@ -51,7 +52,6 @@ EOS
51
52
  Trollop::die 'file does not exist' unless File.exists? pdf_filename
52
53
 
53
54
  return opts, pdf_filename
54
-
55
55
  end
56
56
 
57
57
  def main
@@ -60,11 +60,15 @@ def main
60
60
  area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
61
61
  out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
62
62
  extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
63
- extractor.extract.each do |page|
64
- text = page.get_text(area)
65
- Tabula::Writers.send(opts[:format].to_sym,
66
- Tabula.make_table(text),
67
- out)
63
+ extractor.extract.each_with_index do |page, page_index|
64
+ page_areas = opts[:guess] ? Tabula::TableGuesser::find_rects_on_page(Tabula::TableGuesser::load_pdf(filename), page_index) : [area]
65
+
66
+ page_areas.each do |page_area|
67
+ text = page.get_text( page_area )
68
+ Tabula::Writers.send(opts[:format].to_sym,
69
+ Tabula.make_table(text),
70
+ out)
71
+ end
68
72
  end
69
73
  out.close
70
74
  end
data/lib/geom/point.rb ADDED
@@ -0,0 +1,21 @@
1
+ #
2
+ # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
+ # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
+ #
5
+
6
+
7
+ module Geometry
8
+ class Point < Struct.new(:x, :y)
9
+ def self.new_by_array(array)
10
+ self.new(array[0], array[1])
11
+ end
12
+
13
+ def ==(another_point)
14
+ x === another_point.x && y === another_point.y
15
+ end
16
+ end
17
+ end
18
+
19
+ def Point(x, y)
20
+ Geometry::Point.new(x, y)
21
+ end
@@ -0,0 +1,101 @@
1
+ #
2
+ # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
+ # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
+ #
5
+
6
+
7
+ module Geometry
8
+ class Rectangle < Struct.new(:point1, :point2)
9
+ SIMILARITY_DIVISOR = 20
10
+
11
+ def Rectangle.unionize(non_overlapping_rectangles, next_rect)
12
+ #if next_rect overlaps any of non_overlapping_rectangles
13
+ if (overlapping = non_overlapping_rectangles.select{|r| next_rect.overlaps? r}) && !non_overlapping_rectangles.empty?
14
+ #remove all of those that it overlaps from non_overlapping_rectangles and
15
+ non_overlapping_rectangles -= overlapping
16
+ #add to non_overlapping_rectangles the bounding box of the overlapping rectangles.
17
+ non_overlapping_rectangles << overlapping.inject(next_rect){|memo, overlap| memo.bounding_box(overlap) }
18
+
19
+ else
20
+ non_overlapping_rectangles << next_rect
21
+ end
22
+ end
23
+
24
+ def self.new_by_x_y_dims(x, y, width, height)
25
+ self.new( Point.new_by_array([x, y]), Point.new_by_array([x + width, y + height]) )
26
+ end
27
+
28
+ def x
29
+ [point1.x, point2.x].min
30
+ end
31
+
32
+ alias_method :left, :x
33
+
34
+ def y
35
+ #puts "y: [#{point1.y} #{point2.y}].min"
36
+ [point1.y, point2.y].min
37
+ end
38
+
39
+ alias_method :top, :y
40
+
41
+ def x2
42
+ [point1.x, point2.x].max
43
+ end
44
+
45
+ alias_method :right, :x2
46
+
47
+ def y2
48
+ #puts "y2: [#{point1.y} #{point2.y}].max"
49
+ [point1.y, point2.y].max
50
+ end
51
+
52
+ alias_method :bottom, :y2
53
+
54
+
55
+ def width
56
+ (point1.x - point2.x).abs
57
+ end
58
+
59
+ def height
60
+ (point1.y - point2.y).abs
61
+ end
62
+
63
+ def area
64
+ self.width * self.height
65
+ end
66
+
67
+ def similarity_hash
68
+ [self.x.to_i / SIMILARITY_DIVISOR, self.y.to_i / SIMILARITY_DIVISOR, self.width.to_i / SIMILARITY_DIVISOR, self.height.to_i / SIMILARITY_DIVISOR].to_s
69
+ end
70
+
71
+ def dims(*format)
72
+ if format
73
+ format.map{|method| self.send(method)}
74
+ else
75
+ [self.x, self.y, self.width, self.height]
76
+ end
77
+ end
78
+
79
+ def contains?(other_x, other_y)
80
+ (other_x <= x2 && other_x >= x ) && (other_y <= y2 && other_y > y)
81
+ end
82
+
83
+ def overlaps?(other_rect)
84
+ return contains?(other_rect.x, other_rect.y) || contains?(other_rect.x2, other_rect.y2) ||
85
+ contains?(other_rect.x, other_rect.y2) || contains?(other_rect.x2, other_rect.y) ||
86
+ other_rect.contains?(x, y) || other_rect.contains?(x2, y2) ||
87
+ other_rect.contains?(x, y2) || other_rect.contains?(x2, y)
88
+ end
89
+
90
+ def bounding_box(other_rect)
91
+ #new rect with bounding box of these two
92
+ new_x1 = [x, other_rect.x].min
93
+ new_y1 = [x, other_rect.y].min
94
+ new_x2 = [x2, other_rect.x2].max
95
+ new_y2 = [y2, other_rect.y2].max
96
+ new_width = (new_x2 - new_x1).abs
97
+ new_height = (new_y2 - new_y1).abs
98
+ Rectangle.new_by_x_y_dims(new_x1, new_y1, new_width, new_height)
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,82 @@
1
+ #
2
+ # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
+ # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
+ #
5
+
6
+
7
+ module Geometry
8
+ include Math
9
+ extend Math
10
+
11
+ def Geometry.distance(point1, point2)
12
+ hypot point1.x - point2.x, point1.y - point2.y
13
+ end
14
+
15
+
16
+ class Segment < Struct.new(:point1, :point2)
17
+ def self.new_by_arrays(point1_coordinates, point2_coordinates)
18
+ self.new(Point.new_by_array(point1_coordinates),
19
+ Point.new_by_array(point2_coordinates))
20
+ end
21
+
22
+ def scale!(scale_factor)
23
+ self.point1.x = self.point1.x * scale_factor
24
+ self.point1.y = self.point1.y * scale_factor
25
+ self.point2.x = self.point2.x * scale_factor
26
+ self.point2.y = self.point2.y * scale_factor
27
+ end
28
+
29
+ def vertical?
30
+ point1.x == point2.x
31
+ end
32
+
33
+ def horizontal?
34
+ point1.y == point2.y
35
+ end
36
+
37
+ def leftmost_endpoint
38
+ ((point1.x <=> point2.x) == -1) ? point1 : point2
39
+ end
40
+
41
+ def rightmost_endpoint
42
+ ((point1.x <=> point2.x) == 1) ? point1 : point2
43
+ end
44
+
45
+ def topmost_endpoint
46
+ ((point1.y <=> point2.y) == 1) ? point1 : point2
47
+ end
48
+
49
+ def bottommost_endpoint
50
+ ((point1.y <=> point2.y) == -1) ? point1 : point2
51
+ end
52
+
53
+ def top
54
+ topmost_endpoint.y
55
+ end
56
+
57
+ def bottom
58
+ bottommost_endpoint.y
59
+ end
60
+ def width
61
+ (left - right).abs
62
+ end
63
+ def height
64
+ (bottom - top).abs
65
+ end
66
+
67
+ def left
68
+ leftmost_endpoint.x
69
+ end
70
+
71
+ def right
72
+ rightmost_endpoint.x
73
+ end
74
+ def length
75
+ Geometry.distance(point1, point2)
76
+ end
77
+ end
78
+ end
79
+
80
+ def Segment(point1, point2)
81
+ Geometry::Segment.new point1, point2
82
+ end
data/lib/tabula.rb CHANGED
@@ -7,5 +7,6 @@ require_relative './tabula/entities'
7
7
  require_relative './tabula/pdf_dump'
8
8
  require_relative './tabula/table_extractor'
9
9
  require_relative './tabula/writers'
10
+ require_relative './tabula/table_guesser'
10
11
  require_relative './tabula/line_segment_detector'
11
12
  require_relative './tabula/pdf_render'
@@ -99,10 +99,10 @@ module Tabula
99
99
 
100
100
  # spaces are not detected, b/c they have height == 0
101
101
  # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
102
- # self.texts.select { |t| t.overlaps? ze }
103
- self.texts.select { |t|
102
+ # self.texts.select { |t| t.overlaps? ze }
103
+ self.texts.select do |t|
104
104
  t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
105
- }
105
+ end
106
106
  end
107
107
 
108
108
  def to_json(options={})
@@ -120,7 +120,7 @@ module Tabula
120
120
  attr_accessor :font, :font_size, :text, :width_of_space
121
121
 
122
122
  CHARACTER_DISTANCE_THRESHOLD = 1.5
123
- TOLERANCE_FACTOR = 0.25
123
+ TOLERANCE_FACTOR = 0.25 #25
124
124
 
125
125
  def initialize(top, left, width, height, font, font_size, text, width_of_space)
126
126
  super(top, left, width, height)
@@ -149,7 +149,7 @@ module Tabula
149
149
  overlaps = self.vertically_overlaps?(other)
150
150
 
151
151
  up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
152
- down_tolerance = 0.95
152
+ down_tolerance = 0.90 #90?
153
153
 
154
154
  dist = self.horizontal_distance(other).abs
155
155
 
@@ -261,6 +261,10 @@ module Tabula
261
261
  r >= 0 and r < 1 and s >= 0 and s < 1
262
262
  end
263
263
 
264
+ def length
265
+ Math.sqrt( (self.right - self.left).abs ** 2 + (self.bottom - self.top).abs ** 2 )
266
+ end
267
+
264
268
  def vertical?
265
269
  left == right
266
270
  end
@@ -269,6 +273,13 @@ module Tabula
269
273
  top == bottom
270
274
  end
271
275
 
276
+ def right
277
+ left + width
278
+ end
279
+ def bottom
280
+ top + height
281
+ end
282
+
272
283
  def to_json(arg)
273
284
  [left, top, right, bottom].to_json
274
285
  end
@@ -46,7 +46,7 @@ module Tabula
46
46
  options = DETECT_LINES_DEFAULTS.merge(options)
47
47
 
48
48
  pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
49
- page = pdf_file.getDocumentCatalog.getAllPages[page_number - 1]
49
+ page = pdf_file.getDocumentCatalog.getAllPages[page_number]
50
50
  bi = Tabula::Render.pageToBufferedImage(page,
51
51
  options[:image_size])
52
52
  pdf_file.close
@@ -77,11 +77,12 @@ module Tabula
77
77
  class CharacterExtractor
78
78
  include Observable
79
79
 
80
+ #N.B. pages can be :all, a list of pages or a range.
80
81
  def initialize(pdf_filename, pages=[1])
81
82
  raise Errno::ENOENT unless File.exists?(pdf_filename)
82
83
  @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
83
84
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
84
- @pages = pages
85
+ @pages = pages == :all ? (1..@all_pages.size) : pages
85
86
  @extractor = TextExtractor.new
86
87
  end
87
88
 
@@ -20,8 +20,9 @@ module Tabula
20
20
  end
21
21
  end
22
22
 
23
- TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)
23
+ TRANSPARENT_WHITE = java.awt.Color.new(255, 255, 255, 0)
24
24
 
25
+ # 2048 width is important, if this is too small, thin lines won't be drawn.
25
26
  def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
26
27
  cropbox = page.findCropBox
27
28
  widthPt, heightPt = cropbox.getWidth, cropbox.getHeight
@@ -27,21 +27,38 @@ module Tabula
27
27
  # (ie, take into account vertical ruling lines if available)
28
28
  def group_by_columns
29
29
  columns = []
30
- tes = self.text_elements.sort_by(&:left)
30
+ tes = self.text_elements.sort_by &:left
31
31
 
32
32
  # we don't have vertical rulings
33
- tes.each do |te|
34
- if column = columns.detect { |c| te.horizontally_overlaps?(c) }
35
- column << te
36
- else
37
- columns << Column.new(te.left, te.width, [te])
33
+ if self.options[:vertical_rulings].empty?
34
+ tes.each do |te|
35
+ if column = columns.detect { |c| te.horizontally_overlaps?(c) }
36
+ column << te
37
+ else
38
+ columns << Column.new(te.left, te.width, [te])
39
+ end
40
+ end
41
+ else
42
+ self.options[:vertical_rulings].sort_by! &:left
43
+ 1.upto(self.options[:vertical_rulings].size - 1) do |i|
44
+ left_ruling_line = self.options[:vertical_rulings][i - 1]
45
+ right_ruling_line = self.options[:vertical_rulings][i]
46
+ columns << Column.new(left_ruling_line.left, right_ruling_line.left - left_ruling_line.left, []) if (right_ruling_line.left - left_ruling_line.left > 10)
47
+ end
48
+ tes.each do |te|
49
+ if column = columns.detect { |c| te.horizontally_overlaps?(c) }
50
+ column << te
51
+ else
52
+ puts "couldn't find a place for #{te.inspect}"
53
+ #columns << Column.new(te.left, te.width, [te])
54
+ end
38
55
  end
39
56
  end
40
57
  columns
41
58
  end
42
59
 
43
60
  def get_columns
44
- Tabula.group_by_columns(text_elements).map do |c|
61
+ TableExtractor.new(text_elements).group_by_columns.map do |c|
45
62
  {'left' => c.left, 'right' => c.right, 'width' => c.width}
46
63
  end
47
64
  end
@@ -87,6 +104,7 @@ module Tabula
87
104
 
88
105
  private
89
106
 
107
+ #this is where spaces come from!
90
108
  def merge_words!
91
109
  return self.text_elements if @merged # only merge once. awful hack.
92
110
  @merged = true
@@ -97,9 +115,12 @@ module Tabula
97
115
 
98
116
  char2 = self.text_elements[i+1]
99
117
 
118
+
119
+
100
120
  next if char2.nil? or char1.nil?
101
121
 
102
122
  if self.text_elements[current_word_index].should_merge?(char2)
123
+ #puts "merging: #{self.text_elements[current_word_index].text}/#{self.text_elements[current_word_index].width}"
103
124
  self.text_elements[current_word_index].merge!(char2)
104
125
  char1 = char2
105
126
  self.text_elements[i+1] = nil
@@ -107,13 +128,14 @@ module Tabula
107
128
  # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
108
129
  if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
109
130
  self.text_elements[current_word_index].text += " "
110
- self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
131
+ #self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
111
132
  end
112
133
  current_word_index = i+1
113
134
  end
114
135
  i += 1
115
136
  end
116
- return self.text_elements.compact!
137
+ self.text_elements.compact!
138
+ return self.text_elements
117
139
  end
118
140
  end
119
141
 
@@ -174,7 +196,7 @@ module Tabula
174
196
 
175
197
  lines.sort_by!(&:top)
176
198
 
177
- columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
199
+ columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words]}).group_by_columns.sort_by(&:left)
178
200
 
179
201
  # # insert empty cells if needed
180
202
  lines.each_with_index do |l, line_index|
@@ -183,23 +205,21 @@ module Tabula
183
205
  l.text_elements.uniq! # TODO WHY do I have to do this?
184
206
  l.text_elements.sort_by!(&:left)
185
207
 
186
- next unless l.text_elements.size < columns.size
208
+ #next unless l.text_elements.size < columns.size
187
209
 
188
210
  columns.each_with_index do |c, i|
189
- if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
211
+ if (i > l.text_elements.size - 1) or (!l.text_elements[i].nil? and !c.text_elements.include?(l.text_elements[i]))
190
212
  l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
191
213
  end
192
214
  end
193
215
  end
194
216
 
195
217
  # # merge elements that are in the same column
196
- columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
197
-
198
218
  lines.each_with_index do |l, line_index|
199
219
  next if l.text_elements.nil?
200
220
 
201
221
  (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
202
- next if l.text_elements[t1].nil? or l.text_elements[t2].nil?
222
+ next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
203
223
 
204
224
  # if same column...
205
225
  if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
@@ -233,4 +253,96 @@ module Tabula
233
253
  line.text_elements.sort_by(&:left)
234
254
  end
235
255
  end
256
+
257
+
258
+ def Tabula.make_table_with_vertical_rulings(text_elements, options={})
259
+ extractor = TableExtractor.new(text_elements, options)
260
+
261
+ # group by lines
262
+ lines = []
263
+ line_boundaries = extractor.get_line_boundaries
264
+
265
+ # find all the text elements
266
+ # contained within each detected line (table row) boundary
267
+ line_boundaries.each do |lb|
268
+ line = Line.new
269
+
270
+ line_members = text_elements.find_all do |te|
271
+ te.vertically_overlaps?(lb)
272
+ end
273
+
274
+ text_elements -= line_members
275
+
276
+ line_members.sort_by(&:left).each do |te|
277
+ # skip text_elements that only contain spaces
278
+ next if te.text =~ ONLY_SPACES_RE
279
+ line << te
280
+ end
281
+
282
+ lines << line if line.text_elements.size > 0
283
+ end
284
+
285
+ lines.sort_by!(&:top)
286
+
287
+ vertical_rulings = options[:vertical_rulings]
288
+ columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words], :vertical_rulings => vertical_rulings}).group_by_columns.sort_by(&:left)
289
+
290
+ # insert empty cells if needed
291
+ lines.each_with_index do |l, line_index|
292
+ next if l.text_elements.nil?
293
+ l.text_elements.compact! # TODO WHY do I have to do this?
294
+ l.text_elements.uniq! # TODO WHY do I have to do this?
295
+ l.text_elements.sort_by!(&:left)
296
+
297
+ columns.each_with_index do |c, i|
298
+ if (l.text_elements.select{|te| te && te.left >= c.left && te.right <= (c.left + c.width)}.empty?)
299
+ l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
300
+ end
301
+ end
302
+ end
303
+
304
+ # merge elements that are in the same column
305
+ lines.each_with_index do |l, line_index|
306
+ next if l.text_elements.nil?
307
+
308
+ (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2| #don't remove a string of empty cells
309
+ next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
310
+
311
+ # if same column...
312
+ if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
313
+ == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
314
+ if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
315
+ l.text_elements[t1].merge!(l.text_elements[t2])
316
+ l.text_elements[t2] = nil
317
+ else
318
+ l.text_elements[t2].merge!(l.text_elements[t1])
319
+ l.text_elements[t1] = nil
320
+ end
321
+ end
322
+ end
323
+
324
+ l.text_elements.compact!
325
+ end
326
+
327
+ # remove duplicate lines
328
+ # TODO this shouldn't have happened here, check why we have to do
329
+ # this (maybe duplication is happening in the column merging phase?)
330
+ (0..lines.size - 2).each do |i|
331
+ next if lines[i].nil?
332
+ # if any of the elements on the next line is duplicated, kill
333
+ # the next line
334
+ if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
335
+ lines[i+1] = nil
336
+ end
337
+ end
338
+
339
+ lines.compact.map do |line|
340
+ line.text_elements.sort_by(&:left)
341
+ end
342
+ end
343
+
344
+
345
+
346
+
347
+
236
348
  end
@@ -0,0 +1,199 @@
1
+ require 'java'
2
+ require 'json'
3
+ require_relative '../geom/point'
4
+ require_relative '../geom/segment'
5
+ require_relative '../geom/rectangle'
6
+ require_relative './pdf_render'
7
+ #CLASSPATH=:./target/javacpp.jar:./target/javacv.jar:./target/javacv-macosx-x86_64.jar:./target/PDFRenderer-0.9.1.jar
8
+
9
+
10
+ module Tabula
11
+ module TableGuesser
12
+
13
+ def TableGuesser.find_and_write_rects(filename, output_dir)
14
+ #writes to JSON the rectangles on each page in the specified PDF.
15
+ open(File.join(output_dir, "tables.json"), 'w') do |f|
16
+ f.write( JSON.dump(find_rects(filename).map{|a| a.map{|r| r.dims.map &:to_i }} ))
17
+ end
18
+ end
19
+
20
+ def TableGuesser.find_rects(filename)
21
+ pdf = load_pdfbox_pdf(filename)
22
+
23
+ if pdf.getNumberOfPages == 0
24
+ puts "not a pdf!"
25
+ exit
26
+ end
27
+
28
+ puts "pages: " + pdf.getNumberOfPages.to_s
29
+
30
+ tables = []
31
+ pdf.getNumberOfPages.times do |i|
32
+ #gotcha: with PDFView, PDF pages are 1-indexed. If you ask for page 0 and then page 1, you'll get the first page twice. So start with index 1.
33
+ tables << find_rects_on_page(pdf, i + 1)
34
+ end
35
+ tables
36
+ end
37
+
38
+ def TableGuesser.find_lines(filename)
39
+ if pdf.getNumberOfPages == 0
40
+ puts "not a pdf!"
41
+ exit
42
+ end
43
+
44
+ puts "pages: " + pdf.getNumberOfPages.to_s
45
+
46
+ lines = []
47
+ pdf.getNumberOfPages.times do |i|
48
+ lines << detect_lines_in_pdf_page(filename, i)
49
+ end
50
+ lines
51
+ end
52
+
53
+ def TableGuesser.find_lines_on_page(pdf, page_index)
54
+ Tabula::LSD.detect_lines_in_pdf_page(pdf, page_index)
55
+ end
56
+
57
+ def TableGuesser.find_rects_on_page(pdf, page_index)
58
+ find_rects_from_lines(find_lines_on_page(pdf, page_index, 10))
59
+ end
60
+
61
+ def TableGuesser.find_rects_from_lines(lines)
62
+ horizontal_lines = lines.select &:horizontal?
63
+ vertical_lines = lines.select &:vertical?
64
+ find_tables(vertical_lines, horizontal_lines).inject([]){|memo, next_rect| Geometry::Rectangle.unionize(memo, next_rect )}.sort_by(&:area).reverse
65
+ end
66
+
67
+
68
+ def TableGuesser.euclidean_distance(x1, y1, x2, y2)
69
+ return Math.sqrt( ((x1 - x2) ** 2) + ((y1 - y2) ** 2) )
70
+ end
71
+
72
+ def TableGuesser.is_upward_oriented(line, y_value)
73
+ #return true if this line is oriented upwards, i.e. if the majority of its length is above y_value.
74
+ return (y_value - line.top > line.bottom - y_value);
75
+ end
76
+
77
+ def TableGuesser.find_tables(verticals, horizontals)
78
+ # /*
79
+ # * Find all the rectangles in the vertical and horizontal lines given.
80
+ # *
81
+ # * Rectangles are deduped with hashRectangle, which considers two rectangles identical if each point rounds to the same tens place as the other.
82
+ # *
83
+ # * TODO: generalize this.
84
+ # */
85
+ corner_proximity_threshold = 0.10;
86
+
87
+ rectangles = []
88
+ #find rectangles with one horizontal line and two vertical lines that end within $threshold to the ends of the horizontal line.
89
+
90
+ [true, false].each do |up_or_down_lines|
91
+ horizontals.each do |horizontal_line|
92
+ horizontal_line_length = horizontal_line.length
93
+
94
+ has_vertical_line_from_the_left = false
95
+ left_vertical_line = nil
96
+ #for the left vertical line.
97
+ verticals.each do |vertical_line|
98
+ #1. if it is correctly oriented (up or down) given the outer loop here. (We don't want a false-positive rectangle with one "arm" going down, and one going up.)
99
+ next unless is_upward_oriented(vertical_line, horizontal_line.top) == up_or_down_lines
100
+
101
+ vertical_line_length = vertical_line.length
102
+ longer_line_length = [horizontal_line_length, vertical_line_length].max
103
+ corner_proximity = corner_proximity_threshold * longer_line_length
104
+ #make this the left vertical line:
105
+ #2. if it begins near the left vertex of the horizontal line.
106
+ if euclidean_distance(horizontal_line.left, horizontal_line.top, vertical_line.left, vertical_line.top) < corner_proximity ||
107
+ euclidean_distance(horizontal_line.left, horizontal_line.top, vertical_line.left, vertical_line.bottom) < corner_proximity
108
+ #3. if it is farther to the left of the line we already have.
109
+ if left_vertical_line.nil? || left_vertical_line.left> vertical_line.left #is this line is more to the left than left_vertical_line. #"What's your opinion on Das Kapital?"
110
+ has_vertical_line_from_the_left = true
111
+ left_vertical_line = vertical_line
112
+ end
113
+ end
114
+ end
115
+
116
+ has_vertical_line_from_the_right = false;
117
+ right_vertical_line = nil
118
+ #for the right vertical line.
119
+ verticals.each do |vertical_line|
120
+ next unless is_upward_oriented(vertical_line, horizontal_line.top) == up_or_down_lines
121
+ vertical_line_length = vertical_line.length
122
+ longer_line_length = [horizontal_line_length, vertical_line_length].max
123
+ corner_proximity = corner_proximity_threshold * longer_line_length
124
+ if euclidean_distance(horizontal_line.right, horizontal_line.top, vertical_line.left, vertical_line.top) < corner_proximity ||
125
+ euclidean_distance(horizontal_line.right, horizontal_line.top, vertical_line.left, vertical_line.bottom) < corner_proximity
126
+
127
+ if right_vertical_line.nil? || right_vertical_line.right > vertical_line.right #is this line is more to the right than right_vertical_line. #"Can you recite all of John Galt's speech?"
128
+ #do two passes to guarantee we don't get a horizontal line with an upwards and a downwards line coming from each of its corners.
129
+ #i.e. ensuring that both "arms" of the rectangle have the same orientation (up or down).
130
+ has_vertical_line_from_the_right = true
131
+ right_vertical_line = vertical_line
132
+ end
133
+ end
134
+ end
135
+
136
+ if has_vertical_line_from_the_right && has_vertical_line_from_the_left
137
+ #in case we eventually tolerate not-quite-vertical lines, this computes the distance in Y directly, rather than depending on the vertical lines' lengths.
138
+ height = [left_vertical_line.bottom - left_vertical_line.top, right_vertical_line.bottom - right_vertical_line.top].max
139
+
140
+ y = [left_vertical_line.top, right_vertical_line.top].min
141
+ width = horizontal_line.right - horizontal_line.left
142
+ r = Geometry::Rectangle.new_by_x_y_dims(horizontal_line.left, y, width, height ) #x, y, w, h
143
+ #rectangles.put(hashRectangle(r), r); #TODO: I don't think I need this now that I'm in Rubyland
144
+ rectangles << r
145
+ end
146
+ end
147
+
148
+ #find rectangles with one vertical line and two horizontal lines that end within $threshold to the ends of the vertical line.
149
+ verticals.each do |vertical_line|
150
+ vertical_line_length = vertical_line.length
151
+
152
+ has_horizontal_line_from_the_top = false
153
+ top_horizontal_line = nil
154
+ #for the top horizontal line.
155
+ horizontals.each do |horizontal_line|
156
+ horizontal_line_length = horizontal_line.length
157
+ longer_line_length = [horizontal_line_length, vertical_line_length].max
158
+ corner_proximity = corner_proximity_threshold * longer_line_length
159
+
160
+ if euclidean_distance(vertical_line.left, vertical_line.top, horizontal_line.left, horizontal_line.top) < corner_proximity ||
161
+ euclidean_distance(vertical_line.left, vertical_line.top, horizontal_line.right, horizontal_line.top) < corner_proximity
162
+ if top_horizontal_line.nil? || top_horizontal_line.top > horizontal_line.top #is this line is more to the top than the one we've got already.
163
+ has_horizontal_line_from_the_top = true;
164
+ top_horizontal_line = horizontal_line;
165
+ end
166
+ end
167
+ end
168
+ has_horizontal_line_from_the_bottom = false;
169
+ bottom_horizontal_line = nil
170
+ #for the bottom horizontal line.
171
+ horizontals.each do |horizontal_line|
172
+ horizontal_line_length = horizontal_line.length
173
+ longer_line_length = [horizontal_line_length, vertical_line_length].max
174
+ corner_proximity = corner_proximity_threshold * longer_line_length
175
+
176
+ if euclidean_distance(vertical_line.left, vertical_line.bottom, horizontal_line.left, horizontal_line.top) < corner_proximity ||
177
+ euclidean_distance(vertical_line.left, vertical_line.bottom, horizontal_line.left, horizontal_line.top) < corner_proximity
178
+ if bottom_horizontal_line.nil? || bottom_horizontal_line.bottom > horizontal_line.bottom #is this line is more to the bottom than the one we've got already.
179
+ has_horizontal_line_from_the_bottom = true;
180
+ bottom_horizontal_line = horizontal_line;
181
+ end
182
+ end
183
+ end
184
+
185
+ if has_horizontal_line_from_the_bottom && has_horizontal_line_from_the_top
186
+ x = [top_horizontal_line.left, bottom_horizontal_line.left].min
187
+ y = vertical_line.top
188
+ width = [top_horizontal_line.right - top_horizontal_line.left, bottom_horizontal_line.right - bottom_horizontal_line.right].max
189
+ height = vertical_line.bottom - vertical_line.top
190
+ r = Geometry::Rectangle.new_by_x_y_dims(x, y, width, height); #x, y, w, h
191
+ #rectangles.put(hashRectangle(r), r);
192
+ rectangles << r
193
+ end
194
+ end
195
+ end
196
+ return rectangles.uniq &:similarity_hash
197
+ end
198
+ end
199
+ end
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.5.1'
2
+ VERSION = '0.6.1'
3
3
  end
@@ -6,7 +6,7 @@ require 'tabula/version'
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "tabula-extractor"
8
8
  s.version = Tabula::VERSION
9
- s.authors = ["Manuel Aristarán"]
9
+ s.authors = ["Manuel Aristarán", "Jeremy B. Merill", "Mike Tigas"]
10
10
  s.email = ["manuel@jazzido.com"]
11
11
  s.homepage = "https://github.com/jazzido/tabula-extractor"
12
12
  s.summary = %q{extract tables from PDF files}
@@ -14,7 +14,7 @@ Gem::Specification.new do |s|
14
14
 
15
15
  s.platform = 'java'
16
16
 
17
- shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
17
+ shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll', 'liblsd64.dll'].map { |f| 'ext/' + f }
18
18
  s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
19
19
  s.test_files = `git ls-files -- {test,features}/*`.split("\n")
20
20
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
Binary file
data/test/tests.rb CHANGED
@@ -23,6 +23,8 @@ class TestPagesInfoExtractor < Minitest::Test
23
23
  end
24
24
  end
25
25
 
26
+ class TestTableGuesser < MiniTest::Unit::TestCase
27
+ end
26
28
 
27
29
  class TestDumper < Minitest::Test
28
30
 
@@ -60,6 +62,77 @@ class TestExtractor < Minitest::Test
60
62
  assert_equal expected, lines_to_array(Tabula.make_table(characters))
61
63
  end
62
64
 
65
+ def test_forest_disclosure_report_dont_regress
66
+ # this is the current state of the expected output. Ideally the output should be like
67
+ # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
68
+ # and a solution for half-x-height-offset lines.
69
+ pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
70
+ character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
71
+ lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
72
+ vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
73
+
74
+
75
+ characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
76
+ #top left bottom right
77
+ expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
78
+ ['TOTAL', '', '', '','$85.00'],
79
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
80
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
81
+ ['TOTAL', '', '', '', '$471.25'],
82
+ ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
83
+ ['TOTAL', '', '', '','$20.39'],
84
+ ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
85
+ ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
86
+ ['TOTAL', '', '', '', '$5,010.33'],
87
+ ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
88
+ ['TOTAL', '', '', '', '$193.67'],
89
+ ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
90
+
91
+ assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
92
+ end
93
+
94
+ def test_missing_spaces_around_an_ampersand
95
+ pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
96
+ character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
97
+ lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
98
+ vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
99
+
100
+
101
+ characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
102
+ #top left bottom right
103
+ expected = [
104
+ ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
105
+ ]
106
+
107
+ assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
108
+ end
109
+
110
+ def test_forest_disclosure_report
111
+ pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
112
+ character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
113
+ lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
114
+ vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
115
+
116
+
117
+ characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
118
+ #top left bottom right
119
+ expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
120
+ ['TOTAL', '', '', '','$85.00'],
121
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
122
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
123
+ ['TOTAL', '', '', '', '$471.25'],
124
+ ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
125
+ ['TOTAL', '', '', '','$20.39'],
126
+ ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
127
+ ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '$4,700.00'],
128
+ ['TOTAL', '', '', '', '$5,010.33'],
129
+ ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
130
+ ['TOTAL', '', '', '', '$193.67'],
131
+ ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
132
+
133
+ assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
134
+ end
135
+
63
136
  # TODO Spaces inserted in words - fails
64
137
  def test_bo_page24
65
138
  character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
metadata CHANGED
@@ -2,14 +2,16 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.5.1
5
+ version: 0.6.1
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
9
+ - Jeremy B. Merill
10
+ - Mike Tigas
9
11
  autorequire:
10
12
  bindir: bin
11
13
  cert_chain: []
12
- date: 2013-06-14 00:00:00.000000000 Z
14
+ date: 2013-06-18 00:00:00.000000000 Z
13
15
  dependencies:
14
16
  - !ruby/object:Gem::Dependency
15
17
  name: minitest
@@ -93,6 +95,9 @@ files:
93
95
  - ext/liblsd64.dll
94
96
  - ext/lsd.c
95
97
  - ext/lsd.h
98
+ - lib/geom/point.rb
99
+ - lib/geom/rectangle.rb
100
+ - lib/geom/segment.rb
96
101
  - lib/tabula.rb
97
102
  - lib/tabula/core_ext.rb
98
103
  - lib/tabula/entities.rb
@@ -100,6 +105,7 @@ files:
100
105
  - lib/tabula/pdf_dump.rb
101
106
  - lib/tabula/pdf_render.rb
102
107
  - lib/tabula/table_extractor.rb
108
+ - lib/tabula/table_guesser.rb
103
109
  - lib/tabula/version.rb
104
110
  - lib/tabula/whitespace.rb
105
111
  - lib/tabula/writers.rb
@@ -108,6 +114,7 @@ files:
108
114
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
109
115
  - test/data/argentina_diputados_voting_record.pdf
110
116
  - test/data/bo_page24.pdf
117
+ - test/data/frx_2012_disclosure.pdf
111
118
  - test/data/gre.pdf
112
119
  - test/data/tabla_subsidios.pdf
113
120
  - test/tests.rb
@@ -147,6 +154,7 @@ test_files:
147
154
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
148
155
  - test/data/argentina_diputados_voting_record.pdf
149
156
  - test/data/bo_page24.pdf
157
+ - test/data/frx_2012_disclosure.pdf
150
158
  - test/data/gre.pdf
151
159
  - test/data/tabla_subsidios.pdf
152
160
  - test/tests.rb