tabula-extractor 0.5.1-java → 0.6.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/tabula CHANGED
@@ -34,6 +34,7 @@ EOS
34
34
 
35
35
  opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
36
36
  opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
37
+ opt :guess, 'Guess the portion of the page to analyze per page. Slow.'
37
38
  opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
38
39
  opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
39
40
  end
@@ -51,7 +52,6 @@ EOS
51
52
  Trollop::die 'file does not exist' unless File.exists? pdf_filename
52
53
 
53
54
  return opts, pdf_filename
54
-
55
55
  end
56
56
 
57
57
  def main
@@ -60,11 +60,15 @@ def main
60
60
  area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
61
61
  out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
62
62
  extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
63
- extractor.extract.each do |page|
64
- text = page.get_text(area)
65
- Tabula::Writers.send(opts[:format].to_sym,
66
- Tabula.make_table(text),
67
- out)
63
+ extractor.extract.each_with_index do |page, page_index|
64
+ page_areas = opts[:guess] ? Tabula::TableGuesser::find_rects_on_page(Tabula::TableGuesser::load_pdf(filename), page_index) : [area]
65
+
66
+ page_areas.each do |page_area|
67
+ text = page.get_text( page_area )
68
+ Tabula::Writers.send(opts[:format].to_sym,
69
+ Tabula.make_table(text),
70
+ out)
71
+ end
68
72
  end
69
73
  out.close
70
74
  end
data/lib/geom/point.rb ADDED
@@ -0,0 +1,21 @@
1
+ #
2
+ # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
+ # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
+ #
5
+
6
+
7
module Geometry
  # A 2-D point with mutable +x+ and +y+ coordinates.
  class Point < Struct.new(:x, :y)
    # Builds a point from an array whose first two entries are x and y.
    def self.new_by_array(array)
      new(array[0], array[1])
    end

    # Coordinate-wise equality (each coordinate is compared with ===).
    # Anything that does not expose both +x+ and +y+ (nil, strings, ...) is
    # simply "not equal" instead of raising NoMethodError.
    def ==(another_point)
      return false unless another_point.respond_to?(:x) && another_point.respond_to?(:y)

      x === another_point.x && y === another_point.y
    end
  end
end
18
+
19
# Global shorthand constructor: Point(x, y) returns a Geometry::Point
# holding the given coordinates.
def Point(x, y)
  Geometry::Point.new(x, y)
end
@@ -0,0 +1,101 @@
1
+ #
2
+ # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
+ # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
+ #
5
+
6
+
7
module Geometry
  # An axis-aligned rectangle described by two opposite corner points.
  # The corners may be stored in any order; the coordinate accessors below
  # always report the min/max of the two.
  class Rectangle < Struct.new(:point1, :point2)
    # Bucket size used by #similarity_hash.
    SIMILARITY_DIVISOR = 20

    # Folds +next_rect+ into a list of rectangles.
    #
    # If +next_rect+ overlaps none of +non_overlapping_rectangles+ it is
    # appended to that array. Otherwise every rectangle it overlaps is dropped
    # and replaced by the bounding box of those rectangles and +next_rect+.
    # Returns the resulting array (designed to be used as an inject step).
    #
    # NOTE(review): the merged box is not re-checked against the remaining
    # rectangles, so the result may still contain overlaps.
    def self.unionize(non_overlapping_rectangles, next_rect)
      overlapping = non_overlapping_rectangles.select { |r| next_rect.overlaps?(r) }
      return non_overlapping_rectangles << next_rect if overlapping.empty?

      merged = overlapping.inject(next_rect) { |memo, overlap| memo.bounding_box(overlap) }
      (non_overlapping_rectangles - overlapping) << merged
    end

    # Builds a rectangle from one corner (x, y) plus a width and a height.
    def self.new_by_x_y_dims(x, y, width, height)
      new(Point.new_by_array([x, y]), Point.new_by_array([x + width, y + height]))
    end

    # Smallest x of the two corners.
    def x
      [point1.x, point2.x].min
    end

    alias_method :left, :x

    # Smallest y of the two corners.
    def y
      [point1.y, point2.y].min
    end

    alias_method :top, :y

    # Largest x of the two corners.
    def x2
      [point1.x, point2.x].max
    end

    alias_method :right, :x2

    # Largest y of the two corners.
    def y2
      [point1.y, point2.y].max
    end

    alias_method :bottom, :y2

    def width
      (point1.x - point2.x).abs
    end

    def height
      (point1.y - point2.y).abs
    end

    def area
      width * height
    end

    # String key built from x, y, width and height, each truncated to an
    # integer and integer-divided by SIMILARITY_DIVISOR. Rectangles with the
    # same key are treated as duplicates by callers.
    def similarity_hash
      [x, y, width, height].map { |value| value.to_i / SIMILARITY_DIVISOR }.to_s
    end

    # Returns the values of the named accessors, e.g. dims(:x2, :y2).
    # With no arguments returns [x, y, width, height].
    # (A splat is never nil, so the emptiness check is what selects the default.)
    def dims(*format)
      return [x, y, width, height] if format.empty?

      format.map { |method| send(method) }
    end

    # True when the given coordinates fall inside this rectangle.
    # Left, right and bottom edges are inclusive; the top edge is exclusive
    # (other_y must be strictly greater than y).
    # NOTE(review): the exclusive top edge looks unintentional - confirm.
    def contains?(other_x, other_y)
      (other_x <= x2 && other_x >= x) && (other_y <= y2 && other_y > y)
    end

    # True when any corner of either rectangle lies inside the other.
    # NOTE(review): rectangles that cross without containing a corner of one
    # another are not detected by this corner test.
    def overlaps?(other_rect)
      contains?(other_rect.x, other_rect.y) || contains?(other_rect.x2, other_rect.y2) ||
        contains?(other_rect.x, other_rect.y2) || contains?(other_rect.x2, other_rect.y) ||
        other_rect.contains?(x, y) || other_rect.contains?(x2, y2) ||
        other_rect.contains?(x, y2) || other_rect.contains?(x2, y)
    end

    # Smallest rectangle that encloses both this rectangle and +other_rect+.
    def bounding_box(other_rect)
      new_x1 = [x, other_rect.x].min
      new_y1 = [y, other_rect.y].min # top edge comes from the y coordinates
      new_x2 = [x2, other_rect.x2].max
      new_y2 = [y2, other_rect.y2].max
      Rectangle.new_by_x_y_dims(new_x1, new_y1, new_x2 - new_x1, new_y2 - new_y1)
    end
  end
end
@@ -0,0 +1,82 @@
1
+ #
2
+ # Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
3
+ # MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
4
+ #
5
+
6
+
7
module Geometry
  include Math
  extend Math

  # Straight-line distance between two points (anything responding to x and y).
  def self.distance(point1, point2)
    delta_x = point1.x - point2.x
    delta_y = point1.y - point2.y
    hypot(delta_x, delta_y)
  end

  # A line segment between two endpoints.
  #
  # "top" is the endpoint with the larger y and "bottom" the one with the
  # smaller y; when the two coordinates tie, point2 is reported.
  class Segment < Struct.new(:point1, :point2)
    # Builds a segment from two [x, y] coordinate arrays.
    def self.new_by_arrays(point1_coordinates, point2_coordinates)
      start_point = Point.new_by_array(point1_coordinates)
      end_point = Point.new_by_array(point2_coordinates)
      new(start_point, end_point)
    end

    # Multiplies all four coordinates by +scale_factor+, mutating the endpoints.
    def scale!(scale_factor)
      point1.x *= scale_factor
      point1.y *= scale_factor
      point2.x *= scale_factor
      point2.y *= scale_factor
    end

    def vertical?
      point1.x == point2.x
    end

    def horizontal?
      point1.y == point2.y
    end

    def leftmost_endpoint
      point1.x < point2.x ? point1 : point2
    end

    def rightmost_endpoint
      point1.x > point2.x ? point1 : point2
    end

    def topmost_endpoint
      point1.y > point2.y ? point1 : point2
    end

    def bottommost_endpoint
      point1.y < point2.y ? point1 : point2
    end

    def top
      topmost_endpoint.y
    end

    def bottom
      bottommost_endpoint.y
    end

    # Horizontal extent of the segment.
    def width
      (left - right).abs
    end

    # Vertical extent of the segment.
    def height
      (bottom - top).abs
    end

    def left
      leftmost_endpoint.x
    end

    def right
      rightmost_endpoint.x
    end

    # Euclidean length of the segment.
    def length
      Geometry.distance(point1, point2)
    end
  end
end
79
+
80
# Global shorthand constructor: Segment(p1, p2) returns a Geometry::Segment
# joining the two given endpoints.
def Segment(point1, point2)
  Geometry::Segment.new(point1, point2)
end
data/lib/tabula.rb CHANGED
@@ -7,5 +7,6 @@ require_relative './tabula/entities'
7
7
  require_relative './tabula/pdf_dump'
8
8
  require_relative './tabula/table_extractor'
9
9
  require_relative './tabula/writers'
10
+ require_relative './tabula/table_guesser'
10
11
  require_relative './tabula/line_segment_detector'
11
12
  require_relative './tabula/pdf_render'
@@ -99,10 +99,10 @@ module Tabula
99
99
 
100
100
  # spaces are not detected, b/c they have height == 0
101
101
  # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
102
- # self.texts.select { |t| t.overlaps? ze }
103
- self.texts.select { |t|
102
+ # self.texts.select { |t| t.overlaps? ze }
103
+ self.texts.select do |t|
104
104
  t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
105
- }
105
+ end
106
106
  end
107
107
 
108
108
  def to_json(options={})
@@ -120,7 +120,7 @@ module Tabula
120
120
  attr_accessor :font, :font_size, :text, :width_of_space
121
121
 
122
122
  CHARACTER_DISTANCE_THRESHOLD = 1.5
123
- TOLERANCE_FACTOR = 0.25
123
+ TOLERANCE_FACTOR = 0.25 #25
124
124
 
125
125
  def initialize(top, left, width, height, font, font_size, text, width_of_space)
126
126
  super(top, left, width, height)
@@ -149,7 +149,7 @@ module Tabula
149
149
  overlaps = self.vertically_overlaps?(other)
150
150
 
151
151
  up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
152
- down_tolerance = 0.95
152
+ down_tolerance = 0.90 #90?
153
153
 
154
154
  dist = self.horizontal_distance(other).abs
155
155
 
@@ -261,6 +261,10 @@ module Tabula
261
261
  r >= 0 and r < 1 and s >= 0 and s < 1
262
262
  end
263
263
 
264
# Euclidean length of this line, computed from its left/right/top/bottom edges.
def length
  horizontal_span = (right - left).abs
  vertical_span = (bottom - top).abs
  Math.sqrt(horizontal_span ** 2 + vertical_span ** 2)
end
267
+
264
268
  def vertical?
265
269
  left == right
266
270
  end
@@ -269,6 +273,13 @@ module Tabula
269
273
  top == bottom
270
274
  end
271
275
 
276
# x coordinate of the right edge, derived from the left edge and the width.
def right
  left + width
end
279
# y coordinate of the bottom edge, derived from the top edge and the height.
def bottom
  top + height
end
282
+
272
283
  def to_json(arg)
273
284
  [left, top, right, bottom].to_json
274
285
  end
@@ -46,7 +46,7 @@ module Tabula
46
46
  options = DETECT_LINES_DEFAULTS.merge(options)
47
47
 
48
48
  pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
49
- page = pdf_file.getDocumentCatalog.getAllPages[page_number - 1]
49
+ page = pdf_file.getDocumentCatalog.getAllPages[page_number]
50
50
  bi = Tabula::Render.pageToBufferedImage(page,
51
51
  options[:image_size])
52
52
  pdf_file.close
@@ -77,11 +77,12 @@ module Tabula
77
77
  class CharacterExtractor
78
78
  include Observable
79
79
 
80
+ #N.B. pages can be :all, a list of pages or a range.
80
81
  def initialize(pdf_filename, pages=[1])
81
82
  raise Errno::ENOENT unless File.exists?(pdf_filename)
82
83
  @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
83
84
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
84
- @pages = pages
85
+ @pages = pages == :all ? (1..@all_pages.size) : pages
85
86
  @extractor = TextExtractor.new
86
87
  end
87
88
 
@@ -20,8 +20,9 @@ module Tabula
20
20
  end
21
21
  end
22
22
 
23
- TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)
23
+ TRANSPARENT_WHITE = java.awt.Color.new(255, 255, 255, 0)
24
24
 
25
+ # 2048 width is important, if this is too small, thin lines won't be drawn.
25
26
  def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
26
27
  cropbox = page.findCropBox
27
28
  widthPt, heightPt = cropbox.getWidth, cropbox.getHeight
@@ -27,21 +27,38 @@ module Tabula
27
27
  # (ie, take into account vertical ruling lines if available)
28
28
  def group_by_columns
29
29
  columns = []
30
- tes = self.text_elements.sort_by(&:left)
30
+ tes = self.text_elements.sort_by &:left
31
31
 
32
32
  # we don't have vertical rulings
33
- tes.each do |te|
34
- if column = columns.detect { |c| te.horizontally_overlaps?(c) }
35
- column << te
36
- else
37
- columns << Column.new(te.left, te.width, [te])
33
+ if self.options[:vertical_rulings].empty?
34
+ tes.each do |te|
35
+ if column = columns.detect { |c| te.horizontally_overlaps?(c) }
36
+ column << te
37
+ else
38
+ columns << Column.new(te.left, te.width, [te])
39
+ end
40
+ end
41
+ else
42
+ self.options[:vertical_rulings].sort_by! &:left
43
+ 1.upto(self.options[:vertical_rulings].size - 1) do |i|
44
+ left_ruling_line = self.options[:vertical_rulings][i - 1]
45
+ right_ruling_line = self.options[:vertical_rulings][i]
46
+ columns << Column.new(left_ruling_line.left, right_ruling_line.left - left_ruling_line.left, []) if (right_ruling_line.left - left_ruling_line.left > 10)
47
+ end
48
+ tes.each do |te|
49
+ if column = columns.detect { |c| te.horizontally_overlaps?(c) }
50
+ column << te
51
+ else
52
+ puts "couldn't find a place for #{te.inspect}"
53
+ #columns << Column.new(te.left, te.width, [te])
54
+ end
38
55
  end
39
56
  end
40
57
  columns
41
58
  end
42
59
 
43
60
  def get_columns
44
- Tabula.group_by_columns(text_elements).map do |c|
61
+ TableExtractor.new(text_elements).group_by_columns.map do |c|
45
62
  {'left' => c.left, 'right' => c.right, 'width' => c.width}
46
63
  end
47
64
  end
@@ -87,6 +104,7 @@ module Tabula
87
104
 
88
105
  private
89
106
 
107
+ #this is where spaces come from!
90
108
  def merge_words!
91
109
  return self.text_elements if @merged # only merge once. awful hack.
92
110
  @merged = true
@@ -97,9 +115,12 @@ module Tabula
97
115
 
98
116
  char2 = self.text_elements[i+1]
99
117
 
118
+
119
+
100
120
  next if char2.nil? or char1.nil?
101
121
 
102
122
  if self.text_elements[current_word_index].should_merge?(char2)
123
+ #puts "merging: #{self.text_elements[current_word_index].text}/#{self.text_elements[current_word_index].width}"
103
124
  self.text_elements[current_word_index].merge!(char2)
104
125
  char1 = char2
105
126
  self.text_elements[i+1] = nil
@@ -107,13 +128,14 @@ module Tabula
107
128
  # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
108
129
  if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
109
130
  self.text_elements[current_word_index].text += " "
110
- self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
131
+ #self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
111
132
  end
112
133
  current_word_index = i+1
113
134
  end
114
135
  i += 1
115
136
  end
116
- return self.text_elements.compact!
137
+ self.text_elements.compact!
138
+ return self.text_elements
117
139
  end
118
140
  end
119
141
 
@@ -174,7 +196,7 @@ module Tabula
174
196
 
175
197
  lines.sort_by!(&:top)
176
198
 
177
- columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
199
+ columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words]}).group_by_columns.sort_by(&:left)
178
200
 
179
201
  # # insert empty cells if needed
180
202
  lines.each_with_index do |l, line_index|
@@ -183,23 +205,21 @@ module Tabula
183
205
  l.text_elements.uniq! # TODO WHY do I have to do this?
184
206
  l.text_elements.sort_by!(&:left)
185
207
 
186
- next unless l.text_elements.size < columns.size
208
+ #next unless l.text_elements.size < columns.size
187
209
 
188
210
  columns.each_with_index do |c, i|
189
- if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
211
+ if (i > l.text_elements.size - 1) or (!l.text_elements[i].nil? and !c.text_elements.include?(l.text_elements[i]))
190
212
  l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
191
213
  end
192
214
  end
193
215
  end
194
216
 
195
217
  # # merge elements that are in the same column
196
- columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
197
-
198
218
  lines.each_with_index do |l, line_index|
199
219
  next if l.text_elements.nil?
200
220
 
201
221
  (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
202
- next if l.text_elements[t1].nil? or l.text_elements[t2].nil?
222
+ next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
203
223
 
204
224
  # if same column...
205
225
  if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
@@ -233,4 +253,96 @@ module Tabula
233
253
  line.text_elements.sort_by(&:left)
234
254
  end
235
255
  end
256
+
257
+
258
# Builds a table from +text_elements+, taking column boundaries from
# options[:vertical_rulings] (options[:merge_words] is forwarded as well).
#
# Steps: assign elements to rows using the extractor's line boundaries,
# derive columns, pad every row with an empty TextElement for each column
# that has no element in it, merge elements of a row that belong to the same
# column, and drop rows that repeat an element of the previous row.
#
# Returns an array of rows; each row is an array of text elements sorted by
# their left edge.
def Tabula.make_table_with_vertical_rulings(text_elements, options={})
  row_boundaries = TableExtractor.new(text_elements, options).get_line_boundaries

  # group by lines: collect the text elements contained within each
  # detected line (table row) boundary
  lines = []
  row_boundaries.each do |boundary|
    row = Line.new
    members = text_elements.select { |te| te.vertically_overlaps?(boundary) }

    # an element already placed in a row is not considered for later rows
    text_elements -= members

    members.sort_by(&:left).each do |te|
      # skip text_elements that only contain spaces
      next if te.text =~ ONLY_SPACES_RE
      row << te
    end

    lines << row if row.text_elements.size > 0
  end

  lines.sort_by!(&:top)

  column_options = { :merge_words => options[:merge_words],
                     :vertical_rulings => options[:vertical_rulings] }
  row_elements = lines.map(&:text_elements).flatten.compact.uniq
  columns = TableExtractor.new(row_elements, column_options).group_by_columns.sort_by(&:left)

  # insert empty cells if needed
  lines.each do |l|
    next if l.text_elements.nil?
    l.text_elements.compact! # TODO WHY do I have to do this?
    l.text_elements.uniq! # TODO WHY do I have to do this?
    l.text_elements.sort_by!(&:left)

    columns.each_with_index do |c, i|
      occupied = l.text_elements.any? do |te|
        te && te.left >= c.left && te.right <= (c.left + c.width)
      end
      next if occupied
      l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
    end
  end

  # merge elements that are in the same column
  lines.each do |l|
    next if l.text_elements.nil?

    (0...l.text_elements.size).to_a.combination(2).each do |t1, t2|
      first = l.text_elements[t1]
      second = l.text_elements[t2]
      # don't remove a string of empty cells
      next if first.nil? or second.nil? or first.text.empty? or second.text.empty?

      first_column = columns.detect { |c| c.text_elements.include? first }
      second_column = columns.detect { |c| c.text_elements.include? second }
      next unless first_column == second_column

      # same column: the element whose bottom is higher up absorbs the other
      if first.bottom <= second.bottom
        first.merge!(second)
        l.text_elements[t2] = nil
      else
        second.merge!(first)
        l.text_elements[t1] = nil
      end
    end

    l.text_elements.compact!
  end

  # remove duplicate lines
  # TODO this shouldn't have happened here, check why we have to do
  # this (maybe duplication is happening in the column merging phase?)
  (0..lines.size - 2).each do |i|
    next if lines[i].nil?
    # if any of the elements on the next line is duplicated, kill
    # the next line
    repeated = (0..lines[i].text_elements.size-1).any? do |j|
      lines[i].text_elements[j] == lines[i+1].text_elements[j]
    end
    lines[i+1] = nil if repeated
  end

  lines.compact.map { |line| line.text_elements.sort_by(&:left) }
end
343
+
344
+
345
+
346
+
347
+
236
348
  end
@@ -0,0 +1,199 @@
1
+ require 'java'
2
+ require 'json'
3
+ require_relative '../geom/point'
4
+ require_relative '../geom/segment'
5
+ require_relative '../geom/rectangle'
6
+ require_relative './pdf_render'
7
+ #CLASSPATH=:./target/javacpp.jar:./target/javacv.jar:./target/javacv-macosx-x86_64.jar:./target/PDFRenderer-0.9.1.jar
8
+
9
+
10
+ module Tabula
11
+ module TableGuesser
12
+
13
+ def TableGuesser.find_and_write_rects(filename, output_dir)
14
+ #writes to JSON the rectangles on each page in the specified PDF.
15
+ open(File.join(output_dir, "tables.json"), 'w') do |f|
16
+ f.write( JSON.dump(find_rects(filename).map{|a| a.map{|r| r.dims.map &:to_i }} ))
17
+ end
18
+ end
19
+
20
+ def TableGuesser.find_rects(filename)
21
+ pdf = load_pdfbox_pdf(filename)
22
+
23
+ if pdf.getNumberOfPages == 0
24
+ puts "not a pdf!"
25
+ exit
26
+ end
27
+
28
+ puts "pages: " + pdf.getNumberOfPages.to_s
29
+
30
+ tables = []
31
+ pdf.getNumberOfPages.times do |i|
32
+ #gotcha: with PDFView, PDF pages are 1-indexed. If you ask for page 0 and then page 1, you'll get the first page twice. So start with index 1.
33
+ tables << find_rects_on_page(pdf, i + 1)
34
+ end
35
+ tables
36
+ end
37
+
38
+ def TableGuesser.find_lines(filename)
39
+ if pdf.getNumberOfPages == 0
40
+ puts "not a pdf!"
41
+ exit
42
+ end
43
+
44
+ puts "pages: " + pdf.getNumberOfPages.to_s
45
+
46
+ lines = []
47
+ pdf.getNumberOfPages.times do |i|
48
+ lines << detect_lines_in_pdf_page(filename, i)
49
+ end
50
+ lines
51
+ end
52
+
53
+ def TableGuesser.find_lines_on_page(pdf, page_index)
54
+ Tabula::LSD.detect_lines_in_pdf_page(pdf, page_index)
55
+ end
56
+
57
+ def TableGuesser.find_rects_on_page(pdf, page_index)
58
+ find_rects_from_lines(find_lines_on_page(pdf, page_index, 10))
59
+ end
60
+
61
+ def TableGuesser.find_rects_from_lines(lines)
62
+ horizontal_lines = lines.select &:horizontal?
63
+ vertical_lines = lines.select &:vertical?
64
+ find_tables(vertical_lines, horizontal_lines).inject([]){|memo, next_rect| Geometry::Rectangle.unionize(memo, next_rect )}.sort_by(&:area).reverse
65
+ end
66
+
67
+
68
+ def TableGuesser.euclidean_distance(x1, y1, x2, y2)
69
+ return Math.sqrt( ((x1 - x2) ** 2) + ((y1 - y2) ** 2) )
70
+ end
71
+
72
+ def TableGuesser.is_upward_oriented(line, y_value)
73
+ #return true if this line is oriented upwards, i.e. if the majority of it's length is above y_value.
74
+ return (y_value - line.top > line.bottom - y_value);
75
+ end
76
+
77
+ def TableGuesser.find_tables(verticals, horizontals)
78
+ # /*
79
+ # * Find all the rectangles in the vertical and horizontal lines given.
80
+ # *
81
+ # * Rectangles are deduped with hashRectangle, which considers two rectangles identical if each point rounds to the same tens place as the other.
82
+ # *
83
+ # * TODO: generalize this.
84
+ # */
85
+ corner_proximity_threshold = 0.10;
86
+
87
+ rectangles = []
88
+ #find rectangles with one horizontal line and two vertical lines that end within $threshold to the ends of the horizontal line.
89
+
90
+ [true, false].each do |up_or_down_lines|
91
+ horizontals.each do |horizontal_line|
92
+ horizontal_line_length = horizontal_line.length
93
+
94
+ has_vertical_line_from_the_left = false
95
+ left_vertical_line = nil
96
+ #for the left vertical line.
97
+ verticals.each do |vertical_line|
98
+ #1. if it is correctly oriented (up or down) given the outer loop here. (We don't want a false-positive rectangle with one "arm" going down, and one going up.)
99
+ next unless is_upward_oriented(vertical_line, horizontal_line.top) == up_or_down_lines
100
+
101
+ vertical_line_length = vertical_line.length
102
+ longer_line_length = [horizontal_line_length, vertical_line_length].max
103
+ corner_proximity = corner_proximity_threshold * longer_line_length
104
+ #make this the left vertical line:
105
+ #2. if it begins near the left vertex of the horizontal line.
106
+ if euclidean_distance(horizontal_line.left, horizontal_line.top, vertical_line.left, vertical_line.top) < corner_proximity ||
107
+ euclidean_distance(horizontal_line.left, horizontal_line.top, vertical_line.left, vertical_line.bottom) < corner_proximity
108
+ #3. if it is farther to the left of the line we already have.
109
+ if left_vertical_line.nil? || left_vertical_line.left> vertical_line.left #is this line is more to the left than left_vertical_line. #"What's your opinion on Das Kapital?"
110
+ has_vertical_line_from_the_left = true
111
+ left_vertical_line = vertical_line
112
+ end
113
+ end
114
+ end
115
+
116
+ has_vertical_line_from_the_right = false;
117
+ right_vertical_line = nil
118
+ #for the right vertical line.
119
+ verticals.each do |vertical_line|
120
+ next unless is_upward_oriented(vertical_line, horizontal_line.top) == up_or_down_lines
121
+ vertical_line_length = vertical_line.length
122
+ longer_line_length = [horizontal_line_length, vertical_line_length].max
123
+ corner_proximity = corner_proximity_threshold * longer_line_length
124
+ if euclidean_distance(horizontal_line.right, horizontal_line.top, vertical_line.left, vertical_line.top) < corner_proximity ||
125
+ euclidean_distance(horizontal_line.right, horizontal_line.top, vertical_line.left, vertical_line.bottom) < corner_proximity
126
+
127
+ if right_vertical_line.nil? || right_vertical_line.right > vertical_line.right #is this line is more to the right than right_vertical_line. #"Can you recite all of John Galt's speech?"
128
+ #do two passes to guarantee we don't get a horizontal line with a upwards and downwards line coming from each of its corners.
129
+ #i.e. ensuring that both "arms" of the rectangle have the same orientation (up or down).
130
+ has_vertical_line_from_the_right = true
131
+ right_vertical_line = vertical_line
132
+ end
133
+ end
134
+ end
135
+
136
+ if has_vertical_line_from_the_right && has_vertical_line_from_the_left
137
+ #in case we eventually tolerate not-quite-vertical lines, this computers the distance in Y directly, rather than depending on the vertical lines' lengths.
138
+ height = [left_vertical_line.bottom - left_vertical_line.top, right_vertical_line.bottom - right_vertical_line.top].max
139
+
140
+ y = [left_vertical_line.top, right_vertical_line.top].min
141
+ width = horizontal_line.right - horizontal_line.left
142
+ r = Geometry::Rectangle.new_by_x_y_dims(horizontal_line.left, y, width, height ) #x, y, w, h
143
+ #rectangles.put(hashRectangle(r), r); #TODO: I dont' think I need this now that I'm in Rubyland
144
+ rectangles << r
145
+ end
146
+ end
147
+
148
+ #find rectangles with one vertical line and two horizontal lines that end within $threshold to the ends of the vertical line.
149
+ verticals.each do |vertical_line|
150
+ vertical_line_length = vertical_line.length
151
+
152
+ has_horizontal_line_from_the_top = false
153
+ top_horizontal_line = nil
154
+ #for the top horizontal line.
155
+ horizontals.each do |horizontal_line|
156
+ horizontal_line_length = horizontal_line.length
157
+ longer_line_length = [horizontal_line_length, vertical_line_length].max
158
+ corner_proximity = corner_proximity_threshold * longer_line_length
159
+
160
+ if euclidean_distance(vertical_line.left, vertical_line.top, horizontal_line.left, horizontal_line.top) < corner_proximity ||
161
+ euclidean_distance(vertical_line.left, vertical_line.top, horizontal_line.right, horizontal_line.top) < corner_proximity
162
+ if top_horizontal_line.nil? || top_horizontal_line.top > horizontal_line.top #is this line is more to the top than the one we've got already.
163
+ has_horizontal_line_from_the_top = true;
164
+ top_horizontal_line = horizontal_line;
165
+ end
166
+ end
167
+ end
168
+ has_horizontal_line_from_the_bottom = false;
169
+ bottom_horizontal_line = nil
170
+ #for the bottom horizontal line.
171
+ horizontals.each do |horizontal_line|
172
+ horizontal_line_length = horizontal_line.length
173
+ longer_line_length = [horizontal_line_length, vertical_line_length].max
174
+ corner_proximity = corner_proximity_threshold * longer_line_length
175
+
176
+ if euclidean_distance(vertical_line.left, vertical_line.bottom, horizontal_line.left, horizontal_line.top) < corner_proximity ||
177
+ euclidean_distance(vertical_line.left, vertical_line.bottom, horizontal_line.left, horizontal_line.top) < corner_proximity
178
+ if bottom_horizontal_line.nil? || bottom_horizontal_line.bottom > horizontal_line.bottom #is this line is more to the bottom than the one we've got already.
179
+ has_horizontal_line_from_the_bottom = true;
180
+ bottom_horizontal_line = horizontal_line;
181
+ end
182
+ end
183
+ end
184
+
185
+ if has_horizontal_line_from_the_bottom && has_horizontal_line_from_the_top
186
+ x = [top_horizontal_line.left, bottom_horizontal_line.left].min
187
+ y = vertical_line.top
188
+ width = [top_horizontal_line.right - top_horizontal_line.left, bottom_horizontal_line.right - bottom_horizontal_line.right].max
189
+ height = vertical_line.bottom - vertical_line.top
190
+ r = Geometry::Rectangle.new_by_x_y_dims(x, y, width, height); #x, y, w, h
191
+ #rectangles.put(hashRectangle(r), r);
192
+ rectangles << r
193
+ end
194
+ end
195
+ end
196
+ return rectangles.uniq &:similarity_hash
197
+ end
198
+ end
199
+ end
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.5.1'
2
+ VERSION = '0.6.1'
3
3
  end
@@ -6,7 +6,7 @@ require 'tabula/version'
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "tabula-extractor"
8
8
  s.version = Tabula::VERSION
9
- s.authors = ["Manuel Aristarán"]
9
+ s.authors = ["Manuel Aristarán", "Jeremy B. Merill", "Mike Tigas"]
10
10
  s.email = ["manuel@jazzido.com"]
11
11
  s.homepage = "https://github.com/jazzido/tabula-extractor"
12
12
  s.summary = %q{extract tables from PDF files}
@@ -14,7 +14,7 @@ Gem::Specification.new do |s|
14
14
 
15
15
  s.platform = 'java'
16
16
 
17
- shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
17
+ shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll', 'liblsd64.dll'].map { |f| 'ext/' + f }
18
18
  s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
19
19
  s.test_files = `git ls-files -- {test,features}/*`.split("\n")
20
20
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
Binary file
data/test/tests.rb CHANGED
@@ -23,6 +23,8 @@ class TestPagesInfoExtractor < Minitest::Test
23
23
  end
24
24
  end
25
25
 
26
# Placeholder suite for Tabula::TableGuesser; it has no test cases yet.
# Uses the same Minitest::Test base class as the other suites in this file.
class TestTableGuesser < Minitest::Test
end
26
28
 
27
29
  class TestDumper < Minitest::Test
28
30
 
@@ -60,6 +62,77 @@ class TestExtractor < Minitest::Test
60
62
  assert_equal expected, lines_to_array(Tabula.make_table(characters))
61
63
  end
62
64
 
65
+ def test_forest_disclosure_report_dont_regress
66
+ # Regression guard: pins the table currently produced for this PDF area. Ideally the output
67
+ # should be like test_forest_disclosure_report, with spaces around the & in Regional Pulmonary &
68
+ # Sleep and a solution for half-x-height-offset lines (that entry is expected as three rows here).
69
+ pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
70
+ character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
71
+ lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0) # second argument selects the page (presumably a zero-based index)
72
+ vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round } # vertical lines only, one per rounded (left / 10) value
73
+ # Rulings come from line detection on the PDF file; characters come from the first item
74
+ # returned by the extractor, restricted to the area passed to get_text.
75
+ characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
76
+ # the area above is given as [top, left, bottom, right]
77
+ expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
78
+ ['TOTAL', '', '', '','$85.00'],
79
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
80
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
81
+ ['TOTAL', '', '', '', '$471.25'],
82
+ ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
83
+ ['TOTAL', '', '', '','$20.39'],
84
+ ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
85
+ ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
86
+ ['TOTAL', '', '', '', '$5,010.33'],
87
+ ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
88
+ ['TOTAL', '', '', '', '$193.67'],
89
+ ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
90
+
91
+ assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
92
+ end
93
+
94
+ def test_missing_spaces_around_an_ampersand # pins the three rows extracted around "REGIONAL PULMONARY & SLEEP"
95
+ pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
96
+ character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
97
+ lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0) # second argument selects the page (presumably a zero-based index)
98
+ vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round } # vertical lines only, one per rounded (left / 10) value
99
+ # Narrow band (top 170 to bottom 185) inside the [110, 28, 218, 833] area used by the forest
100
+ # disclosure report tests. NOTE(review): confirm which "missing spaces" the test name refers to.
101
+ characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
102
+ # the area above is given as [top, left, bottom, right]
103
+ expected = [
104
+ ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
105
+ ]
106
+
107
+ assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
108
+ end
109
+
110
+ def test_forest_disclosure_report # expects the merged "REGIONAL PULMONARY & SLEEP MEDICINE" row
111
+ pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
112
+ character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
113
+ lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0) # second argument selects the page (presumably a zero-based index)
114
+ vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round } # vertical lines only, one per rounded (left / 10) value
115
+ # Same PDF, page and area as test_forest_disclosure_report_dont_regress, but a different expectation:
116
+ # that test's comment calls this the ideal output, so this test presumably fails for now (TODO confirm).
117
+ characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
118
+ # the area above is given as [top, left, bottom, right]
119
+ expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
120
+ ['TOTAL', '', '', '','$85.00'],
121
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
122
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
123
+ ['TOTAL', '', '', '', '$471.25'],
124
+ ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
125
+ ['TOTAL', '', '', '','$20.39'],
126
+ ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
127
+ ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '$4,700.00'],
128
+ ['TOTAL', '', '', '', '$5,010.33'],
129
+ ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
130
+ ['TOTAL', '', '', '', '$193.67'],
131
+ ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
132
+
133
+ assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
134
+ end
135
+
63
136
  # TODO Spaces inserted in words - fails
64
137
  def test_bo_page24
65
138
  character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
metadata CHANGED
@@ -2,14 +2,16 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.5.1
5
+ version: 0.6.1
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
9
+ - Jeremy B. Merill
10
+ - Mike Tigas
9
11
  autorequire:
10
12
  bindir: bin
11
13
  cert_chain: []
12
- date: 2013-06-14 00:00:00.000000000 Z
14
+ date: 2013-06-18 00:00:00.000000000 Z
13
15
  dependencies:
14
16
  - !ruby/object:Gem::Dependency
15
17
  name: minitest
@@ -93,6 +95,9 @@ files:
93
95
  - ext/liblsd64.dll
94
96
  - ext/lsd.c
95
97
  - ext/lsd.h
98
+ - lib/geom/point.rb
99
+ - lib/geom/rectangle.rb
100
+ - lib/geom/segment.rb
96
101
  - lib/tabula.rb
97
102
  - lib/tabula/core_ext.rb
98
103
  - lib/tabula/entities.rb
@@ -100,6 +105,7 @@ files:
100
105
  - lib/tabula/pdf_dump.rb
101
106
  - lib/tabula/pdf_render.rb
102
107
  - lib/tabula/table_extractor.rb
108
+ - lib/tabula/table_guesser.rb
103
109
  - lib/tabula/version.rb
104
110
  - lib/tabula/whitespace.rb
105
111
  - lib/tabula/writers.rb
@@ -108,6 +114,7 @@ files:
108
114
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
109
115
  - test/data/argentina_diputados_voting_record.pdf
110
116
  - test/data/bo_page24.pdf
117
+ - test/data/frx_2012_disclosure.pdf
111
118
  - test/data/gre.pdf
112
119
  - test/data/tabla_subsidios.pdf
113
120
  - test/tests.rb
@@ -147,6 +154,7 @@ test_files:
147
154
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
148
155
  - test/data/argentina_diputados_voting_record.pdf
149
156
  - test/data/bo_page24.pdf
157
+ - test/data/frx_2012_disclosure.pdf
150
158
  - test/data/gre.pdf
151
159
  - test/data/tabla_subsidios.pdf
152
160
  - test/tests.rb