tabula-extractor 0.5.1-java → 0.6.1-java
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/tabula +10 -6
- data/lib/geom/point.rb +21 -0
- data/lib/geom/rectangle.rb +101 -0
- data/lib/geom/segment.rb +82 -0
- data/lib/tabula.rb +1 -0
- data/lib/tabula/entities.rb +16 -5
- data/lib/tabula/line_segment_detector.rb +1 -1
- data/lib/tabula/pdf_dump.rb +2 -1
- data/lib/tabula/pdf_render.rb +2 -1
- data/lib/tabula/table_extractor.rb +127 -15
- data/lib/tabula/table_guesser.rb +199 -0
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -2
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/tests.rb +73 -0
- metadata +10 -2
data/bin/tabula
CHANGED
@@ -34,6 +34,7 @@ EOS
|
|
34
34
|
|
35
35
|
opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
|
36
36
|
opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
|
37
|
+
opt :guess, 'Guess the portion of the page to analyze per page. Slow.'
|
37
38
|
opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
|
38
39
|
opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
|
39
40
|
end
|
@@ -51,7 +52,6 @@ EOS
|
|
51
52
|
Trollop::die 'file does not exist' unless File.exists? pdf_filename
|
52
53
|
|
53
54
|
return opts, pdf_filename
|
54
|
-
|
55
55
|
end
|
56
56
|
|
57
57
|
def main
|
@@ -60,11 +60,15 @@ def main
|
|
60
60
|
area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
|
61
61
|
out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
|
62
62
|
extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
|
63
|
-
extractor.extract.
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
63
|
+
extractor.extract.each_with_index do |page, page_index|
|
64
|
+
page_areas = opts[:guess] ? Tabula::TableGuesser::find_rects_on_page(Tabula::TableGuesser::load_pdf(filename), page_index) : [area]
|
65
|
+
|
66
|
+
page_areas.each do |page_area|
|
67
|
+
text = page.get_text( page_area )
|
68
|
+
Tabula::Writers.send(opts[:format].to_sym,
|
69
|
+
Tabula.make_table(text),
|
70
|
+
out)
|
71
|
+
end
|
68
72
|
end
|
69
73
|
out.close
|
70
74
|
end
|
data/lib/geom/point.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#
|
2
|
+
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
+
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
+
#
|
5
|
+
|
6
|
+
|
7
|
+
module Geometry
  # A 2-D point with mutable +x+ and +y+ members.
  class Point < Struct.new(:x, :y)
    # Builds a point from the first two entries of +array+ (+[x, y]+).
    def self.new_by_array(array)
      new(array[0], array[1])
    end

    # Coordinate-wise equality.
    #
    # Returns false (instead of raising NoMethodError) when the other object
    # does not expose +x+ and +y+, e.g. when comparing against +nil+.
    def ==(another_point)
      return false unless another_point.respond_to?(:x) && another_point.respond_to?(:y)

      x == another_point.x && y == another_point.y
    end
  end
end

# Shorthand constructor: Point(x, y) is Geometry::Point.new(x, y).
def Point(x, y)
  Geometry::Point.new(x, y)
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
#
|
2
|
+
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
+
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
+
#
|
5
|
+
|
6
|
+
|
7
|
+
module Geometry
  # Axis-aligned rectangle stored as two opposite corner points.
  #
  # The corners may be given in any order: the edge accessors (+x+/+left+,
  # +y+/+top+, +x2+/+right+, +y2+/+bottom+) normalise them with min/max.
  class Rectangle < Struct.new(:point1, :point2)
    # Bucket size used by #similarity_hash.
    SIMILARITY_DIVISOR = 20

    # Folds +next_rect+ into a list of mutually non-overlapping rectangles.
    #
    # Every rectangle in the list that overlaps +next_rect+ is removed and
    # replaced by a single bounding box that also covers +next_rect+; when
    # nothing overlaps, +next_rect+ is simply appended. Returns the resulting
    # list, so it can be used directly as an +inject+ step.
    def Rectangle.unionize(non_overlapping_rectangles, next_rect)
      # An empty accumulator is extended in place (unchanged behaviour).
      return non_overlapping_rectangles << next_rect if non_overlapping_rectangles.empty?

      overlapping, untouched = non_overlapping_rectangles.partition { |r| next_rect.overlaps?(r) }
      untouched << overlapping.inject(next_rect) { |memo, overlap| memo.bounding_box(overlap) }
    end

    # Builds a rectangle from its top-left corner and its dimensions.
    def self.new_by_x_y_dims(x, y, width, height)
      new(Point.new_by_array([x, y]), Point.new_by_array([x + width, y + height]))
    end

    # Smallest x coordinate (left edge).
    def x
      [point1.x, point2.x].min
    end

    alias_method :left, :x

    # Smallest y coordinate (top edge).
    def y
      [point1.y, point2.y].min
    end

    alias_method :top, :y

    # Largest x coordinate (right edge).
    def x2
      [point1.x, point2.x].max
    end

    alias_method :right, :x2

    # Largest y coordinate (bottom edge).
    def y2
      [point1.y, point2.y].max
    end

    alias_method :bottom, :y2

    def width
      (point1.x - point2.x).abs
    end

    def height
      (point1.y - point2.y).abs
    end

    def area
      width * height
    end

    # String key that is identical for rectangles whose x, y, width and height
    # all fall into the same SIMILARITY_DIVISOR-sized integer buckets.
    def similarity_hash
      [x, y, width, height].map { |value| value.to_i / SIMILARITY_DIVISOR }.to_s
    end

    # Returns the values of the named accessors, e.g. dims(:x2, :y2).
    # With no arguments returns [x, y, width, height].
    #
    # (A splat parameter is always an Array and therefore truthy, so the
    # default has to be selected with +empty?+.)
    def dims(*format)
      return [x, y, width, height] if format.empty?

      format.map { |method| send(method) }
    end

    # True when the given coordinates lie inside this rectangle.
    #
    # NOTE(review): the top edge is exclusive (+other_y > y+) while the other
    # three edges are inclusive; kept as-is because #overlaps? depends on it.
    def contains?(other_x, other_y)
      (other_x <= x2 && other_x >= x) && (other_y <= y2 && other_y > y)
    end

    # True when any corner of either rectangle lies inside the other one.
    #
    # NOTE(review): two rectangles that cross like a plus sign have no corner
    # inside each other and are therefore not reported as overlapping.
    def overlaps?(other_rect)
      contains?(other_rect.x, other_rect.y) || contains?(other_rect.x2, other_rect.y2) ||
        contains?(other_rect.x, other_rect.y2) || contains?(other_rect.x2, other_rect.y) ||
        other_rect.contains?(x, y) || other_rect.contains?(x2, y2) ||
        other_rect.contains?(x, y2) || other_rect.contains?(x2, y)
    end

    # Smallest rectangle covering both this rectangle and +other_rect+.
    def bounding_box(other_rect)
      new_x1 = [x, other_rect.x].min
      new_y1 = [y, other_rect.y].min # was computed from x, which misplaced the top edge
      new_x2 = [x2, other_rect.x2].max
      new_y2 = [y2, other_rect.y2].max
      new_width = (new_x2 - new_x1).abs
      new_height = (new_y2 - new_y1).abs
      Rectangle.new_by_x_y_dims(new_x1, new_y1, new_width, new_height)
    end
  end
end
|
data/lib/geom/segment.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
#
|
2
|
+
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
+
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
+
#
|
5
|
+
|
6
|
+
|
7
|
+
module Geometry
  include Math
  extend Math

  # Straight-line distance between two objects that respond to +x+ and +y+.
  def Geometry.distance(point1, point2)
    delta_x = point1.x - point2.x
    delta_y = point1.y - point2.y
    hypot(delta_x, delta_y)
  end

  # A line segment between two endpoints.
  #
  # NOTE(review): here +top+ is the larger y and +bottom+ the smaller y, which
  # is the opposite of Geometry::Rectangle (top = smallest y) -- confirm which
  # orientation callers expect.
  class Segment < Struct.new(:point1, :point2)
    # Builds a segment from two [x, y] coordinate arrays.
    def self.new_by_arrays(point1_coordinates, point2_coordinates)
      first = Point.new_by_array(point1_coordinates)
      second = Point.new_by_array(point2_coordinates)
      new(first, second)
    end

    # Multiplies all four endpoint coordinates by +scale_factor+, in place.
    def scale!(scale_factor)
      point1.x *= scale_factor
      point1.y *= scale_factor
      point2.x *= scale_factor
      point2.y *= scale_factor
    end

    # True when both endpoints share the same x.
    def vertical?
      point1.x == point2.x
    end

    # True when both endpoints share the same y.
    def horizontal?
      point1.y == point2.y
    end

    # Endpoint with the smaller x (point2 on a tie).
    def leftmost_endpoint
      endpoint_ordered(:x, -1)
    end

    # Endpoint with the larger x (point2 on a tie).
    def rightmost_endpoint
      endpoint_ordered(:x, 1)
    end

    # Endpoint with the larger y (point2 on a tie).
    def topmost_endpoint
      endpoint_ordered(:y, 1)
    end

    # Endpoint with the smaller y (point2 on a tie).
    def bottommost_endpoint
      endpoint_ordered(:y, -1)
    end

    def top
      topmost_endpoint.y
    end

    def bottom
      bottommost_endpoint.y
    end

    # Horizontal extent of the segment.
    def width
      (left - right).abs
    end

    # Vertical extent of the segment.
    def height
      (bottom - top).abs
    end

    def left
      leftmost_endpoint.x
    end

    def right
      rightmost_endpoint.x
    end

    # Euclidean length of the segment.
    def length
      Geometry.distance(point1, point2)
    end

    private

    # Returns point1 when comparing the endpoints on +axis+ with <=> yields
    # +ordering+, otherwise point2.
    def endpoint_ordered(axis, ordering)
      if (point1.send(axis) <=> point2.send(axis)) == ordering
        point1
      else
        point2
      end
    end
  end
end

# Shorthand constructor: Segment(p1, p2) is Geometry::Segment.new(p1, p2).
def Segment(point1, point2)
  Geometry::Segment.new(point1, point2)
end
|
data/lib/tabula.rb
CHANGED
@@ -7,5 +7,6 @@ require_relative './tabula/entities'
|
|
7
7
|
require_relative './tabula/pdf_dump'
|
8
8
|
require_relative './tabula/table_extractor'
|
9
9
|
require_relative './tabula/writers'
|
10
|
+
require_relative './tabula/table_guesser'
|
10
11
|
require_relative './tabula/line_segment_detector'
|
11
12
|
require_relative './tabula/pdf_render'
|
data/lib/tabula/entities.rb
CHANGED
@@ -99,10 +99,10 @@ module Tabula
|
|
99
99
|
|
100
100
|
# spaces are not detected, b/c they have height == 0
|
101
101
|
# ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
|
102
|
-
# self.texts.select { |t| t.overlaps? ze }
|
103
|
-
self.texts.select
|
102
|
+
# self.texts.select { |t| t.overlaps? ze }
|
103
|
+
self.texts.select do |t|
|
104
104
|
t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
|
105
|
-
|
105
|
+
end
|
106
106
|
end
|
107
107
|
|
108
108
|
def to_json(options={})
|
@@ -120,7 +120,7 @@ module Tabula
|
|
120
120
|
attr_accessor :font, :font_size, :text, :width_of_space
|
121
121
|
|
122
122
|
CHARACTER_DISTANCE_THRESHOLD = 1.5
|
123
|
-
TOLERANCE_FACTOR = 0.25
|
123
|
+
TOLERANCE_FACTOR = 0.25 #25
|
124
124
|
|
125
125
|
def initialize(top, left, width, height, font, font_size, text, width_of_space)
|
126
126
|
super(top, left, width, height)
|
@@ -149,7 +149,7 @@ module Tabula
|
|
149
149
|
overlaps = self.vertically_overlaps?(other)
|
150
150
|
|
151
151
|
up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
|
152
|
-
down_tolerance = 0.
|
152
|
+
down_tolerance = 0.90 #90?
|
153
153
|
|
154
154
|
dist = self.horizontal_distance(other).abs
|
155
155
|
|
@@ -261,6 +261,10 @@ module Tabula
|
|
261
261
|
r >= 0 and r < 1 and s >= 0 and s < 1
|
262
262
|
end
|
263
263
|
|
264
|
+
def length
|
265
|
+
Math.sqrt( (self.right - self.left).abs ** 2 + (self.bottom - self.top).abs ** 2 )
|
266
|
+
end
|
267
|
+
|
264
268
|
def vertical?
|
265
269
|
left == right
|
266
270
|
end
|
@@ -269,6 +273,13 @@ module Tabula
|
|
269
273
|
top == bottom
|
270
274
|
end
|
271
275
|
|
276
|
+
def right
|
277
|
+
left + width
|
278
|
+
end
|
279
|
+
def bottom
|
280
|
+
top + height
|
281
|
+
end
|
282
|
+
|
272
283
|
def to_json(arg)
|
273
284
|
[left, top, right, bottom].to_json
|
274
285
|
end
|
@@ -46,7 +46,7 @@ module Tabula
|
|
46
46
|
options = DETECT_LINES_DEFAULTS.merge(options)
|
47
47
|
|
48
48
|
pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
|
49
|
-
page = pdf_file.getDocumentCatalog.getAllPages[page_number
|
49
|
+
page = pdf_file.getDocumentCatalog.getAllPages[page_number]
|
50
50
|
bi = Tabula::Render.pageToBufferedImage(page,
|
51
51
|
options[:image_size])
|
52
52
|
pdf_file.close
|
data/lib/tabula/pdf_dump.rb
CHANGED
@@ -77,11 +77,12 @@ module Tabula
|
|
77
77
|
class CharacterExtractor
|
78
78
|
include Observable
|
79
79
|
|
80
|
+
#N.B. pages can be :all, a list of pages or a range.
|
80
81
|
def initialize(pdf_filename, pages=[1])
|
81
82
|
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
82
83
|
@pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
|
83
84
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
84
|
-
@pages = pages
|
85
|
+
@pages = pages == :all ? (1..@all_pages.size) : pages
|
85
86
|
@extractor = TextExtractor.new
|
86
87
|
end
|
87
88
|
|
data/lib/tabula/pdf_render.rb
CHANGED
@@ -20,8 +20,9 @@ module Tabula
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)
|
23
|
+
TRANSPARENT_WHITE = java.awt.Color.new(255, 255, 255, 0)
|
24
24
|
|
25
|
+
# 2048 width is important, if this is too small, thin lines won't be drawn.
|
25
26
|
def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
|
26
27
|
cropbox = page.findCropBox
|
27
28
|
widthPt, heightPt = cropbox.getWidth, cropbox.getHeight
|
@@ -27,21 +27,38 @@ module Tabula
|
|
27
27
|
# (ie, take into account vertical ruling lines if available)
|
28
28
|
# Groups this extractor's text elements into columns and returns the columns.
#
# * Without vertical rulings, an element joins the first existing column it
#   horizontally overlaps, otherwise it starts a new column.
# * With vertical rulings, one (initially empty) column is created for each
#   pair of neighbouring rulings that are more than 10 units apart, and each
#   element is placed in the first column it horizontally overlaps. Elements
#   that fit nowhere are dropped with a warning.
def group_by_columns
  columns = []
  tes = text_elements.sort_by(&:left)

  # Array() treats a missing :vertical_rulings option as "no rulings" rather
  # than raising on nil; sorting a copy leaves the caller's array untouched.
  rulings = Array(options[:vertical_rulings]).sort_by(&:left)

  if rulings.empty?
    tes.each do |te|
      column = columns.detect { |c| te.horizontally_overlaps?(c) }
      if column
        column << te
      else
        columns << Column.new(te.left, te.width, [te])
      end
    end
  else
    rulings.each_cons(2) do |left_ruling_line, right_ruling_line|
      gap = right_ruling_line.left - left_ruling_line.left
      columns << Column.new(left_ruling_line.left, gap, []) if gap > 10
    end

    tes.each do |te|
      column = columns.detect { |c| te.horizontally_overlaps?(c) }
      if column
        column << te
      else
        # Diagnostics go to stderr: stdout may be the stream the table itself
        # is being written to.
        warn "couldn't find a place for #{te.inspect}"
      end
    end
  end

  columns
end
|
42
59
|
|
43
60
|
# Returns one hash per detected column, with the string keys 'left', 'right'
# and 'width'. Columns are computed by a fresh TableExtractor built from this
# object's text elements.
def get_columns
  grouped = TableExtractor.new(text_elements).group_by_columns
  grouped.map do |column|
    {
      'left' => column.left,
      'right' => column.right,
      'width' => column.width
    }
  end
end
|
@@ -87,6 +104,7 @@ module Tabula
|
|
87
104
|
|
88
105
|
private
|
89
106
|
|
107
|
+
#this is where spaces come from!
|
90
108
|
def merge_words!
|
91
109
|
return self.text_elements if @merged # only merge once. awful hack.
|
92
110
|
@merged = true
|
@@ -97,9 +115,12 @@ module Tabula
|
|
97
115
|
|
98
116
|
char2 = self.text_elements[i+1]
|
99
117
|
|
118
|
+
|
119
|
+
|
100
120
|
next if char2.nil? or char1.nil?
|
101
121
|
|
102
122
|
if self.text_elements[current_word_index].should_merge?(char2)
|
123
|
+
#puts "merging: #{self.text_elements[current_word_index].text}/#{self.text_elements[current_word_index].width}"
|
103
124
|
self.text_elements[current_word_index].merge!(char2)
|
104
125
|
char1 = char2
|
105
126
|
self.text_elements[i+1] = nil
|
@@ -107,13 +128,14 @@ module Tabula
|
|
107
128
|
# is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
|
108
129
|
if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
|
109
130
|
self.text_elements[current_word_index].text += " "
|
110
|
-
self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
131
|
+
#self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
111
132
|
end
|
112
133
|
current_word_index = i+1
|
113
134
|
end
|
114
135
|
i += 1
|
115
136
|
end
|
116
|
-
|
137
|
+
self.text_elements.compact!
|
138
|
+
return self.text_elements
|
117
139
|
end
|
118
140
|
end
|
119
141
|
|
@@ -174,7 +196,7 @@ module Tabula
|
|
174
196
|
|
175
197
|
lines.sort_by!(&:top)
|
176
198
|
|
177
|
-
columns =
|
199
|
+
columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words]}).group_by_columns.sort_by(&:left)
|
178
200
|
|
179
201
|
# # insert empty cells if needed
|
180
202
|
lines.each_with_index do |l, line_index|
|
@@ -183,23 +205,21 @@ module Tabula
|
|
183
205
|
l.text_elements.uniq! # TODO WHY do I have to do this?
|
184
206
|
l.text_elements.sort_by!(&:left)
|
185
207
|
|
186
|
-
next unless l.text_elements.size < columns.size
|
208
|
+
#next unless l.text_elements.size < columns.size
|
187
209
|
|
188
210
|
columns.each_with_index do |c, i|
|
189
|
-
if (i > l.text_elements.size - 1) or !l.text_elements
|
211
|
+
if (i > l.text_elements.size - 1) or (!l.text_elements[i].nil? and !c.text_elements.include?(l.text_elements[i]))
|
190
212
|
l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
|
191
213
|
end
|
192
214
|
end
|
193
215
|
end
|
194
216
|
|
195
217
|
# # merge elements that are in the same column
|
196
|
-
columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
|
197
|
-
|
198
218
|
lines.each_with_index do |l, line_index|
|
199
219
|
next if l.text_elements.nil?
|
200
220
|
|
201
221
|
(0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
|
202
|
-
next if l.text_elements[t1].nil? or l.text_elements[t2].nil?
|
222
|
+
next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
|
203
223
|
|
204
224
|
# if same column...
|
205
225
|
if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
|
@@ -233,4 +253,96 @@ module Tabula
|
|
233
253
|
line.text_elements.sort_by(&:left)
|
234
254
|
end
|
235
255
|
end
|
256
|
+
|
257
|
+
|
258
|
+
# Builds a table (an array of rows, each an array of text elements sorted by
# +left+) from +text_elements+, using options[:vertical_rulings] to define the
# columns.
#
# Steps, in order:
# 1. assign every text element to the first detected line boundary it
#    vertically overlaps (elements made only of spaces are dropped);
# 2. compute the columns from the rulings;
# 3. pad each row with an empty TextElement for every column it has no
#    element inside;
# 4. merge non-empty elements of a row that belong to the same column;
# 5. drop a row when it shares an element, position by position, with the
#    row before it.
def Tabula.make_table_with_vertical_rulings(text_elements, options={})
  extractor = TableExtractor.new(text_elements, options)
  boundaries = extractor.get_line_boundaries

  # 1. group by lines (table rows)
  lines = []
  boundaries.each do |boundary|
    row = Line.new
    members = text_elements.select { |te| te.vertically_overlaps?(boundary) }

    # an element belongs to at most one row
    text_elements -= members

    members.sort_by(&:left).each do |te|
      # skip text elements that only contain spaces
      row << te unless te.text =~ ONLY_SPACES_RE
    end

    lines << row if row.text_elements.size > 0
  end

  lines.sort_by!(&:top)

  # 2. columns come from the vertical rulings
  column_source = lines.map(&:text_elements).flatten.compact.uniq
  column_options = {
    :merge_words => options[:merge_words],
    :vertical_rulings => options[:vertical_rulings]
  }
  columns = TableExtractor.new(column_source, column_options).group_by_columns.sort_by(&:left)

  # 3. insert empty cells if needed
  lines.each do |row|
    next if row.text_elements.nil?

    row.text_elements.compact! # TODO WHY do I have to do this?
    row.text_elements.uniq! # TODO WHY do I have to do this?
    row.text_elements.sort_by!(&:left)

    columns.each_with_index do |column, column_index|
      column_right = column.left + column.width
      occupied = row.text_elements.any? do |te|
        te && te.left >= column.left && te.right <= column_right
      end
      next if occupied

      blank = TextElement.new(row.top, column.left, column.width, row.height, nil, 0, '', 0)
      row.text_elements.insert(column_index, blank)
    end
  end

  # 4. merge elements that are in the same column
  lines.each do |row|
    next if row.text_elements.nil?

    (0...row.text_elements.size).to_a.combination(2).each do |first_index, second_index|
      first = row.text_elements[first_index]
      second = row.text_elements[second_index]
      # don't remove a string of empty cells
      next if first.nil? || second.nil? || first.text.empty? || second.text.empty?

      first_column = columns.find { |c| c.text_elements.include?(first) }
      second_column = columns.find { |c| c.text_elements.include?(second) }
      next unless first_column == second_column

      # the element whose bottom edge comes first absorbs the other one
      if first.bottom <= second.bottom
        first.merge!(second)
        row.text_elements[second_index] = nil
      else
        second.merge!(first)
        row.text_elements[first_index] = nil
      end
    end

    row.text_elements.compact!
  end

  # 5. remove duplicate lines
  # TODO this shouldn't have happened here, check why we have to do
  # this (maybe duplication is happening in the column merging phase?)
  0.upto(lines.size - 2) do |i|
    current = lines[i]
    next if current.nil?

    following = lines[i + 1]
    # if any of the elements on the next line is duplicated, kill the next line
    duplicated = (0...current.text_elements.size).any? do |j|
      current.text_elements[j] == following.text_elements[j]
    end
    lines[i + 1] = nil if duplicated
  end

  lines.compact.map { |row| row.text_elements.sort_by(&:left) }
end
|
343
|
+
|
344
|
+
|
345
|
+
|
346
|
+
|
347
|
+
|
236
348
|
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
require 'java'
|
2
|
+
require 'json'
|
3
|
+
require_relative '../geom/point'
|
4
|
+
require_relative '../geom/segment'
|
5
|
+
require_relative '../geom/rectangle'
|
6
|
+
require_relative './pdf_render'
|
7
|
+
#CLASSPATH=:./target/javacpp.jar:./target/javacv.jar:./target/javacv-macosx-x86_64.jar:./target/PDFRenderer-0.9.1.jar
|
8
|
+
|
9
|
+
|
10
|
+
module Tabula
  # Guesses table regions on PDF pages from the ruling lines detected on them.
  #
  # NOTE(review): bin/tabula calls TableGuesser.load_pdf, which this module
  # does not define -- confirm where that entry point is meant to live.
  module TableGuesser
    # Two line ends count as forming a corner when they are closer than this
    # fraction of the longer of the two lines.
    CORNER_PROXIMITY_THRESHOLD = 0.10

    # Writes the rectangles found on every page of +filename+ to
    # "tables.json" inside +output_dir+, as a per-page list of integer
    # [x, y, width, height] arrays.
    def TableGuesser.find_and_write_rects(filename, output_dir)
      File.open(File.join(output_dir, "tables.json"), 'w') do |f|
        per_page = find_rects(filename).map do |page_rects|
          page_rects.map { |r| r.dims(:x, :y, :width, :height).map(&:to_i) }
        end
        f.write(JSON.dump(per_page))
      end
    end

    # Returns, for each page of +filename+ in page order, the rectangles
    # guessed on that page. Prints a message and exits the process when the
    # document has no pages.
    def TableGuesser.find_rects(filename)
      page_count = count_pages(filename)

      if page_count == 0
        puts "not a pdf!"
        exit
      end

      puts "pages: " + page_count.to_s

      # Page indexes are 0-based: the line detector indexes the document's
      # page list directly.
      (0...page_count).map { |page_index| find_rects_on_page(filename, page_index) }
    end

    # Returns, for each page of +filename+ in page order, the lines detected
    # on that page. Prints a message and exits the process when the document
    # has no pages.
    def TableGuesser.find_lines(filename)
      page_count = count_pages(filename)

      if page_count == 0
        puts "not a pdf!"
        exit
      end

      puts "pages: " + page_count.to_s

      (0...page_count).map { |page_index| find_lines_on_page(filename, page_index) }
    end

    # Opens +filename+ as a PDFBox document. The caller must close it.
    def TableGuesser.load_pdfbox_pdf(filename)
      org.apache.pdfbox.pdmodel.PDDocument.loadNonSeq(java.io.File.new(filename), nil)
    end

    # Lines detected on one page. +pdf+ is handed unchanged to
    # Tabula::LSD.detect_lines_in_pdf_page, which opens it as a file path;
    # +page_index+ is 0-based.
    def TableGuesser.find_lines_on_page(pdf, page_index)
      Tabula::LSD.detect_lines_in_pdf_page(pdf, page_index)
    end

    # Rectangles guessed on one page, largest area first.
    def TableGuesser.find_rects_on_page(pdf, page_index)
      find_rects_from_lines(find_lines_on_page(pdf, page_index))
    end

    # Turns detected lines into merged, non-overlapping rectangles sorted by
    # decreasing area. Lines that are neither horizontal nor vertical are
    # ignored.
    def TableGuesser.find_rects_from_lines(lines)
      horizontal_lines = lines.select(&:horizontal?)
      vertical_lines = lines.select(&:vertical?)
      find_tables(vertical_lines, horizontal_lines)
        .inject([]) { |memo, next_rect| Geometry::Rectangle.unionize(memo, next_rect) }
        .sort_by(&:area)
        .reverse
    end

    def TableGuesser.euclidean_distance(x1, y1, x2, y2)
      Math.sqrt(((x1 - x2) ** 2) + ((y1 - y2) ** 2))
    end

    # True if this line is oriented upwards, i.e. if the majority of its
    # length is above +y_value+.
    def TableGuesser.is_upward_oriented(line, y_value)
      (y_value - line.top) > (line.bottom - y_value)
    end

    # Finds all the rectangles suggested by the given vertical and horizontal
    # lines:
    #
    # * a horizontal line with a vertical line ending near each of its ends,
    #   both verticals pointing the same way (both up or both down);
    # * a vertical line with a horizontal line ending near each of its ends.
    #
    # Rectangles are deduplicated with Rectangle#similarity_hash, which treats
    # rectangles as identical when x, y, width and height fall into the same
    # coarse buckets.
    #
    # TODO: generalize this.
    def TableGuesser.find_tables(verticals, horizontals)
      # The vertical-line search does not depend on the orientation pass, so
      # it runs once, between the two passes; after deduplication the result
      # is the same as repeating it in both.
      rectangles = rectangles_from_horizontals(verticals, horizontals, true) +
                   rectangles_from_verticals(verticals, horizontals) +
                   rectangles_from_horizontals(verticals, horizontals, false)
      rectangles.uniq(&:similarity_hash)
    end

    # Page count of +filename+; the document is closed again before returning.
    def TableGuesser.count_pages(filename)
      pdf = load_pdfbox_pdf(filename)
      begin
        pdf.getNumberOfPages
      ensure
        pdf.close
      end
    end

    # Maximum distance at which an end of one line and an end of the other
    # still count as meeting in a corner.
    def TableGuesser.corner_proximity(horizontal_line, vertical_line)
      CORNER_PROXIMITY_THRESHOLD * [horizontal_line.length, vertical_line.length].max
    end

    # True when either end of +vertical_line+ is closer than +reach+ to (x, y).
    def TableGuesser.vertical_end_near?(vertical_line, x, y, reach)
      euclidean_distance(x, y, vertical_line.left, vertical_line.top) < reach ||
        euclidean_distance(x, y, vertical_line.left, vertical_line.bottom) < reach
    end

    # True when either end of +horizontal_line+ is closer than +reach+ to (x, y).
    def TableGuesser.horizontal_end_near?(horizontal_line, x, y, reach)
      euclidean_distance(x, y, horizontal_line.left, horizontal_line.top) < reach ||
        euclidean_distance(x, y, horizontal_line.right, horizontal_line.top) < reach
    end

    # Rectangles made of one horizontal line plus two vertical lines that end
    # near its left and right ends. Only verticals whose orientation relative
    # to the horizontal line equals +upward+ are considered, so both "arms" of
    # a rectangle always point the same way.
    def TableGuesser.rectangles_from_horizontals(verticals, horizontals, upward)
      horizontals.each_with_object([]) do |horizontal_line, found|
        left_vertical_line = nil
        right_vertical_line = nil

        verticals.each do |vertical_line|
          next unless is_upward_oriented(vertical_line, horizontal_line.top) == upward

          reach = corner_proximity(horizontal_line, vertical_line)

          # left arm: keep the candidate that is farthest to the left
          if vertical_end_near?(vertical_line, horizontal_line.left, horizontal_line.top, reach) &&
             (left_vertical_line.nil? || left_vertical_line.left > vertical_line.left)
            left_vertical_line = vertical_line
          end

          # right arm: keep the candidate that is farthest to the right
          if vertical_end_near?(vertical_line, horizontal_line.right, horizontal_line.top, reach) &&
             (right_vertical_line.nil? || right_vertical_line.right < vertical_line.right)
            right_vertical_line = vertical_line
          end
        end

        next unless left_vertical_line && right_vertical_line

        # In case we eventually tolerate not-quite-vertical lines, this
        # computes the distance in y directly rather than depending on the
        # vertical lines' lengths.
        height = [left_vertical_line.bottom - left_vertical_line.top,
                  right_vertical_line.bottom - right_vertical_line.top].max
        y = [left_vertical_line.top, right_vertical_line.top].min
        width = horizontal_line.right - horizontal_line.left
        found << Geometry::Rectangle.new_by_x_y_dims(horizontal_line.left, y, width, height)
      end
    end

    # Rectangles made of one vertical line plus two horizontal lines that end
    # near its top and bottom ends.
    def TableGuesser.rectangles_from_verticals(verticals, horizontals)
      verticals.each_with_object([]) do |vertical_line, found|
        top_horizontal_line = nil
        bottom_horizontal_line = nil

        horizontals.each do |horizontal_line|
          reach = corner_proximity(horizontal_line, vertical_line)

          # top side: keep the candidate that is closest to the top
          if horizontal_end_near?(horizontal_line, vertical_line.left, vertical_line.top, reach) &&
             (top_horizontal_line.nil? || top_horizontal_line.top > horizontal_line.top)
            top_horizontal_line = horizontal_line
          end

          # bottom side: keep the candidate that is closest to the bottom
          if horizontal_end_near?(horizontal_line, vertical_line.left, vertical_line.bottom, reach) &&
             (bottom_horizontal_line.nil? || bottom_horizontal_line.bottom < horizontal_line.bottom)
            bottom_horizontal_line = horizontal_line
          end
        end

        next unless top_horizontal_line && bottom_horizontal_line

        x = [top_horizontal_line.left, bottom_horizontal_line.left].min
        y = vertical_line.top
        width = [top_horizontal_line.right - top_horizontal_line.left,
                 bottom_horizontal_line.right - bottom_horizontal_line.left].max
        height = vertical_line.bottom - vertical_line.top
        found << Geometry::Rectangle.new_by_x_y_dims(x, y, width, height)
      end
    end

    private_class_method :count_pages, :corner_proximity, :vertical_end_near?,
                         :horizontal_end_near?, :rectangles_from_horizontals,
                         :rectangles_from_verticals
  end
end
|
data/lib/tabula/version.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'tabula/version'
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "tabula-extractor"
|
8
8
|
s.version = Tabula::VERSION
|
9
|
-
s.authors = ["Manuel Aristarán"]
|
9
|
+
s.authors = ["Manuel Aristarán", "Jeremy B. Merill", "Mike Tigas"]
|
10
10
|
s.email = ["manuel@jazzido.com"]
|
11
11
|
s.homepage = "https://github.com/jazzido/tabula-extractor"
|
12
12
|
s.summary = %q{extract tables from PDF files}
|
@@ -14,7 +14,7 @@ Gem::Specification.new do |s|
|
|
14
14
|
|
15
15
|
s.platform = 'java'
|
16
16
|
|
17
|
-
shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
|
17
|
+
shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll', 'liblsd64.dll'].map { |f| 'ext/' + f }
|
18
18
|
s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
|
19
19
|
s.test_files = `git ls-files -- {test,features}/*`.split("\n")
|
20
20
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
Binary file
|
data/test/tests.rb
CHANGED
@@ -23,6 +23,8 @@ class TestPagesInfoExtractor < Minitest::Test
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
+
class TestTableGuesser < MiniTest::Unit::TestCase
|
27
|
+
end
|
26
28
|
|
27
29
|
class TestDumper < Minitest::Test
|
28
30
|
|
@@ -60,6 +62,77 @@ class TestExtractor < Minitest::Test
|
|
60
62
|
assert_equal expected, lines_to_array(Tabula.make_table(characters))
|
61
63
|
end
|
62
64
|
|
65
|
+
def test_forest_disclosure_report_dont_regress
|
66
|
+
# this is the current state of the expected output. Ideally the output should be like
|
67
|
+
# test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
|
68
|
+
# and a solution for half-x-height-offset lines.
|
69
|
+
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
70
|
+
character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
|
71
|
+
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
72
|
+
vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
|
73
|
+
|
74
|
+
|
75
|
+
characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
|
76
|
+
#top left bottom right
|
77
|
+
expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
|
78
|
+
['TOTAL', '', '', '','$85.00'],
|
79
|
+
['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
|
80
|
+
['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
|
81
|
+
['TOTAL', '', '', '', '$471.25'],
|
82
|
+
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
|
83
|
+
['TOTAL', '', '', '','$20.39'],
|
84
|
+
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
|
85
|
+
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
86
|
+
['TOTAL', '', '', '', '$5,010.33'],
|
87
|
+
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
|
88
|
+
['TOTAL', '', '', '', '$193.67'],
|
89
|
+
['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
|
90
|
+
|
91
|
+
assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
|
92
|
+
end
|
93
|
+
|
94
|
+
def test_missing_spaces_around_an_ampersand
|
95
|
+
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
96
|
+
character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
|
97
|
+
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
98
|
+
vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
|
99
|
+
|
100
|
+
|
101
|
+
characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
|
102
|
+
#top left bottom right
|
103
|
+
expected = [
|
104
|
+
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
105
|
+
]
|
106
|
+
|
107
|
+
assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
|
108
|
+
end
|
109
|
+
|
110
|
+
def test_forest_disclosure_report
|
111
|
+
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
112
|
+
character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
|
113
|
+
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
114
|
+
vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
|
115
|
+
|
116
|
+
|
117
|
+
characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
|
118
|
+
#top left bottom right
|
119
|
+
expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
|
120
|
+
['TOTAL', '', '', '','$85.00'],
|
121
|
+
['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
|
122
|
+
['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
|
123
|
+
['TOTAL', '', '', '', '$471.25'],
|
124
|
+
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
|
125
|
+
['TOTAL', '', '', '','$20.39'],
|
126
|
+
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
|
127
|
+
['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '$4,700.00'],
|
128
|
+
['TOTAL', '', '', '', '$5,010.33'],
|
129
|
+
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
|
130
|
+
['TOTAL', '', '', '', '$193.67'],
|
131
|
+
['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
|
132
|
+
|
133
|
+
assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
|
134
|
+
end
|
135
|
+
|
63
136
|
# TODO Spaces inserted in words - fails
|
64
137
|
def test_bo_page24
|
65
138
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
|
metadata
CHANGED
@@ -2,14 +2,16 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.5.1
|
5
|
+
version: 0.6.1
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
9
|
+
- Jeremy B. Merill
|
10
|
+
- Mike Tigas
|
9
11
|
autorequire:
|
10
12
|
bindir: bin
|
11
13
|
cert_chain: []
|
12
|
-
date: 2013-06-
|
14
|
+
date: 2013-06-18 00:00:00.000000000 Z
|
13
15
|
dependencies:
|
14
16
|
- !ruby/object:Gem::Dependency
|
15
17
|
name: minitest
|
@@ -93,6 +95,9 @@ files:
|
|
93
95
|
- ext/liblsd64.dll
|
94
96
|
- ext/lsd.c
|
95
97
|
- ext/lsd.h
|
98
|
+
- lib/geom/point.rb
|
99
|
+
- lib/geom/rectangle.rb
|
100
|
+
- lib/geom/segment.rb
|
96
101
|
- lib/tabula.rb
|
97
102
|
- lib/tabula/core_ext.rb
|
98
103
|
- lib/tabula/entities.rb
|
@@ -100,6 +105,7 @@ files:
|
|
100
105
|
- lib/tabula/pdf_dump.rb
|
101
106
|
- lib/tabula/pdf_render.rb
|
102
107
|
- lib/tabula/table_extractor.rb
|
108
|
+
- lib/tabula/table_guesser.rb
|
103
109
|
- lib/tabula/version.rb
|
104
110
|
- lib/tabula/whitespace.rb
|
105
111
|
- lib/tabula/writers.rb
|
@@ -108,6 +114,7 @@ files:
|
|
108
114
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
109
115
|
- test/data/argentina_diputados_voting_record.pdf
|
110
116
|
- test/data/bo_page24.pdf
|
117
|
+
- test/data/frx_2012_disclosure.pdf
|
111
118
|
- test/data/gre.pdf
|
112
119
|
- test/data/tabla_subsidios.pdf
|
113
120
|
- test/tests.rb
|
@@ -147,6 +154,7 @@ test_files:
|
|
147
154
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
148
155
|
- test/data/argentina_diputados_voting_record.pdf
|
149
156
|
- test/data/bo_page24.pdf
|
157
|
+
- test/data/frx_2012_disclosure.pdf
|
150
158
|
- test/data/gre.pdf
|
151
159
|
- test/data/tabla_subsidios.pdf
|
152
160
|
- test/tests.rb
|