tabula-extractor 0.5.1-java → 0.6.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/tabula +10 -6
- data/lib/geom/point.rb +21 -0
- data/lib/geom/rectangle.rb +101 -0
- data/lib/geom/segment.rb +82 -0
- data/lib/tabula.rb +1 -0
- data/lib/tabula/entities.rb +16 -5
- data/lib/tabula/line_segment_detector.rb +1 -1
- data/lib/tabula/pdf_dump.rb +2 -1
- data/lib/tabula/pdf_render.rb +2 -1
- data/lib/tabula/table_extractor.rb +127 -15
- data/lib/tabula/table_guesser.rb +199 -0
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -2
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/tests.rb +73 -0
- metadata +10 -2
data/bin/tabula
CHANGED
@@ -34,6 +34,7 @@ EOS
|
|
34
34
|
|
35
35
|
opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
|
36
36
|
opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
|
37
|
+
opt :guess, 'Guess the portion of the page to analyze per page. Slow.'
|
37
38
|
opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
|
38
39
|
opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
|
39
40
|
end
|
@@ -51,7 +52,6 @@ EOS
|
|
51
52
|
Trollop::die 'file does not exist' unless File.exists? pdf_filename
|
52
53
|
|
53
54
|
return opts, pdf_filename
|
54
|
-
|
55
55
|
end
|
56
56
|
|
57
57
|
def main
|
@@ -60,11 +60,15 @@ def main
|
|
60
60
|
area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
|
61
61
|
out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
|
62
62
|
extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
|
63
|
-
extractor.extract.
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
63
|
+
extractor.extract.each_with_index do |page, page_index|
|
64
|
+
page_areas = opts[:guess] ? Tabula::TableGuesser::find_rects_on_page(Tabula::TableGuesser::load_pdf(filename), page_index) : [area]
|
65
|
+
|
66
|
+
page_areas.each do |page_area|
|
67
|
+
text = page.get_text( page_area )
|
68
|
+
Tabula::Writers.send(opts[:format].to_sym,
|
69
|
+
Tabula.make_table(text),
|
70
|
+
out)
|
71
|
+
end
|
68
72
|
end
|
69
73
|
out.close
|
70
74
|
end
|
data/lib/geom/point.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#
|
2
|
+
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
+
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
+
#
|
5
|
+
|
6
|
+
|
7
|
+
module Geometry
  # A two-dimensional point with mutable +x+ and +y+ members.
  class Point < Struct.new(:x, :y)
    # Builds a point from the first two entries of +array+ (+[x, y]+).
    #
    # @param array [Array] coordinates; only indexes 0 and 1 are read
    # @return [Point]
    def self.new_by_array(array)
      new(array[0], array[1])
    end

    # Coordinate-wise equality.
    #
    # Anything that does not expose both +x+ and +y+ (for example +nil+)
    # is simply "not equal" instead of raising NoMethodError.
    #
    # @param another_point [#x, #y, Object]
    # @return [Boolean]
    def ==(another_point)
      return false unless another_point.respond_to?(:x) && another_point.respond_to?(:y)

      x == another_point.x && y == another_point.y
    end
  end
end

# Convenience constructor so callers can write +Point(x, y)+.
#
# @return [Geometry::Point]
def Point(x, y)
  Geometry::Point.new(x, y)
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
#
|
2
|
+
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
+
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
+
#
|
5
|
+
|
6
|
+
|
7
|
+
module Geometry
  # Axis-aligned rectangle described by two opposite corner points.
  #
  # +x+/+y+ (aliases +left+/+top+) are the smaller coordinates of the two
  # corners and +x2+/+y2+ (aliases +right+/+bottom+) the larger ones, so the
  # corners may be given in any order.
  class Rectangle < Struct.new(:point1, :point2)
    # Rectangles whose x, y, width and height fall into the same buckets of
    # this size share a +similarity_hash+.
    SIMILARITY_DIVISOR = 20

    # Folds +next_rect+ into a list of rectangles: every rectangle in the list
    # that +next_rect+ overlaps is removed and replaced by one bounding box
    # that also covers +next_rect+. With no overlaps +next_rect+ is appended.
    #
    # The input array is never mutated; a new array is always returned.
    # Intended for use as an +inject+ step.
    #
    # @param non_overlapping_rectangles [Array<Rectangle>]
    # @param next_rect [Rectangle]
    # @return [Array<Rectangle>]
    def Rectangle.unionize(non_overlapping_rectangles, next_rect)
      overlapping = non_overlapping_rectangles.select { |r| next_rect.overlaps?(r) }
      merged = overlapping.inject(next_rect) { |memo, overlap| memo.bounding_box(overlap) }
      (non_overlapping_rectangles - overlapping) << merged
    end

    # Builds a rectangle from an origin corner and its dimensions.
    #
    # @return [Rectangle]
    def self.new_by_x_y_dims(x, y, width, height)
      new(Point.new_by_array([x, y]), Point.new_by_array([x + width, y + height]))
    end

    # Smaller x coordinate of the two corners.
    def x
      [point1.x, point2.x].min
    end

    alias_method :left, :x

    # Smaller y coordinate of the two corners.
    def y
      [point1.y, point2.y].min
    end

    alias_method :top, :y

    # Larger x coordinate of the two corners.
    def x2
      [point1.x, point2.x].max
    end

    alias_method :right, :x2

    # Larger y coordinate of the two corners.
    def y2
      [point1.y, point2.y].max
    end

    alias_method :bottom, :y2

    def width
      (point1.x - point2.x).abs
    end

    def height
      (point1.y - point2.y).abs
    end

    def area
      width * height
    end

    # Coarse fingerprint used to de-duplicate near-identical rectangles:
    # x, y, width and height are truncated to integers and bucketed by
    # SIMILARITY_DIVISOR.
    #
    # @return [String]
    def similarity_hash
      [x, y, width, height].map { |value| value.to_i / SIMILARITY_DIVISOR }.to_s
    end

    # Returns selected measurements as an array.
    #
    # @param format [Array<Symbol>] accessor names to read, in order
    #   (e.g. +:x2, :y2+). When omitted, +[x, y, width, height]+ is returned.
    # @return [Array]
    def dims(*format)
      # A splat is never nil, so the default must be chosen on emptiness.
      return [x, y, width, height] if format.empty?

      format.map { |method| send(method) }
    end

    # Whether the given coordinate lies inside this rectangle.
    #
    # NOTE(review): the left, right and bottom edges are inclusive but the top
    # edge is exclusive (+other_y > y+). Left as-is because +overlaps?+ depends
    # on it -- confirm whether the asymmetry is intended.
    def contains?(other_x, other_y)
      (other_x <= x2 && other_x >= x) && (other_y <= y2 && other_y > y)
    end

    # Whether any corner of one rectangle lies inside the other.
    #
    # NOTE(review): two rectangles crossing in a "+" shape contain none of
    # each other's corners and are therefore reported as not overlapping.
    def overlaps?(other_rect)
      contains?(other_rect.x, other_rect.y) || contains?(other_rect.x2, other_rect.y2) ||
        contains?(other_rect.x, other_rect.y2) || contains?(other_rect.x2, other_rect.y) ||
        other_rect.contains?(x, y) || other_rect.contains?(x2, y2) ||
        other_rect.contains?(x, y2) || other_rect.contains?(x2, y)
    end

    # Smallest rectangle that covers both this rectangle and +other_rect+.
    #
    # @param other_rect [Rectangle]
    # @return [Rectangle]
    def bounding_box(other_rect)
      new_x1 = [x, other_rect.x].min
      new_y1 = [y, other_rect.y].min # top edge comes from y, not x
      new_x2 = [x2, other_rect.x2].max
      new_y2 = [y2, other_rect.y2].max
      Rectangle.new_by_x_y_dims(new_x1, new_y1, new_x2 - new_x1, new_y2 - new_y1)
    end
  end
end
|
data/lib/geom/segment.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
#
|
2
|
+
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
+
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
+
#
|
5
|
+
|
6
|
+
|
7
|
+
module Geometry
  include Math
  extend Math

  # Euclidean distance between two objects exposing +x+ and +y+.
  #
  # @return [Float]
  def Geometry.distance(point1, point2)
    Math.hypot(point1.x - point2.x, point1.y - point2.y)
  end

  # A line segment between two points.
  #
  # NOTE(review): +top+ is the LARGER y value and +bottom+ the smaller one
  # (y grows upwards), the opposite of Geometry::Rectangle#top/#bottom.
  # Kept as-is; confirm which convention callers expect.
  class Segment < Struct.new(:point1, :point2)
    # Builds a segment from two +[x, y]+ coordinate arrays.
    #
    # @return [Segment]
    def self.new_by_arrays(point1_coordinates, point2_coordinates)
      new(Point.new_by_array(point1_coordinates),
          Point.new_by_array(point2_coordinates))
    end

    # Multiplies every coordinate of both endpoints by +scale_factor+,
    # mutating the endpoint objects in place.
    def scale!(scale_factor)
      point1.x *= scale_factor
      point1.y *= scale_factor
      point2.x *= scale_factor
      point2.y *= scale_factor
    end

    # True when both endpoints share the same x coordinate.
    def vertical?
      point1.x == point2.x
    end

    # True when both endpoints share the same y coordinate.
    def horizontal?
      point1.y == point2.y
    end

    # Endpoint with the smaller x; +point2+ on a tie.
    def leftmost_endpoint
      point1.x < point2.x ? point1 : point2
    end

    # Endpoint with the larger x; +point2+ on a tie.
    def rightmost_endpoint
      point1.x > point2.x ? point1 : point2
    end

    # Endpoint with the larger y; +point2+ on a tie.
    def topmost_endpoint
      point1.y > point2.y ? point1 : point2
    end

    # Endpoint with the smaller y; +point2+ on a tie.
    def bottommost_endpoint
      point1.y < point2.y ? point1 : point2
    end

    def top
      topmost_endpoint.y
    end

    def bottom
      bottommost_endpoint.y
    end

    # Horizontal extent of the segment.
    def width
      (left - right).abs
    end

    # Vertical extent of the segment.
    def height
      (bottom - top).abs
    end

    def left
      leftmost_endpoint.x
    end

    def right
      rightmost_endpoint.x
    end

    # Euclidean length of the segment.
    def length
      Geometry.distance(point1, point2)
    end
  end
end

# Convenience constructor so callers can write +Segment(p1, p2)+.
#
# @return [Geometry::Segment]
def Segment(point1, point2)
  Geometry::Segment.new(point1, point2)
end
|
data/lib/tabula.rb
CHANGED
@@ -7,5 +7,6 @@ require_relative './tabula/entities'
|
|
7
7
|
require_relative './tabula/pdf_dump'
|
8
8
|
require_relative './tabula/table_extractor'
|
9
9
|
require_relative './tabula/writers'
|
10
|
+
require_relative './tabula/table_guesser'
|
10
11
|
require_relative './tabula/line_segment_detector'
|
11
12
|
require_relative './tabula/pdf_render'
|
data/lib/tabula/entities.rb
CHANGED
@@ -99,10 +99,10 @@ module Tabula
|
|
99
99
|
|
100
100
|
# spaces are not detected, b/c they have height == 0
|
101
101
|
# ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
|
102
|
-
# self.texts.select { |t| t.overlaps? ze }
|
103
|
-
self.texts.select
|
102
|
+
# self.texts.select { |t| t.overlaps? ze }
|
103
|
+
self.texts.select do |t|
|
104
104
|
t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
|
105
|
-
|
105
|
+
end
|
106
106
|
end
|
107
107
|
|
108
108
|
def to_json(options={})
|
@@ -120,7 +120,7 @@ module Tabula
|
|
120
120
|
attr_accessor :font, :font_size, :text, :width_of_space
|
121
121
|
|
122
122
|
CHARACTER_DISTANCE_THRESHOLD = 1.5
|
123
|
-
TOLERANCE_FACTOR = 0.25
|
123
|
+
TOLERANCE_FACTOR = 0.25 #25
|
124
124
|
|
125
125
|
def initialize(top, left, width, height, font, font_size, text, width_of_space)
|
126
126
|
super(top, left, width, height)
|
@@ -149,7 +149,7 @@ module Tabula
|
|
149
149
|
overlaps = self.vertically_overlaps?(other)
|
150
150
|
|
151
151
|
up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
|
152
|
-
down_tolerance = 0.
|
152
|
+
down_tolerance = 0.90 #90?
|
153
153
|
|
154
154
|
dist = self.horizontal_distance(other).abs
|
155
155
|
|
@@ -261,6 +261,10 @@ module Tabula
|
|
261
261
|
r >= 0 and r < 1 and s >= 0 and s < 1
|
262
262
|
end
|
263
263
|
|
264
|
+
def length
|
265
|
+
Math.sqrt( (self.right - self.left).abs ** 2 + (self.bottom - self.top).abs ** 2 )
|
266
|
+
end
|
267
|
+
|
264
268
|
def vertical?
|
265
269
|
left == right
|
266
270
|
end
|
@@ -269,6 +273,13 @@ module Tabula
|
|
269
273
|
top == bottom
|
270
274
|
end
|
271
275
|
|
276
|
+
def right
|
277
|
+
left + width
|
278
|
+
end
|
279
|
+
def bottom
|
280
|
+
top + height
|
281
|
+
end
|
282
|
+
|
272
283
|
def to_json(arg)
|
273
284
|
[left, top, right, bottom].to_json
|
274
285
|
end
|
@@ -46,7 +46,7 @@ module Tabula
|
|
46
46
|
options = DETECT_LINES_DEFAULTS.merge(options)
|
47
47
|
|
48
48
|
pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
|
49
|
-
page = pdf_file.getDocumentCatalog.getAllPages[page_number
|
49
|
+
page = pdf_file.getDocumentCatalog.getAllPages[page_number]
|
50
50
|
bi = Tabula::Render.pageToBufferedImage(page,
|
51
51
|
options[:image_size])
|
52
52
|
pdf_file.close
|
data/lib/tabula/pdf_dump.rb
CHANGED
@@ -77,11 +77,12 @@ module Tabula
|
|
77
77
|
class CharacterExtractor
|
78
78
|
include Observable
|
79
79
|
|
80
|
+
#N.B. pages can be :all, a list of pages or a range.
|
80
81
|
def initialize(pdf_filename, pages=[1])
|
81
82
|
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
82
83
|
@pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
|
83
84
|
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
84
|
-
@pages = pages
|
85
|
+
@pages = pages == :all ? (1..@all_pages.size) : pages
|
85
86
|
@extractor = TextExtractor.new
|
86
87
|
end
|
87
88
|
|
data/lib/tabula/pdf_render.rb
CHANGED
@@ -20,8 +20,9 @@ module Tabula
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)
|
23
|
+
TRANSPARENT_WHITE = java.awt.Color.new(255, 255, 255, 0)
|
24
24
|
|
25
|
+
# 2048 width is important, if this is too small, thin lines won't be drawn.
|
25
26
|
def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
|
26
27
|
cropbox = page.findCropBox
|
27
28
|
widthPt, heightPt = cropbox.getWidth, cropbox.getHeight
|
@@ -27,21 +27,38 @@ module Tabula
|
|
27
27
|
# (ie, take into account vertical ruling lines if available)
|
28
28
|
def group_by_columns
|
29
29
|
columns = []
|
30
|
-
tes = self.text_elements.sort_by
|
30
|
+
tes = self.text_elements.sort_by &:left
|
31
31
|
|
32
32
|
# we don't have vertical rulings
|
33
|
-
|
34
|
-
|
35
|
-
column
|
36
|
-
|
37
|
-
|
33
|
+
if self.options[:vertical_rulings].empty?
|
34
|
+
tes.each do |te|
|
35
|
+
if column = columns.detect { |c| te.horizontally_overlaps?(c) }
|
36
|
+
column << te
|
37
|
+
else
|
38
|
+
columns << Column.new(te.left, te.width, [te])
|
39
|
+
end
|
40
|
+
end
|
41
|
+
else
|
42
|
+
self.options[:vertical_rulings].sort_by! &:left
|
43
|
+
1.upto(self.options[:vertical_rulings].size - 1) do |i|
|
44
|
+
left_ruling_line = self.options[:vertical_rulings][i - 1]
|
45
|
+
right_ruling_line = self.options[:vertical_rulings][i]
|
46
|
+
columns << Column.new(left_ruling_line.left, right_ruling_line.left - left_ruling_line.left, []) if (right_ruling_line.left - left_ruling_line.left > 10)
|
47
|
+
end
|
48
|
+
tes.each do |te|
|
49
|
+
if column = columns.detect { |c| te.horizontally_overlaps?(c) }
|
50
|
+
column << te
|
51
|
+
else
|
52
|
+
puts "couldn't find a place for #{te.inspect}"
|
53
|
+
#columns << Column.new(te.left, te.width, [te])
|
54
|
+
end
|
38
55
|
end
|
39
56
|
end
|
40
57
|
columns
|
41
58
|
end
|
42
59
|
|
43
60
|
def get_columns
|
44
|
-
|
61
|
+
TableExtractor.new(text_elements).group_by_columns.map do |c|
|
45
62
|
{'left' => c.left, 'right' => c.right, 'width' => c.width}
|
46
63
|
end
|
47
64
|
end
|
@@ -87,6 +104,7 @@ module Tabula
|
|
87
104
|
|
88
105
|
private
|
89
106
|
|
107
|
+
#this is where spaces come from!
|
90
108
|
def merge_words!
|
91
109
|
return self.text_elements if @merged # only merge once. awful hack.
|
92
110
|
@merged = true
|
@@ -97,9 +115,12 @@ module Tabula
|
|
97
115
|
|
98
116
|
char2 = self.text_elements[i+1]
|
99
117
|
|
118
|
+
|
119
|
+
|
100
120
|
next if char2.nil? or char1.nil?
|
101
121
|
|
102
122
|
if self.text_elements[current_word_index].should_merge?(char2)
|
123
|
+
#puts "merging: #{self.text_elements[current_word_index].text}/#{self.text_elements[current_word_index].width}"
|
103
124
|
self.text_elements[current_word_index].merge!(char2)
|
104
125
|
char1 = char2
|
105
126
|
self.text_elements[i+1] = nil
|
@@ -107,13 +128,14 @@ module Tabula
|
|
107
128
|
# is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
|
108
129
|
if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
|
109
130
|
self.text_elements[current_word_index].text += " "
|
110
|
-
self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
131
|
+
#self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
|
111
132
|
end
|
112
133
|
current_word_index = i+1
|
113
134
|
end
|
114
135
|
i += 1
|
115
136
|
end
|
116
|
-
|
137
|
+
self.text_elements.compact!
|
138
|
+
return self.text_elements
|
117
139
|
end
|
118
140
|
end
|
119
141
|
|
@@ -174,7 +196,7 @@ module Tabula
|
|
174
196
|
|
175
197
|
lines.sort_by!(&:top)
|
176
198
|
|
177
|
-
columns =
|
199
|
+
columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words]}).group_by_columns.sort_by(&:left)
|
178
200
|
|
179
201
|
# # insert empty cells if needed
|
180
202
|
lines.each_with_index do |l, line_index|
|
@@ -183,23 +205,21 @@ module Tabula
|
|
183
205
|
l.text_elements.uniq! # TODO WHY do I have to do this?
|
184
206
|
l.text_elements.sort_by!(&:left)
|
185
207
|
|
186
|
-
next unless l.text_elements.size < columns.size
|
208
|
+
#next unless l.text_elements.size < columns.size
|
187
209
|
|
188
210
|
columns.each_with_index do |c, i|
|
189
|
-
if (i > l.text_elements.size - 1) or !l.text_elements
|
211
|
+
if (i > l.text_elements.size - 1) or (!l.text_elements[i].nil? and !c.text_elements.include?(l.text_elements[i]))
|
190
212
|
l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
|
191
213
|
end
|
192
214
|
end
|
193
215
|
end
|
194
216
|
|
195
217
|
# # merge elements that are in the same column
|
196
|
-
columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
|
197
|
-
|
198
218
|
lines.each_with_index do |l, line_index|
|
199
219
|
next if l.text_elements.nil?
|
200
220
|
|
201
221
|
(0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
|
202
|
-
next if l.text_elements[t1].nil? or l.text_elements[t2].nil?
|
222
|
+
next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
|
203
223
|
|
204
224
|
# if same column...
|
205
225
|
if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
|
@@ -233,4 +253,96 @@ module Tabula
|
|
233
253
|
line.text_elements.sort_by(&:left)
|
234
254
|
end
|
235
255
|
end
|
256
|
+
|
257
|
+
|
258
|
+
def Tabula.make_table_with_vertical_rulings(text_elements, options={})
|
259
|
+
extractor = TableExtractor.new(text_elements, options)
|
260
|
+
|
261
|
+
# group by lines
|
262
|
+
lines = []
|
263
|
+
line_boundaries = extractor.get_line_boundaries
|
264
|
+
|
265
|
+
# find all the text elements
|
266
|
+
# contained within each detected line (table row) boundary
|
267
|
+
line_boundaries.each do |lb|
|
268
|
+
line = Line.new
|
269
|
+
|
270
|
+
line_members = text_elements.find_all do |te|
|
271
|
+
te.vertically_overlaps?(lb)
|
272
|
+
end
|
273
|
+
|
274
|
+
text_elements -= line_members
|
275
|
+
|
276
|
+
line_members.sort_by(&:left).each do |te|
|
277
|
+
# skip text_elements that only contain spaces
|
278
|
+
next if te.text =~ ONLY_SPACES_RE
|
279
|
+
line << te
|
280
|
+
end
|
281
|
+
|
282
|
+
lines << line if line.text_elements.size > 0
|
283
|
+
end
|
284
|
+
|
285
|
+
lines.sort_by!(&:top)
|
286
|
+
|
287
|
+
vertical_rulings = options[:vertical_rulings]
|
288
|
+
columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words], :vertical_rulings => vertical_rulings}).group_by_columns.sort_by(&:left)
|
289
|
+
|
290
|
+
# insert empty cells if needed
|
291
|
+
lines.each_with_index do |l, line_index|
|
292
|
+
next if l.text_elements.nil?
|
293
|
+
l.text_elements.compact! # TODO WHY do I have to do this?
|
294
|
+
l.text_elements.uniq! # TODO WHY do I have to do this?
|
295
|
+
l.text_elements.sort_by!(&:left)
|
296
|
+
|
297
|
+
columns.each_with_index do |c, i|
|
298
|
+
if (l.text_elements.select{|te| te && te.left >= c.left && te.right <= (c.left + c.width)}.empty?)
|
299
|
+
l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
|
300
|
+
end
|
301
|
+
end
|
302
|
+
end
|
303
|
+
|
304
|
+
# merge elements that are in the same column
|
305
|
+
lines.each_with_index do |l, line_index|
|
306
|
+
next if l.text_elements.nil?
|
307
|
+
|
308
|
+
(0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2| #don't remove a string of empty cells
|
309
|
+
next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
|
310
|
+
|
311
|
+
# if same column...
|
312
|
+
if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
|
313
|
+
== columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
|
314
|
+
if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
|
315
|
+
l.text_elements[t1].merge!(l.text_elements[t2])
|
316
|
+
l.text_elements[t2] = nil
|
317
|
+
else
|
318
|
+
l.text_elements[t2].merge!(l.text_elements[t1])
|
319
|
+
l.text_elements[t1] = nil
|
320
|
+
end
|
321
|
+
end
|
322
|
+
end
|
323
|
+
|
324
|
+
l.text_elements.compact!
|
325
|
+
end
|
326
|
+
|
327
|
+
# remove duplicate lines
|
328
|
+
# TODO this shouldn't have happened here, check why we have to do
|
329
|
+
# this (maybe duplication is happening in the column merging phase?)
|
330
|
+
(0..lines.size - 2).each do |i|
|
331
|
+
next if lines[i].nil?
|
332
|
+
# if any of the elements on the next line is duplicated, kill
|
333
|
+
# the next line
|
334
|
+
if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
|
335
|
+
lines[i+1] = nil
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
339
|
+
lines.compact.map do |line|
|
340
|
+
line.text_elements.sort_by(&:left)
|
341
|
+
end
|
342
|
+
end
|
343
|
+
|
344
|
+
|
345
|
+
|
346
|
+
|
347
|
+
|
236
348
|
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
require 'java'
|
2
|
+
require 'json'
|
3
|
+
require_relative '../geom/point'
|
4
|
+
require_relative '../geom/segment'
|
5
|
+
require_relative '../geom/rectangle'
|
6
|
+
require_relative './pdf_render'
|
7
|
+
#CLASSPATH=:./target/javacpp.jar:./target/javacv.jar:./target/javacv-macosx-x86_64.jar:./target/PDFRenderer-0.9.1.jar
|
8
|
+
|
9
|
+
|
10
|
+
module Tabula
  # Heuristics for guessing where the tables are on a PDF page, based on the
  # ruling lines detected on that page.
  module TableGuesser
    # Two line ends count as "meeting at a corner" when they are closer than
    # this fraction of the longer of the two lines.
    CORNER_PROXIMITY_THRESHOLD = 0.10

    # Writes the guessed rectangles of every page to
    # <output_dir>/tables.json as arrays of integer [x, y, width, height].
    def TableGuesser.find_and_write_rects(filename, output_dir)
      File.open(File.join(output_dir, "tables.json"), 'w') do |f|
        # The accessors are named explicitly so the output does not depend on
        # what Rectangle#dims returns when called without arguments.
        f.write(JSON.dump(find_rects(filename).map { |page|
          page.map { |r| r.dims(:x, :y, :width, :height).map(&:to_i) }
        }))
      end
    end

    # Guesses the table rectangles for every page of the PDF.
    #
    # NOTE(review): +load_pdfbox_pdf+ is not defined in this module as far as
    # this file shows -- confirm where it comes from.
    # NOTE(review): pages are requested with 1-based numbers and the loaded
    # document object is passed on, whereas +find_lines_on_page+ hands its
    # arguments straight to Tabula::LSD -- verify which index base and
    # argument type that detector expects.
    #
    # @return [Array<Array<Geometry::Rectangle>>] one list per page
    def TableGuesser.find_rects(filename)
      pdf = load_pdfbox_pdf(filename)
      page_count = pdf.getNumberOfPages

      if page_count == 0
        puts "not a pdf!"
        exit
      end

      puts "pages: " + page_count.to_s

      # gotcha: with PDFView, PDF pages are 1-indexed. If you ask for page 0
      # and then page 1, you'll get the first page twice. So start with 1.
      (1..page_count).map { |page_number| find_rects_on_page(pdf, page_number) }
    end

    # Detects the ruling lines of every page of the PDF.
    #
    # NOTE(review): uses +load_pdfbox_pdf+ to count pages, mirroring
    # +find_rects+; that helper is not defined in this module as shown.
    #
    # @return [Array] one list of detected lines per page
    def TableGuesser.find_lines(filename)
      pdf = load_pdfbox_pdf(filename)
      page_count = pdf.getNumberOfPages

      if page_count == 0
        puts "not a pdf!"
        exit
      end

      puts "pages: " + page_count.to_s

      (0...page_count).map { |i| find_lines_on_page(filename, i) }
    end

    # Thin wrapper around Tabula::LSD.detect_lines_in_pdf_page; both
    # arguments are passed through unchanged.
    def TableGuesser.find_lines_on_page(pdf, page_index)
      Tabula::LSD.detect_lines_in_pdf_page(pdf, page_index)
    end

    # Guesses the table rectangles on a single page.
    def TableGuesser.find_rects_on_page(pdf, page_index)
      find_rects_from_lines(find_lines_on_page(pdf, page_index))
    end

    # Turns detected lines into merged candidate rectangles, largest first.
    #
    # @param lines [Array] objects responding to +horizontal?+ / +vertical?+
    #   plus the accessors used by +find_tables+
    # @return [Array<Geometry::Rectangle>]
    def TableGuesser.find_rects_from_lines(lines)
      horizontal_lines = lines.select(&:horizontal?)
      vertical_lines = lines.select(&:vertical?)
      find_tables(vertical_lines, horizontal_lines)
        .inject([]) { |memo, next_rect| Geometry::Rectangle.unionize(memo, next_rect) }
        .sort_by(&:area)
        .reverse
    end

    def TableGuesser.euclidean_distance(x1, y1, x2, y2)
      Math.sqrt(((x1 - x2)**2) + ((y1 - y2)**2))
    end

    # True if the majority of the line's length is above +y_value+
    # (assumes +top+ <= +bottom+, i.e. y grows downwards).
    def TableGuesser.is_upward_oriented(line, y_value)
      (y_value - line.top) > (line.bottom - y_value)
    end

    # Finds all the rectangles implied by the given vertical and horizontal
    # lines.
    #
    # Two shapes are recognised:
    # * a horizontal line with a vertical line ending near each of its ends,
    #   both pointing the same way (both up or both down);
    # * a vertical line with a horizontal line ending near its top and another
    #   near its bottom.
    #
    # Near-duplicates are removed via Rectangle#similarity_hash.
    #
    # @param verticals [Array] lines responding to left/right/top/bottom/length
    # @param horizontals [Array] lines responding to left/right/top/bottom/length
    # @return [Array<Geometry::Rectangle>]
    def TableGuesser.find_tables(verticals, horizontals)
      # Two passes (arms up, then arms down) guarantee we never accept a
      # horizontal line with one arm going up and the other going down.
      upward = rects_from_horizontals(verticals, horizontals, true)
      downward = rects_from_horizontals(verticals, horizontals, false)
      # The vertical-anchored search does not depend on arm orientation, so it
      # runs once; its position keeps the first-seen order used by uniq.
      sideways = rects_from_verticals(verticals, horizontals)

      (upward + sideways + downward).uniq(&:similarity_hash)
    end

    # Distance below which two line ends are considered to share a corner.
    def TableGuesser.corner_proximity(length_a, length_b)
      CORNER_PROXIMITY_THRESHOLD * [length_a, length_b].max
    end

    # True if either end of +vertical_line+ is within +limit+ of (x, y).
    def TableGuesser.vertical_end_near?(vertical_line, x, y, limit)
      euclidean_distance(x, y, vertical_line.left, vertical_line.top) < limit ||
        euclidean_distance(x, y, vertical_line.left, vertical_line.bottom) < limit
    end

    # True if either end of +horizontal_line+ is within +limit+ of (x, y).
    def TableGuesser.horizontal_end_near?(horizontal_line, x, y, limit)
      euclidean_distance(x, y, horizontal_line.left, horizontal_line.top) < limit ||
        euclidean_distance(x, y, horizontal_line.right, horizontal_line.top) < limit
    end

    # Rectangles made of one horizontal line plus a vertical "arm" at each
    # end, where both arms point in the requested direction.
    def TableGuesser.rects_from_horizontals(verticals, horizontals, upward)
      horizontals.each_with_object([]) do |horizontal_line, rectangles|
        horizontal_length = horizontal_line.length
        left_arm = nil
        right_arm = nil

        verticals.each do |vertical_line|
          next unless is_upward_oriented(vertical_line, horizontal_line.top) == upward

          limit = corner_proximity(horizontal_length, vertical_line.length)

          # keep the leftmost arm that starts near the left end
          if vertical_end_near?(vertical_line, horizontal_line.left, horizontal_line.top, limit) &&
             (left_arm.nil? || left_arm.left > vertical_line.left)
            left_arm = vertical_line
          end

          # keep the rightmost arm that starts near the right end
          if vertical_end_near?(vertical_line, horizontal_line.right, horizontal_line.top, limit) &&
             (right_arm.nil? || right_arm.right < vertical_line.right)
            right_arm = vertical_line
          end
        end

        next unless left_arm && right_arm

        # In case not-quite-vertical lines are tolerated some day, measure the
        # extent in y directly instead of trusting the arms' lengths.
        height = [left_arm.bottom - left_arm.top, right_arm.bottom - right_arm.top].max
        y = [left_arm.top, right_arm.top].min
        width = horizontal_line.right - horizontal_line.left
        rectangles << Geometry::Rectangle.new_by_x_y_dims(horizontal_line.left, y, width, height)
      end
    end

    # Rectangles made of one vertical line plus a horizontal line ending near
    # its top and another ending near its bottom.
    def TableGuesser.rects_from_verticals(verticals, horizontals)
      verticals.each_with_object([]) do |vertical_line, rectangles|
        vertical_length = vertical_line.length
        top_edge = nil
        bottom_edge = nil

        horizontals.each do |horizontal_line|
          limit = corner_proximity(horizontal_line.length, vertical_length)

          # keep the topmost line that ends near the top of the vertical
          if horizontal_end_near?(horizontal_line, vertical_line.left, vertical_line.top, limit) &&
             (top_edge.nil? || top_edge.top > horizontal_line.top)
            top_edge = horizontal_line
          end

          # keep the bottommost line that ends near the bottom of the vertical
          if horizontal_end_near?(horizontal_line, vertical_line.left, vertical_line.bottom, limit) &&
             (bottom_edge.nil? || bottom_edge.bottom < horizontal_line.bottom)
            bottom_edge = horizontal_line
          end
        end

        next unless top_edge && bottom_edge

        x = [top_edge.left, bottom_edge.left].min
        width = [top_edge.right - top_edge.left, bottom_edge.right - bottom_edge.left].max
        height = vertical_line.bottom - vertical_line.top
        rectangles << Geometry::Rectangle.new_by_x_y_dims(x, vertical_line.top, width, height)
      end
    end

    private_class_method :corner_proximity, :vertical_end_near?, :horizontal_end_near?,
                         :rects_from_horizontals, :rects_from_verticals
  end
end
|
data/lib/tabula/version.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'tabula/version'
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "tabula-extractor"
|
8
8
|
s.version = Tabula::VERSION
|
9
|
-
s.authors = ["Manuel Aristarán"]
|
9
|
+
s.authors = ["Manuel Aristarán", "Jeremy B. Merill", "Mike Tigas"]
|
10
10
|
s.email = ["manuel@jazzido.com"]
|
11
11
|
s.homepage = "https://github.com/jazzido/tabula-extractor"
|
12
12
|
s.summary = %q{extract tables from PDF files}
|
@@ -14,7 +14,7 @@ Gem::Specification.new do |s|
|
|
14
14
|
|
15
15
|
s.platform = 'java'
|
16
16
|
|
17
|
-
shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
|
17
|
+
shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll', 'liblsd64.dll'].map { |f| 'ext/' + f }
|
18
18
|
s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
|
19
19
|
s.test_files = `git ls-files -- {test,features}/*`.split("\n")
|
20
20
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
Binary file
|
data/test/tests.rb
CHANGED
@@ -23,6 +23,8 @@ class TestPagesInfoExtractor < Minitest::Test
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
+
# Empty suite (no test cases defined yet); presumably reserved for
# Tabula::TableGuesser tests -- confirm with the authors.
# Inherits from Minitest::Test to match the other suites in this file;
# MiniTest::Unit::TestCase is only the legacy Minitest 4 compatibility name.
class TestTableGuesser < Minitest::Test
end
|
26
28
|
|
27
29
|
class TestDumper < Minitest::Test
|
28
30
|
|
@@ -60,6 +62,77 @@ class TestExtractor < Minitest::Test
|
|
60
62
|
assert_equal expected, lines_to_array(Tabula.make_table(characters))
|
61
63
|
end
|
62
64
|
|
65
|
+
def test_forest_disclosure_report_dont_regress
  # Pins the output the extractor produces today so that it cannot get worse.
  # The output we ultimately want is the one in test_forest_disclosure_report:
  # spaces around the "&" in Regional Pulmonary & Sleep, and a fix for lines
  # that are offset by half an x-height.
  pdf = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
  extractor = Tabula::Extraction::CharacterExtractor.new(pdf)
  detected = Tabula::TableGuesser.find_lines_on_page(pdf, 0)

  # Keep a single vertical line per bucket of (left / 10).round.
  rulings = detected.select { |l| l.vertical? }.uniq { |l| (l.left / 10).round }

  area = [110, 28, 218, 833] # top, left, bottom, right
  first_page = extractor.extract.next
  chars = first_page.get_text(area)

  expected = [
    ['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
    ['TOTAL', '', '', '', '$85.00'],
    ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
    ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
    ['TOTAL', '', '', '', '$471.25'],
    ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
    ['TOTAL', '', '', '', '$20.39'],
    ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
    ["", "REGIONAL PULMONARY & SLEEP", "", "", ""],
    ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
    ["", "MEDICINE", "", "", ""],
    ['TOTAL', '', '', '', '$5,010.33'],
    ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
    ['TOTAL', '', '', '', '$193.67'],
    ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']
  ]

  table = Tabula.make_table_with_vertical_rulings(chars, :vertical_rulings => rulings)
  assert_equal expected, lines_to_array(table)
end
|
93
|
+
|
94
|
+
def test_missing_spaces_around_an_ampersand
  # Extracts only the narrow band of the page holding the
  # "REGIONAL PULMONARY & SLEEP / MEDICINE" entry and checks the rows produced.
  pdf = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
  extractor = Tabula::Extraction::CharacterExtractor.new(pdf)
  detected = Tabula::TableGuesser.find_lines_on_page(pdf, 0)

  # Keep a single vertical line per bucket of (left / 10).round.
  rulings = detected.select { |l| l.vertical? }.uniq { |l| (l.left / 10).round }

  area = [170, 28, 185, 833] # top, left, bottom, right
  first_page = extractor.extract.next
  chars = first_page.get_text(area)

  expected = [
    ["", "REGIONAL PULMONARY & SLEEP", "", "", ""],
    ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
    ["", "MEDICINE", "", "", ""]
  ]

  table = Tabula.make_table_with_vertical_rulings(chars, :vertical_rulings => rulings)
  assert_equal expected, lines_to_array(table)
end
|
109
|
+
|
110
|
+
def test_forest_disclosure_report
  # Target output for the first page of the disclosure report: the organisation
  # name that the PDF splits over two lines is expected in one cell, on the
  # same row as the "SPEAKING FEES" entry.
  pdf = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
  extractor = Tabula::Extraction::CharacterExtractor.new(pdf)
  detected = Tabula::TableGuesser.find_lines_on_page(pdf, 0)

  # Keep a single vertical line per bucket of (left / 10).round.
  rulings = detected.select { |l| l.vertical? }.uniq { |l| (l.left / 10).round }

  area = [110, 28, 218, 833] # top, left, bottom, right
  first_page = extractor.extract.next
  chars = first_page.get_text(area)

  expected = [
    ['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
    ['TOTAL', '', '', '', '$85.00'],
    ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
    ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
    ['TOTAL', '', '', '', '$471.25'],
    ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
    ['TOTAL', '', '', '', '$20.39'],
    ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
    ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '$4,700.00'],
    ['TOTAL', '', '', '', '$5,010.33'],
    ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
    ['TOTAL', '', '', '', '$193.67'],
    ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']
  ]

  table = Tabula.make_table_with_vertical_rulings(chars, :vertical_rulings => rulings)
  assert_equal expected, lines_to_array(table)
end
|
135
|
+
|
63
136
|
# TODO Spaces inserted in words - fails
|
64
137
|
def test_bo_page24
|
65
138
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
|
metadata
CHANGED
@@ -2,14 +2,16 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.
|
5
|
+
version: 0.6.1
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
9
|
+
- Jeremy B. Merill
|
10
|
+
- Mike Tigas
|
9
11
|
autorequire:
|
10
12
|
bindir: bin
|
11
13
|
cert_chain: []
|
12
|
-
date: 2013-06-
|
14
|
+
date: 2013-06-18 00:00:00.000000000 Z
|
13
15
|
dependencies:
|
14
16
|
- !ruby/object:Gem::Dependency
|
15
17
|
name: minitest
|
@@ -93,6 +95,9 @@ files:
|
|
93
95
|
- ext/liblsd64.dll
|
94
96
|
- ext/lsd.c
|
95
97
|
- ext/lsd.h
|
98
|
+
- lib/geom/point.rb
|
99
|
+
- lib/geom/rectangle.rb
|
100
|
+
- lib/geom/segment.rb
|
96
101
|
- lib/tabula.rb
|
97
102
|
- lib/tabula/core_ext.rb
|
98
103
|
- lib/tabula/entities.rb
|
@@ -100,6 +105,7 @@ files:
|
|
100
105
|
- lib/tabula/pdf_dump.rb
|
101
106
|
- lib/tabula/pdf_render.rb
|
102
107
|
- lib/tabula/table_extractor.rb
|
108
|
+
- lib/tabula/table_guesser.rb
|
103
109
|
- lib/tabula/version.rb
|
104
110
|
- lib/tabula/whitespace.rb
|
105
111
|
- lib/tabula/writers.rb
|
@@ -108,6 +114,7 @@ files:
|
|
108
114
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
109
115
|
- test/data/argentina_diputados_voting_record.pdf
|
110
116
|
- test/data/bo_page24.pdf
|
117
|
+
- test/data/frx_2012_disclosure.pdf
|
111
118
|
- test/data/gre.pdf
|
112
119
|
- test/data/tabla_subsidios.pdf
|
113
120
|
- test/tests.rb
|
@@ -147,6 +154,7 @@ test_files:
|
|
147
154
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
148
155
|
- test/data/argentina_diputados_voting_record.pdf
|
149
156
|
- test/data/bo_page24.pdf
|
157
|
+
- test/data/frx_2012_disclosure.pdf
|
150
158
|
- test/data/gre.pdf
|
151
159
|
- test/data/tabla_subsidios.pdf
|
152
160
|
- test/tests.rb
|