tabula-extractor 0.6.6-java → 0.7.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
version: 0.6.6
|
4
|
+
version: 0.7.0
|
6
5
|
platform: java
|
7
6
|
authors:
|
8
7
|
- Manuel Aristarán
|
@@ -11,7 +10,7 @@ authors:
|
|
11
10
|
autorequire:
|
12
11
|
bindir: bin
|
13
12
|
cert_chain: []
|
14
|
-
date:
|
13
|
+
date: 2014-01-07 00:00:00.000000000 Z
|
15
14
|
dependencies:
|
16
15
|
- !ruby/object:Gem::Dependency
|
17
16
|
name: minitest
|
@@ -20,13 +19,11 @@ dependencies:
|
|
20
19
|
- - '>='
|
21
20
|
- !ruby/object:Gem::Version
|
22
21
|
version: '0'
|
23
|
-
none: false
|
24
22
|
requirement: !ruby/object:Gem::Requirement
|
25
23
|
requirements:
|
26
24
|
- - '>='
|
27
25
|
- !ruby/object:Gem::Version
|
28
26
|
version: '0'
|
29
|
-
none: false
|
30
27
|
prerelease: false
|
31
28
|
type: :development
|
32
29
|
- !ruby/object:Gem::Dependency
|
@@ -36,13 +33,11 @@ dependencies:
|
|
36
33
|
- - '>='
|
37
34
|
- !ruby/object:Gem::Version
|
38
35
|
version: 1.3.4
|
39
|
-
none: false
|
40
36
|
requirement: !ruby/object:Gem::Requirement
|
41
37
|
requirements:
|
42
38
|
- - '>='
|
43
39
|
- !ruby/object:Gem::Version
|
44
40
|
version: 1.3.4
|
45
|
-
none: false
|
46
41
|
prerelease: false
|
47
42
|
type: :development
|
48
43
|
- !ruby/object:Gem::Dependency
|
@@ -52,13 +47,25 @@ dependencies:
|
|
52
47
|
- - '>='
|
53
48
|
- !ruby/object:Gem::Version
|
54
49
|
version: '0'
|
55
|
-
none: false
|
56
50
|
requirement: !ruby/object:Gem::Requirement
|
57
51
|
requirements:
|
58
52
|
- - '>='
|
59
53
|
- !ruby/object:Gem::Version
|
60
54
|
version: '0'
|
61
|
-
|
55
|
+
prerelease: false
|
56
|
+
type: :development
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: pry
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
62
69
|
prerelease: false
|
63
70
|
type: :development
|
64
71
|
- !ruby/object:Gem::Dependency
|
@@ -68,13 +75,11 @@ dependencies:
|
|
68
75
|
- - ~>
|
69
76
|
- !ruby/object:Gem::Version
|
70
77
|
version: '2.0'
|
71
|
-
none: false
|
72
78
|
requirement: !ruby/object:Gem::Requirement
|
73
79
|
requirements:
|
74
80
|
- - ~>
|
75
81
|
- !ruby/object:Gem::Version
|
76
82
|
version: '2.0'
|
77
|
-
none: false
|
78
83
|
prerelease: false
|
79
84
|
type: :runtime
|
80
85
|
description: extract tables from PDF files
|
@@ -109,34 +114,65 @@ files:
|
|
109
114
|
- ext/liblsd64.dll
|
110
115
|
- ext/lsd.c
|
111
116
|
- ext/lsd.h
|
112
|
-
- lib/geom/point.rb
|
113
|
-
- lib/geom/rectangle.rb
|
114
|
-
- lib/geom/segment.rb
|
115
117
|
- lib/tabula.rb
|
116
118
|
- lib/tabula/core_ext.rb
|
117
119
|
- lib/tabula/entities.rb
|
120
|
+
- lib/tabula/entities/cell.rb
|
121
|
+
- lib/tabula/entities/has_cells.rb
|
122
|
+
- lib/tabula/entities/line.rb
|
123
|
+
- lib/tabula/entities/page.rb
|
124
|
+
- lib/tabula/entities/page_area.rb
|
125
|
+
- lib/tabula/entities/ruling.rb
|
126
|
+
- lib/tabula/entities/spreadsheet.rb
|
127
|
+
- lib/tabula/entities/table.rb
|
128
|
+
- lib/tabula/entities/text_chunk.rb
|
129
|
+
- lib/tabula/entities/text_element.rb
|
130
|
+
- lib/tabula/entities/zone_entity.rb
|
131
|
+
- lib/tabula/extraction.rb
|
118
132
|
- lib/tabula/line_segment_detector.rb
|
119
|
-
- lib/tabula/pdf_dump.rb
|
133
|
+
- lib/tabula/pdf_line_extractor.rb
|
120
134
|
- lib/tabula/pdf_render.rb
|
135
|
+
- lib/tabula/spreadsheet_extractor.rb
|
121
136
|
- lib/tabula/table_extractor.rb
|
122
137
|
- lib/tabula/table_guesser.rb
|
123
138
|
- lib/tabula/version.rb
|
124
|
-
- lib/tabula/whitespace.rb
|
125
139
|
- lib/tabula/writers.rb
|
126
140
|
- tabula-extractor.gemspec
|
127
141
|
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
142
|
+
- test/data/47008204D_USA.page4.pdf
|
143
|
+
- test/data/560015757GV_China.page1.pdf
|
128
144
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
145
|
+
- test/data/GSK_2012_Q4.page437.pdf
|
146
|
+
- test/data/S2MNCEbirdisland.pdf
|
129
147
|
- test/data/argentina_diputados_voting_record.pdf
|
130
148
|
- test/data/bo_page24.pdf
|
149
|
+
- test/data/campaign_donors.pdf
|
131
150
|
- test/data/frx_2012_disclosure.pdf
|
151
|
+
- test/data/frx_2012_disclosure.tsv
|
132
152
|
- test/data/gre.pdf
|
153
|
+
- test/data/no_tables.pdf
|
154
|
+
- test/data/puertos1.pdf
|
155
|
+
- test/data/spanning_cells.csv
|
156
|
+
- test/data/spanning_cells.pdf
|
157
|
+
- test/data/strongschools.pdf
|
133
158
|
- test/data/tabla_subsidios.pdf
|
159
|
+
- test/data/vertical_rulings_bug.pdf
|
160
|
+
- test/data/vietnam3.pdf
|
161
|
+
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
162
|
+
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
163
|
+
- test/heuristic-test-set/original/bo_page24.pdf
|
164
|
+
- test/heuristic-test-set/original/campaign_donors.pdf
|
165
|
+
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
166
|
+
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
167
|
+
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|
168
|
+
- test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
|
169
|
+
- test/heuristic.rb
|
170
|
+
- test/test_bin_tabula.sh
|
134
171
|
- test/tests.rb
|
135
|
-
- vertical_rulings_bug.pdf
|
136
|
-
- vertical_rulings_bug.rb
|
137
172
|
homepage: https://github.com/jazzido/tabula-extractor
|
138
173
|
licenses:
|
139
174
|
- MIT
|
175
|
+
metadata: {}
|
140
176
|
post_install_message:
|
141
177
|
rdoc_options: []
|
142
178
|
require_paths:
|
@@ -145,31 +181,46 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
145
181
|
requirements:
|
146
182
|
- - '>='
|
147
183
|
- !ruby/object:Gem::Version
|
148
|
-
segments:
|
149
|
-
- 0
|
150
|
-
hash: 2
|
151
184
|
version: '0'
|
152
|
-
none: false
|
153
185
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
154
186
|
requirements:
|
155
187
|
- - '>='
|
156
188
|
- !ruby/object:Gem::Version
|
157
|
-
segments:
|
158
|
-
- 0
|
159
|
-
hash: 2
|
160
189
|
version: '0'
|
161
|
-
none: false
|
162
190
|
requirements: []
|
163
191
|
rubyforge_project:
|
164
|
-
rubygems_version: 1.
|
192
|
+
rubygems_version: 2.1.9
|
165
193
|
signing_key:
|
166
|
-
specification_version:
|
194
|
+
specification_version: 4
|
167
195
|
summary: extract tables from PDF files
|
168
196
|
test_files:
|
197
|
+
- test/data/47008204D_USA.page4.pdf
|
198
|
+
- test/data/560015757GV_China.page1.pdf
|
169
199
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
200
|
+
- test/data/GSK_2012_Q4.page437.pdf
|
201
|
+
- test/data/S2MNCEbirdisland.pdf
|
170
202
|
- test/data/argentina_diputados_voting_record.pdf
|
171
203
|
- test/data/bo_page24.pdf
|
204
|
+
- test/data/campaign_donors.pdf
|
172
205
|
- test/data/frx_2012_disclosure.pdf
|
206
|
+
- test/data/frx_2012_disclosure.tsv
|
173
207
|
- test/data/gre.pdf
|
208
|
+
- test/data/no_tables.pdf
|
209
|
+
- test/data/puertos1.pdf
|
210
|
+
- test/data/spanning_cells.csv
|
211
|
+
- test/data/spanning_cells.pdf
|
212
|
+
- test/data/strongschools.pdf
|
174
213
|
- test/data/tabla_subsidios.pdf
|
214
|
+
- test/data/vertical_rulings_bug.pdf
|
215
|
+
- test/data/vietnam3.pdf
|
216
|
+
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
217
|
+
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
218
|
+
- test/heuristic-test-set/original/bo_page24.pdf
|
219
|
+
- test/heuristic-test-set/original/campaign_donors.pdf
|
220
|
+
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
221
|
+
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
222
|
+
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|
223
|
+
- test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
|
224
|
+
- test/heuristic.rb
|
225
|
+
- test/test_bin_tabula.sh
|
175
226
|
- test/tests.rb
|
data/lib/geom/point.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
-
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
-
#
|
5
|
-
|
6
|
-
|
7
|
-
module Geometry
|
8
|
-
class Point < Struct.new(:x, :y)
|
9
|
-
def self.new_by_array(array)
|
10
|
-
self.new(array[0], array[1])
|
11
|
-
end
|
12
|
-
|
13
|
-
def ==(another_point)
|
14
|
-
x === another_point.x && y === another_point.y
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
def Point(x, y)
|
20
|
-
Geometry::Point.new(x, y)
|
21
|
-
end
|
data/lib/geom/rectangle.rb
DELETED
@@ -1,101 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
-
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
-
#
|
5
|
-
|
6
|
-
|
7
|
-
module Geometry
|
8
|
-
class Rectangle < Struct.new(:point1, :point2)
|
9
|
-
SIMILARITY_DIVISOR = 20
|
10
|
-
|
11
|
-
def Rectangle.unionize(non_overlapping_rectangles, next_rect)
|
12
|
-
#if next_rect doesn't overlap any of non_overlapping_rectangles
|
13
|
-
if (overlapping = non_overlapping_rectangles.select{|r| next_rect.overlaps? r}) && !non_overlapping_rectangles.empty?
|
14
|
-
#remove all of those that it overlaps from non_overlapping_rectangles and
|
15
|
-
non_overlapping_rectangles -= overlapping
|
16
|
-
#add to non_overlapping_rectangles the bounding box of the overlapping rectangles.
|
17
|
-
non_overlapping_rectangles << overlapping.inject(next_rect){|memo, overlap| memo.bounding_box(overlap) }
|
18
|
-
|
19
|
-
else
|
20
|
-
non_overlapping_rectangles << next_rect
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.new_by_x_y_dims(x, y, width, height)
|
25
|
-
self.new( Point.new_by_array([x, y]), Point.new_by_array([x + width, y + height]) )
|
26
|
-
end
|
27
|
-
|
28
|
-
def x
|
29
|
-
[point1.x, point2.x].min
|
30
|
-
end
|
31
|
-
|
32
|
-
alias_method :left, :x
|
33
|
-
|
34
|
-
def y
|
35
|
-
#puts "y: [#{point1.y} #{point2.y}].min"
|
36
|
-
[point1.y, point2.y].min
|
37
|
-
end
|
38
|
-
|
39
|
-
alias_method :top, :y
|
40
|
-
|
41
|
-
def x2
|
42
|
-
[point1.x, point2.x].max
|
43
|
-
end
|
44
|
-
|
45
|
-
alias_method :right, :x2
|
46
|
-
|
47
|
-
def y2
|
48
|
-
#puts "y2: [#{point1.y} #{point2.y}].max"
|
49
|
-
[point1.y, point2.y].max
|
50
|
-
end
|
51
|
-
|
52
|
-
alias_method :bottom, :y2
|
53
|
-
|
54
|
-
|
55
|
-
def width
|
56
|
-
(point1.x - point2.x).abs
|
57
|
-
end
|
58
|
-
|
59
|
-
def height
|
60
|
-
(point1.y - point2.y).abs
|
61
|
-
end
|
62
|
-
|
63
|
-
def area
|
64
|
-
self.width * self.height
|
65
|
-
end
|
66
|
-
|
67
|
-
def similarity_hash
|
68
|
-
[self.x.to_i / SIMILARITY_DIVISOR, self.y.to_i / SIMILARITY_DIVISOR, self.width.to_i / SIMILARITY_DIVISOR, self.height.to_i / SIMILARITY_DIVISOR].to_s
|
69
|
-
end
|
70
|
-
|
71
|
-
def dims(*format)
|
72
|
-
if format
|
73
|
-
format.map{|method| self.send(method)}
|
74
|
-
else
|
75
|
-
[self.x, self.y, self.width, self.height]
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def contains?(other_x, other_y)
|
80
|
-
(other_x <= x2 && other_x >= x ) && (other_y <= y2 && other_y > y)
|
81
|
-
end
|
82
|
-
|
83
|
-
def overlaps?(other_rect)
|
84
|
-
return contains?(other_rect.x, other_rect.y) || contains?(other_rect.x2, other_rect.y2) ||
|
85
|
-
contains?(other_rect.x, other_rect.y2) || contains?(other_rect.x2, other_rect.y) ||
|
86
|
-
other_rect.contains?(x, y) || other_rect.contains?(x2, y2) ||
|
87
|
-
other_rect.contains?(x, y2) || other_rect.contains?(x2, y)
|
88
|
-
end
|
89
|
-
|
90
|
-
def bounding_box(other_rect)
|
91
|
-
#new rect with bounding box of these two
|
92
|
-
new_x1 = [x, other_rect.x].min
|
93
|
-
new_y1 = [x, other_rect.y].min
|
94
|
-
new_x2 = [x2, other_rect.x2].max
|
95
|
-
new_y2 = [y2, other_rect.y2].max
|
96
|
-
new_width = (new_x2 - new_x1).abs
|
97
|
-
new_height = (new_y2 - new_y1).abs
|
98
|
-
Rectangle.new_by_x_y_dims(new_x1, new_y1, new_width, new_height)
|
99
|
-
end
|
100
|
-
end
|
101
|
-
end
|
data/lib/geom/segment.rb
DELETED
@@ -1,82 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
-
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
-
#
|
5
|
-
|
6
|
-
|
7
|
-
module Geometry
|
8
|
-
include Math
|
9
|
-
extend Math
|
10
|
-
|
11
|
-
def Geometry.distance(point1, point2)
|
12
|
-
hypot point1.x - point2.x, point1.y - point2.y
|
13
|
-
end
|
14
|
-
|
15
|
-
|
16
|
-
class Segment < Struct.new(:point1, :point2)
|
17
|
-
def self.new_by_arrays(point1_coordinates, point2_coordinates)
|
18
|
-
self.new(Point.new_by_array(point1_coordinates),
|
19
|
-
Point.new_by_array(point2_coordinates))
|
20
|
-
end
|
21
|
-
|
22
|
-
def scale!(scale_factor)
|
23
|
-
self.point1.x = self.point1.x * scale_factor
|
24
|
-
self.point1.y = self.point1.y * scale_factor
|
25
|
-
self.point2.x = self.point2.x * scale_factor
|
26
|
-
self.point2.y = self.point2.y * scale_factor
|
27
|
-
end
|
28
|
-
|
29
|
-
def vertical?
|
30
|
-
point1.x == point2.x
|
31
|
-
end
|
32
|
-
|
33
|
-
def horizontal?
|
34
|
-
point1.y == point2.y
|
35
|
-
end
|
36
|
-
|
37
|
-
def leftmost_endpoint
|
38
|
-
((point1.x <=> point2.x) == -1) ? point1 : point2
|
39
|
-
end
|
40
|
-
|
41
|
-
def rightmost_endpoint
|
42
|
-
((point1.x <=> point2.x) == 1) ? point1 : point2
|
43
|
-
end
|
44
|
-
|
45
|
-
def topmost_endpoint
|
46
|
-
((point1.y <=> point2.y) == 1) ? point1 : point2
|
47
|
-
end
|
48
|
-
|
49
|
-
def bottommost_endpoint
|
50
|
-
((point1.y <=> point2.y) == -1) ? point1 : point2
|
51
|
-
end
|
52
|
-
|
53
|
-
def top
|
54
|
-
topmost_endpoint.y
|
55
|
-
end
|
56
|
-
|
57
|
-
def bottom
|
58
|
-
bottommost_endpoint.y
|
59
|
-
end
|
60
|
-
def width
|
61
|
-
(left - right).abs
|
62
|
-
end
|
63
|
-
def height
|
64
|
-
(bottom - top).abs
|
65
|
-
end
|
66
|
-
|
67
|
-
def left
|
68
|
-
leftmost_endpoint.x
|
69
|
-
end
|
70
|
-
|
71
|
-
def right
|
72
|
-
rightmost_endpoint.x
|
73
|
-
end
|
74
|
-
def length
|
75
|
-
Geometry.distance(point1, point2)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
def Segment(point1, point2)
|
81
|
-
Geometry::Segment.new point1, point2
|
82
|
-
end
|
data/lib/tabula/pdf_dump.rb
DELETED
@@ -1,132 +0,0 @@
|
|
1
|
-
require 'observer'
|
2
|
-
|
3
|
-
require_relative './entities.rb'
|
4
|
-
|
5
|
-
require 'java'
|
6
|
-
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
7
|
-
java_import org.apache.pdfbox.pdfparser.PDFParser
|
8
|
-
java_import org.apache.pdfbox.pdmodel.PDDocument
|
9
|
-
java_import org.apache.pdfbox.util.PDFTextStripper
|
10
|
-
java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
|
11
|
-
|
12
|
-
module Tabula
|
13
|
-
|
14
|
-
module Extraction
|
15
|
-
|
16
|
-
def Extraction.openPDF(pdf_filename, password='')
|
17
|
-
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
18
|
-
document = PDDocument.load(pdf_filename)
|
19
|
-
if document.isEncrypted
|
20
|
-
sdm = StandardDecryptionMaterial.new(password)
|
21
|
-
document.openProtection(sdm)
|
22
|
-
end
|
23
|
-
document
|
24
|
-
end
|
25
|
-
|
26
|
-
class TextExtractor < org.apache.pdfbox.util.PDFTextStripper
|
27
|
-
|
28
|
-
attr_accessor :characters, :fonts
|
29
|
-
|
30
|
-
PRINTABLE_RE = /[[:print:]]/
|
31
|
-
|
32
|
-
def initialize
|
33
|
-
super
|
34
|
-
self.fonts = {}
|
35
|
-
self.characters = []
|
36
|
-
self.setSortByPosition(true)
|
37
|
-
end
|
38
|
-
|
39
|
-
def clear!
|
40
|
-
self.characters = []; self.fonts = {}
|
41
|
-
end
|
42
|
-
|
43
|
-
def processTextPosition(text)
|
44
|
-
# return if text.getCharacter == ' '
|
45
|
-
|
46
|
-
# text_font = text.getFont
|
47
|
-
# text_size = text.getFontSize
|
48
|
-
# font_plus_size = self.fonts.select { |k, v| v == text_font }.first.first + "-" + text_size.to_i.to_s
|
49
|
-
|
50
|
-
# $fonts[$current_page].merge!({
|
51
|
-
# font_plus_size => { :family => text_font.getBaseFont, :size => text_size }
|
52
|
-
# })
|
53
|
-
|
54
|
-
# $page_contents[$current_page] += " <text top=\"%.2f\" left=\"%.2f\" width=\"%.2f\" height=\"%.2f\" font=\"#{font_plus_size}\" dir=\"#{text.getDir}\">#{text.getCharacter}</text>\n" % [text.getYDirAdj - text.getHeightDir, text.getXDirAdj, text.getWidthDirAdj, text.getHeightDir]
|
55
|
-
|
56
|
-
c = text.getCharacter
|
57
|
-
# probably not the fastest way of detecting printable chars
|
58
|
-
self.characters << text if c =~ PRINTABLE_RE
|
59
|
-
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
class PagesInfoExtractor
|
64
|
-
def initialize(pdf_filename, password='')
|
65
|
-
@pdf_file = Extraction.openPDF(pdf_filename, password)
|
66
|
-
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
67
|
-
end
|
68
|
-
|
69
|
-
def pages
|
70
|
-
Enumerator.new do |y|
|
71
|
-
begin
|
72
|
-
@all_pages.each_with_index do |page, i|
|
73
|
-
contents = page.getContents
|
74
|
-
# next if contents.nil?
|
75
|
-
y.yield Tabula::Page.new(page.findCropBox.width,
|
76
|
-
page.findCropBox.height,
|
77
|
-
page.getRotation.to_i,
|
78
|
-
i+1)
|
79
|
-
end
|
80
|
-
ensure
|
81
|
-
@pdf_file.close
|
82
|
-
end
|
83
|
-
end
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
|
88
|
-
class CharacterExtractor
|
89
|
-
include Observable
|
90
|
-
|
91
|
-
#N.B. pages can be :all, a list of pages or a range.
|
92
|
-
def initialize(pdf_filename, pages=[1], password='')
|
93
|
-
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
94
|
-
@pdf_file = Extraction.openPDF(pdf_filename, password)
|
95
|
-
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
96
|
-
@pages = pages == :all ? (1..@all_pages.size) : pages
|
97
|
-
@extractor = TextExtractor.new
|
98
|
-
end
|
99
|
-
|
100
|
-
def extract
|
101
|
-
Enumerator.new do |y|
|
102
|
-
begin
|
103
|
-
@pages.each do |i|
|
104
|
-
page = @all_pages.get(i-1)
|
105
|
-
contents = page.getContents
|
106
|
-
next if contents.nil?
|
107
|
-
@extractor.clear!
|
108
|
-
@extractor.processStream(page, page.findResources, contents.getStream)
|
109
|
-
|
110
|
-
y.yield Tabula::Page.new(page.findCropBox.width,
|
111
|
-
page.findCropBox.height,
|
112
|
-
page.getRotation.to_i,
|
113
|
-
i+1,
|
114
|
-
@extractor.characters.map { |char|
|
115
|
-
Tabula::TextElement.new(char.getYDirAdj.round(2),
|
116
|
-
char.getXDirAdj.round(2),
|
117
|
-
char.getWidthDirAdj.round(2),
|
118
|
-
char.getHeightDir.round(2),
|
119
|
-
char.getFont,
|
120
|
-
char.getFontSize.round(2),
|
121
|
-
char.getCharacter,
|
122
|
-
char.getWidthOfSpace)
|
123
|
-
})
|
124
|
-
end
|
125
|
-
ensure
|
126
|
-
@pdf_file.close
|
127
|
-
end # begin
|
128
|
-
end
|
129
|
-
end
|
130
|
-
end
|
131
|
-
end
|
132
|
-
end
|