tabula-extractor 0.6.6-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
version: 0.6.6
|
4
|
+
version: 0.7.0
|
6
5
|
platform: java
|
7
6
|
authors:
|
8
7
|
- Manuel Aristarán
|
@@ -11,7 +10,7 @@ authors:
|
|
11
10
|
autorequire:
|
12
11
|
bindir: bin
|
13
12
|
cert_chain: []
|
14
|
-
date:
|
13
|
+
date: 2014-01-07 00:00:00.000000000 Z
|
15
14
|
dependencies:
|
16
15
|
- !ruby/object:Gem::Dependency
|
17
16
|
name: minitest
|
@@ -20,13 +19,11 @@ dependencies:
|
|
20
19
|
- - '>='
|
21
20
|
- !ruby/object:Gem::Version
|
22
21
|
version: '0'
|
23
|
-
none: false
|
24
22
|
requirement: !ruby/object:Gem::Requirement
|
25
23
|
requirements:
|
26
24
|
- - '>='
|
27
25
|
- !ruby/object:Gem::Version
|
28
26
|
version: '0'
|
29
|
-
none: false
|
30
27
|
prerelease: false
|
31
28
|
type: :development
|
32
29
|
- !ruby/object:Gem::Dependency
|
@@ -36,13 +33,11 @@ dependencies:
|
|
36
33
|
- - '>='
|
37
34
|
- !ruby/object:Gem::Version
|
38
35
|
version: 1.3.4
|
39
|
-
none: false
|
40
36
|
requirement: !ruby/object:Gem::Requirement
|
41
37
|
requirements:
|
42
38
|
- - '>='
|
43
39
|
- !ruby/object:Gem::Version
|
44
40
|
version: 1.3.4
|
45
|
-
none: false
|
46
41
|
prerelease: false
|
47
42
|
type: :development
|
48
43
|
- !ruby/object:Gem::Dependency
|
@@ -52,13 +47,25 @@ dependencies:
|
|
52
47
|
- - '>='
|
53
48
|
- !ruby/object:Gem::Version
|
54
49
|
version: '0'
|
55
|
-
none: false
|
56
50
|
requirement: !ruby/object:Gem::Requirement
|
57
51
|
requirements:
|
58
52
|
- - '>='
|
59
53
|
- !ruby/object:Gem::Version
|
60
54
|
version: '0'
|
61
|
-
|
55
|
+
prerelease: false
|
56
|
+
type: :development
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: pry
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
62
69
|
prerelease: false
|
63
70
|
type: :development
|
64
71
|
- !ruby/object:Gem::Dependency
|
@@ -68,13 +75,11 @@ dependencies:
|
|
68
75
|
- - ~>
|
69
76
|
- !ruby/object:Gem::Version
|
70
77
|
version: '2.0'
|
71
|
-
none: false
|
72
78
|
requirement: !ruby/object:Gem::Requirement
|
73
79
|
requirements:
|
74
80
|
- - ~>
|
75
81
|
- !ruby/object:Gem::Version
|
76
82
|
version: '2.0'
|
77
|
-
none: false
|
78
83
|
prerelease: false
|
79
84
|
type: :runtime
|
80
85
|
description: extract tables from PDF files
|
@@ -109,34 +114,65 @@ files:
|
|
109
114
|
- ext/liblsd64.dll
|
110
115
|
- ext/lsd.c
|
111
116
|
- ext/lsd.h
|
112
|
-
- lib/geom/point.rb
|
113
|
-
- lib/geom/rectangle.rb
|
114
|
-
- lib/geom/segment.rb
|
115
117
|
- lib/tabula.rb
|
116
118
|
- lib/tabula/core_ext.rb
|
117
119
|
- lib/tabula/entities.rb
|
120
|
+
- lib/tabula/entities/cell.rb
|
121
|
+
- lib/tabula/entities/has_cells.rb
|
122
|
+
- lib/tabula/entities/line.rb
|
123
|
+
- lib/tabula/entities/page.rb
|
124
|
+
- lib/tabula/entities/page_area.rb
|
125
|
+
- lib/tabula/entities/ruling.rb
|
126
|
+
- lib/tabula/entities/spreadsheet.rb
|
127
|
+
- lib/tabula/entities/table.rb
|
128
|
+
- lib/tabula/entities/text_chunk.rb
|
129
|
+
- lib/tabula/entities/text_element.rb
|
130
|
+
- lib/tabula/entities/zone_entity.rb
|
131
|
+
- lib/tabula/extraction.rb
|
118
132
|
- lib/tabula/line_segment_detector.rb
|
119
|
-
- lib/tabula/
|
133
|
+
- lib/tabula/pdf_line_extractor.rb
|
120
134
|
- lib/tabula/pdf_render.rb
|
135
|
+
- lib/tabula/spreadsheet_extractor.rb
|
121
136
|
- lib/tabula/table_extractor.rb
|
122
137
|
- lib/tabula/table_guesser.rb
|
123
138
|
- lib/tabula/version.rb
|
124
|
-
- lib/tabula/whitespace.rb
|
125
139
|
- lib/tabula/writers.rb
|
126
140
|
- tabula-extractor.gemspec
|
127
141
|
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
142
|
+
- test/data/47008204D_USA.page4.pdf
|
143
|
+
- test/data/560015757GV_China.page1.pdf
|
128
144
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
145
|
+
- test/data/GSK_2012_Q4.page437.pdf
|
146
|
+
- test/data/S2MNCEbirdisland.pdf
|
129
147
|
- test/data/argentina_diputados_voting_record.pdf
|
130
148
|
- test/data/bo_page24.pdf
|
149
|
+
- test/data/campaign_donors.pdf
|
131
150
|
- test/data/frx_2012_disclosure.pdf
|
151
|
+
- test/data/frx_2012_disclosure.tsv
|
132
152
|
- test/data/gre.pdf
|
153
|
+
- test/data/no_tables.pdf
|
154
|
+
- test/data/puertos1.pdf
|
155
|
+
- test/data/spanning_cells.csv
|
156
|
+
- test/data/spanning_cells.pdf
|
157
|
+
- test/data/strongschools.pdf
|
133
158
|
- test/data/tabla_subsidios.pdf
|
159
|
+
- test/data/vertical_rulings_bug.pdf
|
160
|
+
- test/data/vietnam3.pdf
|
161
|
+
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
162
|
+
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
163
|
+
- test/heuristic-test-set/original/bo_page24.pdf
|
164
|
+
- test/heuristic-test-set/original/campaign_donors.pdf
|
165
|
+
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
166
|
+
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
167
|
+
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|
168
|
+
- test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
|
169
|
+
- test/heuristic.rb
|
170
|
+
- test/test_bin_tabula.sh
|
134
171
|
- test/tests.rb
|
135
|
-
- vertical_rulings_bug.pdf
|
136
|
-
- vertical_rulings_bug.rb
|
137
172
|
homepage: https://github.com/jazzido/tabula-extractor
|
138
173
|
licenses:
|
139
174
|
- MIT
|
175
|
+
metadata: {}
|
140
176
|
post_install_message:
|
141
177
|
rdoc_options: []
|
142
178
|
require_paths:
|
@@ -145,31 +181,46 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
145
181
|
requirements:
|
146
182
|
- - '>='
|
147
183
|
- !ruby/object:Gem::Version
|
148
|
-
segments:
|
149
|
-
- 0
|
150
|
-
hash: 2
|
151
184
|
version: '0'
|
152
|
-
none: false
|
153
185
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
154
186
|
requirements:
|
155
187
|
- - '>='
|
156
188
|
- !ruby/object:Gem::Version
|
157
|
-
segments:
|
158
|
-
- 0
|
159
|
-
hash: 2
|
160
189
|
version: '0'
|
161
|
-
none: false
|
162
190
|
requirements: []
|
163
191
|
rubyforge_project:
|
164
|
-
rubygems_version: 1.
|
192
|
+
rubygems_version: 2.1.9
|
165
193
|
signing_key:
|
166
|
-
specification_version:
|
194
|
+
specification_version: 4
|
167
195
|
summary: extract tables from PDF files
|
168
196
|
test_files:
|
197
|
+
- test/data/47008204D_USA.page4.pdf
|
198
|
+
- test/data/560015757GV_China.page1.pdf
|
169
199
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
200
|
+
- test/data/GSK_2012_Q4.page437.pdf
|
201
|
+
- test/data/S2MNCEbirdisland.pdf
|
170
202
|
- test/data/argentina_diputados_voting_record.pdf
|
171
203
|
- test/data/bo_page24.pdf
|
204
|
+
- test/data/campaign_donors.pdf
|
172
205
|
- test/data/frx_2012_disclosure.pdf
|
206
|
+
- test/data/frx_2012_disclosure.tsv
|
173
207
|
- test/data/gre.pdf
|
208
|
+
- test/data/no_tables.pdf
|
209
|
+
- test/data/puertos1.pdf
|
210
|
+
- test/data/spanning_cells.csv
|
211
|
+
- test/data/spanning_cells.pdf
|
212
|
+
- test/data/strongschools.pdf
|
174
213
|
- test/data/tabla_subsidios.pdf
|
214
|
+
- test/data/vertical_rulings_bug.pdf
|
215
|
+
- test/data/vietnam3.pdf
|
216
|
+
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
217
|
+
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
218
|
+
- test/heuristic-test-set/original/bo_page24.pdf
|
219
|
+
- test/heuristic-test-set/original/campaign_donors.pdf
|
220
|
+
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
221
|
+
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
222
|
+
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|
223
|
+
- test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
|
224
|
+
- test/heuristic.rb
|
225
|
+
- test/test_bin_tabula.sh
|
175
226
|
- test/tests.rb
|
data/lib/geom/point.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
-
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
-
#
|
5
|
-
|
6
|
-
|
7
|
-
module Geometry
|
8
|
-
class Point < Struct.new(:x, :y)
|
9
|
-
def self.new_by_array(array)
|
10
|
-
self.new(array[0], array[1])
|
11
|
-
end
|
12
|
-
|
13
|
-
def ==(another_point)
|
14
|
-
x === another_point.x && y === another_point.y
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
def Point(x, y)
|
20
|
-
Geometry::Point.new(x, y)
|
21
|
-
end
|
data/lib/geom/rectangle.rb
DELETED
@@ -1,101 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
-
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
-
#
|
5
|
-
|
6
|
-
|
7
|
-
module Geometry
|
8
|
-
class Rectangle < Struct.new(:point1, :point2)
|
9
|
-
SIMILARITY_DIVISOR = 20
|
10
|
-
|
11
|
-
def Rectangle.unionize(non_overlapping_rectangles, next_rect)
|
12
|
-
#if next_rect doesn't overlap any of non_overlapping_rectangles
|
13
|
-
if (overlapping = non_overlapping_rectangles.select{|r| next_rect.overlaps? r}) && !non_overlapping_rectangles.empty?
|
14
|
-
#remove all of those that it overlaps from non_overlapping_rectangles and
|
15
|
-
non_overlapping_rectangles -= overlapping
|
16
|
-
#add to non_overlapping_rectangles the bounding box of the overlapping rectangles.
|
17
|
-
non_overlapping_rectangles << overlapping.inject(next_rect){|memo, overlap| memo.bounding_box(overlap) }
|
18
|
-
|
19
|
-
else
|
20
|
-
non_overlapping_rectangles << next_rect
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.new_by_x_y_dims(x, y, width, height)
|
25
|
-
self.new( Point.new_by_array([x, y]), Point.new_by_array([x + width, y + height]) )
|
26
|
-
end
|
27
|
-
|
28
|
-
def x
|
29
|
-
[point1.x, point2.x].min
|
30
|
-
end
|
31
|
-
|
32
|
-
alias_method :left, :x
|
33
|
-
|
34
|
-
def y
|
35
|
-
#puts "y: [#{point1.y} #{point2.y}].min"
|
36
|
-
[point1.y, point2.y].min
|
37
|
-
end
|
38
|
-
|
39
|
-
alias_method :top, :y
|
40
|
-
|
41
|
-
def x2
|
42
|
-
[point1.x, point2.x].max
|
43
|
-
end
|
44
|
-
|
45
|
-
alias_method :right, :x2
|
46
|
-
|
47
|
-
def y2
|
48
|
-
#puts "y2: [#{point1.y} #{point2.y}].max"
|
49
|
-
[point1.y, point2.y].max
|
50
|
-
end
|
51
|
-
|
52
|
-
alias_method :bottom, :y2
|
53
|
-
|
54
|
-
|
55
|
-
def width
|
56
|
-
(point1.x - point2.x).abs
|
57
|
-
end
|
58
|
-
|
59
|
-
def height
|
60
|
-
(point1.y - point2.y).abs
|
61
|
-
end
|
62
|
-
|
63
|
-
def area
|
64
|
-
self.width * self.height
|
65
|
-
end
|
66
|
-
|
67
|
-
def similarity_hash
|
68
|
-
[self.x.to_i / SIMILARITY_DIVISOR, self.y.to_i / SIMILARITY_DIVISOR, self.width.to_i / SIMILARITY_DIVISOR, self.height.to_i / SIMILARITY_DIVISOR].to_s
|
69
|
-
end
|
70
|
-
|
71
|
-
def dims(*format)
|
72
|
-
if format
|
73
|
-
format.map{|method| self.send(method)}
|
74
|
-
else
|
75
|
-
[self.x, self.y, self.width, self.height]
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def contains?(other_x, other_y)
|
80
|
-
(other_x <= x2 && other_x >= x ) && (other_y <= y2 && other_y > y)
|
81
|
-
end
|
82
|
-
|
83
|
-
def overlaps?(other_rect)
|
84
|
-
return contains?(other_rect.x, other_rect.y) || contains?(other_rect.x2, other_rect.y2) ||
|
85
|
-
contains?(other_rect.x, other_rect.y2) || contains?(other_rect.x2, other_rect.y) ||
|
86
|
-
other_rect.contains?(x, y) || other_rect.contains?(x2, y2) ||
|
87
|
-
other_rect.contains?(x, y2) || other_rect.contains?(x2, y)
|
88
|
-
end
|
89
|
-
|
90
|
-
def bounding_box(other_rect)
|
91
|
-
#new rect with bounding box of these two
|
92
|
-
new_x1 = [x, other_rect.x].min
|
93
|
-
new_y1 = [x, other_rect.y].min
|
94
|
-
new_x2 = [x2, other_rect.x2].max
|
95
|
-
new_y2 = [y2, other_rect.y2].max
|
96
|
-
new_width = (new_x2 - new_x1).abs
|
97
|
-
new_height = (new_y2 - new_y1).abs
|
98
|
-
Rectangle.new_by_x_y_dims(new_x1, new_y1, new_width, new_height)
|
99
|
-
end
|
100
|
-
end
|
101
|
-
end
|
data/lib/geom/segment.rb
DELETED
@@ -1,82 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
|
3
|
-
# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
|
4
|
-
#
|
5
|
-
|
6
|
-
|
7
|
-
module Geometry
|
8
|
-
include Math
|
9
|
-
extend Math
|
10
|
-
|
11
|
-
def Geometry.distance(point1, point2)
|
12
|
-
hypot point1.x - point2.x, point1.y - point2.y
|
13
|
-
end
|
14
|
-
|
15
|
-
|
16
|
-
class Segment < Struct.new(:point1, :point2)
|
17
|
-
def self.new_by_arrays(point1_coordinates, point2_coordinates)
|
18
|
-
self.new(Point.new_by_array(point1_coordinates),
|
19
|
-
Point.new_by_array(point2_coordinates))
|
20
|
-
end
|
21
|
-
|
22
|
-
def scale!(scale_factor)
|
23
|
-
self.point1.x = self.point1.x * scale_factor
|
24
|
-
self.point1.y = self.point1.y * scale_factor
|
25
|
-
self.point2.x = self.point2.x * scale_factor
|
26
|
-
self.point2.y = self.point2.y * scale_factor
|
27
|
-
end
|
28
|
-
|
29
|
-
def vertical?
|
30
|
-
point1.x == point2.x
|
31
|
-
end
|
32
|
-
|
33
|
-
def horizontal?
|
34
|
-
point1.y == point2.y
|
35
|
-
end
|
36
|
-
|
37
|
-
def leftmost_endpoint
|
38
|
-
((point1.x <=> point2.x) == -1) ? point1 : point2
|
39
|
-
end
|
40
|
-
|
41
|
-
def rightmost_endpoint
|
42
|
-
((point1.x <=> point2.x) == 1) ? point1 : point2
|
43
|
-
end
|
44
|
-
|
45
|
-
def topmost_endpoint
|
46
|
-
((point1.y <=> point2.y) == 1) ? point1 : point2
|
47
|
-
end
|
48
|
-
|
49
|
-
def bottommost_endpoint
|
50
|
-
((point1.y <=> point2.y) == -1) ? point1 : point2
|
51
|
-
end
|
52
|
-
|
53
|
-
def top
|
54
|
-
topmost_endpoint.y
|
55
|
-
end
|
56
|
-
|
57
|
-
def bottom
|
58
|
-
bottommost_endpoint.y
|
59
|
-
end
|
60
|
-
def width
|
61
|
-
(left - right).abs
|
62
|
-
end
|
63
|
-
def height
|
64
|
-
(bottom - top).abs
|
65
|
-
end
|
66
|
-
|
67
|
-
def left
|
68
|
-
leftmost_endpoint.x
|
69
|
-
end
|
70
|
-
|
71
|
-
def right
|
72
|
-
rightmost_endpoint.x
|
73
|
-
end
|
74
|
-
def length
|
75
|
-
Geometry.distance(point1, point2)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
def Segment(point1, point2)
|
81
|
-
Geometry::Segment.new point1, point2
|
82
|
-
end
|
data/lib/tabula/pdf_dump.rb
DELETED
@@ -1,132 +0,0 @@
|
|
1
|
-
require 'observer'
|
2
|
-
|
3
|
-
require_relative './entities.rb'
|
4
|
-
|
5
|
-
require 'java'
|
6
|
-
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
7
|
-
java_import org.apache.pdfbox.pdfparser.PDFParser
|
8
|
-
java_import org.apache.pdfbox.pdmodel.PDDocument
|
9
|
-
java_import org.apache.pdfbox.util.PDFTextStripper
|
10
|
-
java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
|
11
|
-
|
12
|
-
module Tabula
|
13
|
-
|
14
|
-
module Extraction
|
15
|
-
|
16
|
-
def Extraction.openPDF(pdf_filename, password='')
|
17
|
-
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
18
|
-
document = PDDocument.load(pdf_filename)
|
19
|
-
if document.isEncrypted
|
20
|
-
sdm = StandardDecryptionMaterial.new(password)
|
21
|
-
document.openProtection(sdm)
|
22
|
-
end
|
23
|
-
document
|
24
|
-
end
|
25
|
-
|
26
|
-
class TextExtractor < org.apache.pdfbox.util.PDFTextStripper
|
27
|
-
|
28
|
-
attr_accessor :characters, :fonts
|
29
|
-
|
30
|
-
PRINTABLE_RE = /[[:print:]]/
|
31
|
-
|
32
|
-
def initialize
|
33
|
-
super
|
34
|
-
self.fonts = {}
|
35
|
-
self.characters = []
|
36
|
-
self.setSortByPosition(true)
|
37
|
-
end
|
38
|
-
|
39
|
-
def clear!
|
40
|
-
self.characters = []; self.fonts = {}
|
41
|
-
end
|
42
|
-
|
43
|
-
def processTextPosition(text)
|
44
|
-
# return if text.getCharacter == ' '
|
45
|
-
|
46
|
-
# text_font = text.getFont
|
47
|
-
# text_size = text.getFontSize
|
48
|
-
# font_plus_size = self.fonts.select { |k, v| v == text_font }.first.first + "-" + text_size.to_i.to_s
|
49
|
-
|
50
|
-
# $fonts[$current_page].merge!({
|
51
|
-
# font_plus_size => { :family => text_font.getBaseFont, :size => text_size }
|
52
|
-
# })
|
53
|
-
|
54
|
-
# $page_contents[$current_page] += " <text top=\"%.2f\" left=\"%.2f\" width=\"%.2f\" height=\"%.2f\" font=\"#{font_plus_size}\" dir=\"#{text.getDir}\">#{text.getCharacter}</text>\n" % [text.getYDirAdj - text.getHeightDir, text.getXDirAdj, text.getWidthDirAdj, text.getHeightDir]
|
55
|
-
|
56
|
-
c = text.getCharacter
|
57
|
-
# probably not the fastest way of detecting printable chars
|
58
|
-
self.characters << text if c =~ PRINTABLE_RE
|
59
|
-
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
class PagesInfoExtractor
|
64
|
-
def initialize(pdf_filename, password='')
|
65
|
-
@pdf_file = Extraction.openPDF(pdf_filename, password)
|
66
|
-
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
67
|
-
end
|
68
|
-
|
69
|
-
def pages
|
70
|
-
Enumerator.new do |y|
|
71
|
-
begin
|
72
|
-
@all_pages.each_with_index do |page, i|
|
73
|
-
contents = page.getContents
|
74
|
-
# next if contents.nil?
|
75
|
-
y.yield Tabula::Page.new(page.findCropBox.width,
|
76
|
-
page.findCropBox.height,
|
77
|
-
page.getRotation.to_i,
|
78
|
-
i+1)
|
79
|
-
end
|
80
|
-
ensure
|
81
|
-
@pdf_file.close
|
82
|
-
end
|
83
|
-
end
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
|
88
|
-
class CharacterExtractor
|
89
|
-
include Observable
|
90
|
-
|
91
|
-
#N.B. pages can be :all, a list of pages or a range.
|
92
|
-
def initialize(pdf_filename, pages=[1], password='')
|
93
|
-
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
94
|
-
@pdf_file = Extraction.openPDF(pdf_filename, password)
|
95
|
-
@all_pages = @pdf_file.getDocumentCatalog.getAllPages
|
96
|
-
@pages = pages == :all ? (1..@all_pages.size) : pages
|
97
|
-
@extractor = TextExtractor.new
|
98
|
-
end
|
99
|
-
|
100
|
-
def extract
|
101
|
-
Enumerator.new do |y|
|
102
|
-
begin
|
103
|
-
@pages.each do |i|
|
104
|
-
page = @all_pages.get(i-1)
|
105
|
-
contents = page.getContents
|
106
|
-
next if contents.nil?
|
107
|
-
@extractor.clear!
|
108
|
-
@extractor.processStream(page, page.findResources, contents.getStream)
|
109
|
-
|
110
|
-
y.yield Tabula::Page.new(page.findCropBox.width,
|
111
|
-
page.findCropBox.height,
|
112
|
-
page.getRotation.to_i,
|
113
|
-
i+1,
|
114
|
-
@extractor.characters.map { |char|
|
115
|
-
Tabula::TextElement.new(char.getYDirAdj.round(2),
|
116
|
-
char.getXDirAdj.round(2),
|
117
|
-
char.getWidthDirAdj.round(2),
|
118
|
-
char.getHeightDir.round(2),
|
119
|
-
char.getFont,
|
120
|
-
char.getFontSize.round(2),
|
121
|
-
char.getCharacter,
|
122
|
-
char.getWidthOfSpace)
|
123
|
-
})
|
124
|
-
end
|
125
|
-
ensure
|
126
|
-
@pdf_file.close
|
127
|
-
end # begin
|
128
|
-
end
|
129
|
-
end
|
130
|
-
end
|
131
|
-
end
|
132
|
-
end
|