tabula-extractor 0.7.2-java → 0.7.4-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +4 -8
- data/bin/tabula +3 -3
- data/lib/tabula.rb +9 -5
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/cell.rb +6 -4
- data/lib/tabula/entities/has_cells.rb +22 -78
- data/lib/tabula/entities/line.rb +52 -6
- data/lib/tabula/entities/page.rb +43 -50
- data/lib/tabula/entities/ruling.rb +83 -105
- data/lib/tabula/entities/spreadsheet.rb +74 -11
- data/lib/tabula/entities/table.rb +55 -37
- data/lib/tabula/entities/tabular.rb +42 -0
- data/lib/tabula/entities/text_chunk.rb +55 -52
- data/lib/tabula/entities/text_element.rb +129 -62
- data/lib/tabula/entities/zone_entity.rb +15 -6
- data/lib/tabula/extraction.rb +114 -49
- data/lib/tabula/line_segment_detector.rb +0 -5
- data/lib/tabula/table_extractor.rb +32 -37
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -5
- metadata +13 -95
- data/ext/COPYING +0 -661
- data/ext/Makefile.OSX +0 -18
- data/ext/Makefile.defaults +0 -9
- data/ext/Makefile.linux32 +0 -11
- data/ext/Makefile.linux64 +0 -12
- data/ext/Makefile.mingw +0 -10
- data/ext/Makefile.mingw64 +0 -10
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +0 -3
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +0 -2270
- data/ext/lsd.h +0 -283
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/argentina_diputados_voting_record.pdf +0 -0
- data/test/data/bo_page24.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +0 -88
- data/test/data/gre.pdf +0 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +0 -21
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/tabla_subsidios.pdf +0 -0
- data/test/data/vertical_rulings_bug.pdf +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +0 -50
- data/test/test_bin_tabula.sh +0 -7
- data/test/tests.rb +0 -603
data/lib/tabula/extraction.rb
CHANGED
@@ -31,6 +31,8 @@ module Tabula
|
|
31
31
|
:extract_ruling_lines => true
|
32
32
|
}
|
33
33
|
|
34
|
+
# TODO: the +pages+ constructor argument does not make sense
|
35
|
+
# now that we have +extract_page+ and +extract_pages+
|
34
36
|
def initialize(pdf_filename, pages=[1], password='', options={})
|
35
37
|
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
36
38
|
@pdf_filename = pdf_filename
|
@@ -47,39 +49,67 @@ module Tabula
|
|
47
49
|
@transformed_clipping_path = nil
|
48
50
|
self.clipping_paths = []
|
49
51
|
@rulings = []
|
50
|
-
@min_char_width = @min_char_height =
|
52
|
+
@min_char_width = @min_char_height = Float::MAX
|
51
53
|
end
|
52
54
|
|
53
|
-
def
|
55
|
+
def close!
|
56
|
+
self.ensure_open!
|
57
|
+
@pdf_file.close
|
58
|
+
@pdf_file_closed = true
|
59
|
+
end
|
60
|
+
|
61
|
+
def ensure_open!
|
62
|
+
raise "Document is closed" if @pdf_file_closed
|
63
|
+
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# extract objects from a page. Returns an instance of +Tabula::Page+
|
67
|
+
# (+page_number+ is 1-based. i.e., first page is number 1)
|
68
|
+
def extract_page(page_number)
|
69
|
+
self.ensure_open!
|
70
|
+
|
71
|
+
if page_number-1 >= @all_pages.size || (page_number) < 0
|
72
|
+
raise IndexError, "Page #{page_number} doesn't exist. Skipping. Valid pages are 1..#{@all_pages.size}"
|
73
|
+
end
|
74
|
+
|
75
|
+
page = @all_pages.get(page_number-1)
|
76
|
+
contents = page.getContents
|
77
|
+
return nil if contents.nil?
|
78
|
+
|
79
|
+
self.clear!
|
80
|
+
self.drawPage(page)
|
81
|
+
Tabula::Page.new(@pdf_filename,
|
82
|
+
page.findCropBox.width,
|
83
|
+
page.findCropBox.height,
|
84
|
+
page.getRotation.to_i,
|
85
|
+
page_number, #one-indexed, just like +page_number+ is.
|
86
|
+
self.characters,
|
87
|
+
self.rulings,
|
88
|
+
@min_char_width,
|
89
|
+
@min_char_height)
|
90
|
+
end
|
91
|
+
|
92
|
+
def extract(pages=nil)
|
93
|
+
self.ensure_open!
|
94
|
+
pages = if pages == :all
|
95
|
+
(1..@all_pages.size)
|
96
|
+
elsif pages.nil?
|
97
|
+
@pages
|
98
|
+
else
|
99
|
+
pages
|
100
|
+
end
|
101
|
+
|
54
102
|
Enumerator.new do |y|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
contents = page.getContents
|
59
|
-
next if contents.nil?
|
60
|
-
|
61
|
-
self.clear!
|
62
|
-
self.drawPage(page)
|
63
|
-
p = Tabula::Page.new(@pdf_filename,
|
64
|
-
page.findCropBox.width,
|
65
|
-
page.findCropBox.height,
|
66
|
-
page.getRotation.to_i,
|
67
|
-
i, #one-indexed, just like `i` is.
|
68
|
-
self.characters,
|
69
|
-
self.rulings,
|
70
|
-
@min_char_width,
|
71
|
-
@min_char_height)
|
72
|
-
y.yield p
|
73
|
-
end
|
74
|
-
ensure
|
75
|
-
@pdf_file.close
|
76
|
-
end # begin
|
103
|
+
pages.each do |i|
|
104
|
+
y.yield self.extract_page(i)
|
105
|
+
end
|
77
106
|
end
|
78
107
|
end
|
79
108
|
|
80
109
|
def clear!
|
81
110
|
self.characters.clear
|
82
111
|
self.clipping_paths.clear
|
112
|
+
@min_char_width = @min_char_height = Float::MAX
|
83
113
|
@page_transform = nil
|
84
114
|
@rulings.clear
|
85
115
|
end
|
@@ -118,8 +148,14 @@ module Tabula
|
|
118
148
|
|
119
149
|
path = self.pathToList(self.getLinePath)
|
120
150
|
|
151
|
+
# skip paths whose first operation is not a MOVETO
|
152
|
+
# or contains operations other than LINETO, MOVETO or CLOSE
|
121
153
|
if path[0][0] != java.awt.geom.PathIterator::SEG_MOVETO \
|
122
|
-
|
154
|
+
|| path[1..-1].any? { |p|
|
155
|
+
p.first != java.awt.geom.PathIterator::SEG_LINETO \
|
156
|
+
&& p.first != java.awt.geom.PathIterator::SEG_MOVETO \
|
157
|
+
&& p.first != java.awt.geom.PathIterator::SEG_CLOSE
|
158
|
+
}
|
123
159
|
self.getLinePath.reset
|
124
160
|
return
|
125
161
|
end
|
@@ -129,26 +165,57 @@ module Tabula
|
|
129
165
|
strokeColorComps = filter_by_color || self.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
|
130
166
|
color_filter = self.options[:line_color_filter]
|
131
167
|
|
168
|
+
if !color_filter.nil? && !color_filter.call(strokeColorComps)
|
169
|
+
self.getLinePath.reset
|
170
|
+
return
|
171
|
+
end
|
172
|
+
|
173
|
+
# skip the first path operation save it as the starting position
|
132
174
|
first = path.shift
|
133
|
-
|
175
|
+
# last_move
|
176
|
+
start_pos = last_move = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])
|
177
|
+
|
178
|
+
end_pos = nil
|
134
179
|
|
135
180
|
path.each do |p|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
181
|
+
case p[0]
|
182
|
+
when java.awt.geom.PathIterator::SEG_LINETO
|
183
|
+
end_pos = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
|
184
|
+
line = (start_pos <=> end_pos) == -1 \
|
185
|
+
? java.awt.geom.Line2D::Float.new(start_pos, end_pos) \
|
186
|
+
: java.awt.geom.Line2D::Float.new(end_pos, start_pos)
|
187
|
+
|
188
|
+
if line.intersects(ccp_bounds)
|
189
|
+
# convert line to rectangle for clipping it to the current clippath
|
190
|
+
# sucks, but awt doesn't have methods for this
|
191
|
+
tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
|
192
|
+
@rulings << ::Tabula::Ruling.new(tmp.getY,
|
193
|
+
tmp.getX,
|
194
|
+
tmp.getWidth,
|
195
|
+
tmp.getHeight,
|
196
|
+
filter_by_color.to_a)
|
197
|
+
end
|
198
|
+
when java.awt.geom.PathIterator::SEG_MOVETO
|
199
|
+
last_move = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
|
200
|
+
when java.awt.geom.PathIterator::SEG_CLOSE
|
201
|
+
# according to PathIterator docs:
|
202
|
+
# "the preceding subpath should be closed by appending a line segment
|
203
|
+
# back to the point corresponding to the most recent SEG_MOVETO."
|
204
|
+
|
205
|
+
line = (end_pos <=> last_move) == -1 \
|
206
|
+
? java.awt.geom.Line2D::Float.new(end_pos, last_move) \
|
207
|
+
: java.awt.geom.Line2D::Float.new(last_move, end_pos)
|
208
|
+
|
209
|
+
if line.intersects(ccp_bounds)
|
210
|
+
# convert line to rectangle for clipping it to the current clippath
|
211
|
+
# sucks, but awt doesn't have methods for this
|
212
|
+
tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
|
213
|
+
@rulings << ::Tabula::Ruling.new(tmp.getY,
|
214
|
+
tmp.getX,
|
215
|
+
tmp.getWidth,
|
216
|
+
tmp.getHeight,
|
217
|
+
filter_by_color.to_a)
|
218
|
+
end
|
152
219
|
end
|
153
220
|
start_pos = end_pos
|
154
221
|
end
|
@@ -201,22 +268,21 @@ module Tabula
|
|
201
268
|
c = text.getCharacter
|
202
269
|
h = text.getHeightDir.round(2)
|
203
270
|
|
204
|
-
if c == '
|
271
|
+
if c == ' ' # replace non-breaking space for space
|
205
272
|
c = ' '
|
206
|
-
h = text.getWidth.round(2)
|
207
273
|
end
|
208
274
|
|
209
275
|
te = Tabula::TextElement.new(text.getY.round(2) - h,
|
210
276
|
text.getX.round(2),
|
211
|
-
text.
|
277
|
+
text.getWidthDirAdj,
|
212
278
|
# ugly hack follows: we need spaces to have a height, so we can
|
213
279
|
# test for vertical overlap. height == width seems a safe bet.
|
214
|
-
|
280
|
+
text.getHeightDir,
|
215
281
|
text.getFont,
|
216
|
-
text.getFontSize
|
282
|
+
text.getFontSize,
|
217
283
|
c,
|
218
284
|
# workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
|
219
|
-
text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
|
285
|
+
(text.getWidthOfSpace.nan? || text.getWidthOfSpace == 0) ? self.currentSpaceWidth : text.getWidthOfSpace,
|
220
286
|
text.getDir)
|
221
287
|
|
222
288
|
ccp_bounds = self.currentClippingPath
|
@@ -246,7 +312,6 @@ module Tabula
|
|
246
312
|
end
|
247
313
|
|
248
314
|
def rulings
|
249
|
-
return [] if @rulings.empty?
|
250
315
|
@rulings.reject { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } }
|
251
316
|
end
|
252
317
|
|
@@ -42,10 +42,12 @@ module Tabula
|
|
42
42
|
page = [page]
|
43
43
|
end
|
44
44
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
45
|
+
extractor = Extraction::ObjectExtractor.new(pdf_path,
|
46
|
+
page,
|
47
|
+
options[:password])
|
48
|
+
|
49
|
+
pdf_page = extractor.extract.next
|
50
|
+
extractor.close!
|
49
51
|
|
50
52
|
if ["spreadsheet", "original"].include? options[:extraction_method]
|
51
53
|
use_spreadsheet_extraction_method = options[:extraction_method] == "spreadsheet"
|
@@ -54,39 +56,32 @@ module Tabula
|
|
54
56
|
end
|
55
57
|
|
56
58
|
if use_spreadsheet_extraction_method
|
57
|
-
|
58
|
-
else
|
59
|
-
use_detected_lines = false
|
60
|
-
if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
|
61
|
-
detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
|
62
|
-
area)
|
63
|
-
|
64
|
-
# only use lines if at least 80% of them cover at least 90%
|
65
|
-
# of the height of area of interest
|
66
|
-
|
67
|
-
# TODO this heuristic SUCKS
|
68
|
-
# what if only a couple columns is delimited with vertical rulings?
|
69
|
-
# ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
|
70
|
-
# idea: detect columns without considering rulings, detect vertical rulings
|
71
|
-
# calculate ratio and try to come up with a threshold
|
72
|
-
use_detected_lines = detected_vertical_rulings.size > 2 \
|
73
|
-
&& (detected_vertical_rulings.count { |vl|
|
74
|
-
vl.height / area.height > 0.9
|
75
|
-
} / detected_vertical_rulings.size.to_f) >= 0.8
|
76
|
-
|
77
|
-
end
|
78
|
-
|
79
|
-
table = pdf_page.get_area(area).get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
|
80
|
-
|
81
|
-
# fixes up the table a little bit, replacing nils with empty TextElements
|
82
|
-
# and sorting the lines.
|
83
|
-
table.lines.each do |l|
|
84
|
-
l.text_elements = l.text_elements.map do |te|
|
85
|
-
te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
86
|
-
end
|
87
|
-
end
|
88
|
-
table.lines.sort_by! { |l| l.text_elements.map { |te| te.top or 0 }.max }
|
89
|
-
table
|
59
|
+
return (spreadsheets = pdf_page.get_area(area).spreadsheets).empty? ? Spreadsheet.empty(pdf_page) : spreadsheets.inject(&:+)
|
90
60
|
end
|
61
|
+
|
62
|
+
use_detected_lines = false
|
63
|
+
if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
|
64
|
+
detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
|
65
|
+
area)
|
66
|
+
|
67
|
+
# only use lines if at least 80% of them cover at least 90%
|
68
|
+
# of the height of area of interest
|
69
|
+
|
70
|
+
# TODO this heuristic SUCKS
|
71
|
+
# what if only a couple columns is delimited with vertical rulings?
|
72
|
+
# ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
|
73
|
+
# idea: detect columns without considering rulings, detect vertical rulings
|
74
|
+
# calculate ratio and try to come up with a threshold
|
75
|
+
use_detected_lines = detected_vertical_rulings.size > 2 \
|
76
|
+
&& (detected_vertical_rulings.count { |vl|
|
77
|
+
vl.height / area.height > 0.9
|
78
|
+
} / detected_vertical_rulings.size.to_f) >= 0.8
|
79
|
+
|
80
|
+
end
|
81
|
+
|
82
|
+
pdf_page
|
83
|
+
.get_area(area)
|
84
|
+
.get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
|
85
|
+
|
91
86
|
end
|
92
87
|
end
|
data/lib/tabula/version.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
@@ -15,17 +15,14 @@ Gem::Specification.new do |s|
|
|
15
15
|
|
16
16
|
s.platform = 'java'
|
17
17
|
|
18
|
-
|
19
|
-
s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
|
20
|
-
s.test_files = `git ls-files -- {test,features}/*`.split("\n")
|
18
|
+
s.files = `git ls-files`.split("\n").reject { |f| f =~ /^test\// }
|
21
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
22
20
|
s.require_paths = ["lib"]
|
23
21
|
|
24
|
-
s.add_development_dependency 'minitest'
|
25
22
|
s.add_development_dependency 'bundler', '>= 1.3.4'
|
26
23
|
s.add_development_dependency 'ruby-debug'
|
27
24
|
s.add_development_dependency 'pry'
|
25
|
+
s.add_development_dependency 'minitest'
|
28
26
|
|
29
27
|
s.add_runtime_dependency "trollop", ["~> 2.0"]
|
30
|
-
# s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
|
31
28
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.4
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Manuel Aristarán
|
@@ -10,38 +10,38 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-
|
13
|
+
date: 2014-05-09 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: bundler
|
17
17
|
version_requirements: !ruby/object:Gem::Requirement
|
18
18
|
requirements:
|
19
19
|
- - '>='
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 1.3.4
|
22
22
|
requirement: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 1.3.4
|
27
27
|
prerelease: false
|
28
28
|
type: :development
|
29
29
|
- !ruby/object:Gem::Dependency
|
30
|
-
name:
|
30
|
+
name: ruby-debug
|
31
31
|
version_requirements: !ruby/object:Gem::Requirement
|
32
32
|
requirements:
|
33
33
|
- - '>='
|
34
34
|
- !ruby/object:Gem::Version
|
35
|
-
version:
|
35
|
+
version: '0'
|
36
36
|
requirement: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '0'
|
41
41
|
prerelease: false
|
42
42
|
type: :development
|
43
43
|
- !ruby/object:Gem::Dependency
|
44
|
-
name:
|
44
|
+
name: pry
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
47
|
- - '>='
|
@@ -55,7 +55,7 @@ dependencies:
|
|
55
55
|
prerelease: false
|
56
56
|
type: :development
|
57
57
|
- !ruby/object:Gem::Dependency
|
58
|
-
name:
|
58
|
+
name: minitest
|
59
59
|
version_requirements: !ruby/object:Gem::Requirement
|
60
60
|
requirements:
|
61
61
|
- - '>='
|
@@ -99,21 +99,6 @@ files:
|
|
99
99
|
- README.md
|
100
100
|
- Rakefile
|
101
101
|
- bin/tabula
|
102
|
-
- ext/COPYING
|
103
|
-
- ext/Makefile.OSX
|
104
|
-
- ext/Makefile.defaults
|
105
|
-
- ext/Makefile.linux32
|
106
|
-
- ext/Makefile.linux64
|
107
|
-
- ext/Makefile.mingw
|
108
|
-
- ext/Makefile.mingw64
|
109
|
-
- ext/liblsd-linux32.so
|
110
|
-
- ext/liblsd-linux64.so
|
111
|
-
- ext/liblsd.def
|
112
|
-
- ext/liblsd.dll
|
113
|
-
- ext/liblsd.dylib
|
114
|
-
- ext/liblsd64.dll
|
115
|
-
- ext/lsd.c
|
116
|
-
- ext/lsd.h
|
117
102
|
- lib/tabula.rb
|
118
103
|
- lib/tabula/core_ext.rb
|
119
104
|
- lib/tabula/entities.rb
|
@@ -125,6 +110,7 @@ files:
|
|
125
110
|
- lib/tabula/entities/ruling.rb
|
126
111
|
- lib/tabula/entities/spreadsheet.rb
|
127
112
|
- lib/tabula/entities/table.rb
|
113
|
+
- lib/tabula/entities/tabular.rb
|
128
114
|
- lib/tabula/entities/text_chunk.rb
|
129
115
|
- lib/tabula/entities/text_element.rb
|
130
116
|
- lib/tabula/entities/text_element_index.rb
|
@@ -143,40 +129,6 @@ files:
|
|
143
129
|
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
144
130
|
- target/slf4j-api-1.6.3.jar
|
145
131
|
- target/trove4j-3.0.3.jar
|
146
|
-
- test/data/47008204D_USA.page4.pdf
|
147
|
-
- test/data/560015757GV_China.page1.pdf
|
148
|
-
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
149
|
-
- test/data/GSK_2012_Q4.page437.pdf
|
150
|
-
- test/data/S2MNCEbirdisland.pdf
|
151
|
-
- test/data/argentina_diputados_voting_record.pdf
|
152
|
-
- test/data/bo_page24.pdf
|
153
|
-
- test/data/campaign_donors.pdf
|
154
|
-
- test/data/frx_2012_disclosure.pdf
|
155
|
-
- test/data/frx_2012_disclosure.tsv
|
156
|
-
- test/data/gre.pdf
|
157
|
-
- test/data/no_tables.pdf
|
158
|
-
- test/data/nyc_2013fiscalreporttables.pdf
|
159
|
-
- test/data/puertos1.pdf
|
160
|
-
- test/data/spanning_cells.csv
|
161
|
-
- test/data/spanning_cells.pdf
|
162
|
-
- test/data/strongschools.pdf
|
163
|
-
- test/data/sydney_disclosure_contract.pdf
|
164
|
-
- test/data/tabla_subsidios.pdf
|
165
|
-
- test/data/vertical_rulings_bug.pdf
|
166
|
-
- test/data/vietnam3.pdf
|
167
|
-
- test/data/wc2012.pdf
|
168
|
-
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
169
|
-
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
170
|
-
- test/heuristic-test-set/original/bo_page24.pdf
|
171
|
-
- test/heuristic-test-set/original/campaign_donors.pdf
|
172
|
-
- test/heuristic-test-set/original/cs076pct.pdf
|
173
|
-
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
174
|
-
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
175
|
-
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|
176
|
-
- test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
|
177
|
-
- test/heuristic.rb
|
178
|
-
- test/test_bin_tabula.sh
|
179
|
-
- test/tests.rb
|
180
132
|
homepage: https://github.com/jazzido/tabula-extractor
|
181
133
|
licenses:
|
182
134
|
- MIT
|
@@ -197,42 +149,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
197
149
|
version: '0'
|
198
150
|
requirements: []
|
199
151
|
rubyforge_project:
|
200
|
-
rubygems_version: 2.
|
152
|
+
rubygems_version: 2.2.2
|
201
153
|
signing_key:
|
202
154
|
specification_version: 4
|
203
155
|
summary: extract tables from PDF files
|
204
|
-
test_files:
|
205
|
-
- test/data/47008204D_USA.page4.pdf
|
206
|
-
- test/data/560015757GV_China.page1.pdf
|
207
|
-
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
208
|
-
- test/data/GSK_2012_Q4.page437.pdf
|
209
|
-
- test/data/S2MNCEbirdisland.pdf
|
210
|
-
- test/data/argentina_diputados_voting_record.pdf
|
211
|
-
- test/data/bo_page24.pdf
|
212
|
-
- test/data/campaign_donors.pdf
|
213
|
-
- test/data/frx_2012_disclosure.pdf
|
214
|
-
- test/data/frx_2012_disclosure.tsv
|
215
|
-
- test/data/gre.pdf
|
216
|
-
- test/data/no_tables.pdf
|
217
|
-
- test/data/nyc_2013fiscalreporttables.pdf
|
218
|
-
- test/data/puertos1.pdf
|
219
|
-
- test/data/spanning_cells.csv
|
220
|
-
- test/data/spanning_cells.pdf
|
221
|
-
- test/data/strongschools.pdf
|
222
|
-
- test/data/sydney_disclosure_contract.pdf
|
223
|
-
- test/data/tabla_subsidios.pdf
|
224
|
-
- test/data/vertical_rulings_bug.pdf
|
225
|
-
- test/data/vietnam3.pdf
|
226
|
-
- test/data/wc2012.pdf
|
227
|
-
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
228
|
-
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
229
|
-
- test/heuristic-test-set/original/bo_page24.pdf
|
230
|
-
- test/heuristic-test-set/original/campaign_donors.pdf
|
231
|
-
- test/heuristic-test-set/original/cs076pct.pdf
|
232
|
-
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
233
|
-
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
234
|
-
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|
235
|
-
- test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
|
236
|
-
- test/heuristic.rb
|
237
|
-
- test/test_bin_tabula.sh
|
238
|
-
- test/tests.rb
|
156
|
+
test_files: []
|