tabula-extractor 0.7.2-java → 0.7.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +4 -8
- data/bin/tabula +3 -3
- data/lib/tabula.rb +9 -5
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/cell.rb +6 -4
- data/lib/tabula/entities/has_cells.rb +22 -78
- data/lib/tabula/entities/line.rb +52 -6
- data/lib/tabula/entities/page.rb +43 -50
- data/lib/tabula/entities/ruling.rb +83 -105
- data/lib/tabula/entities/spreadsheet.rb +74 -11
- data/lib/tabula/entities/table.rb +55 -37
- data/lib/tabula/entities/tabular.rb +42 -0
- data/lib/tabula/entities/text_chunk.rb +55 -52
- data/lib/tabula/entities/text_element.rb +129 -62
- data/lib/tabula/entities/zone_entity.rb +15 -6
- data/lib/tabula/extraction.rb +114 -49
- data/lib/tabula/line_segment_detector.rb +0 -5
- data/lib/tabula/table_extractor.rb +32 -37
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -5
- metadata +13 -95
- data/ext/COPYING +0 -661
- data/ext/Makefile.OSX +0 -18
- data/ext/Makefile.defaults +0 -9
- data/ext/Makefile.linux32 +0 -11
- data/ext/Makefile.linux64 +0 -12
- data/ext/Makefile.mingw +0 -10
- data/ext/Makefile.mingw64 +0 -10
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +0 -3
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +0 -2270
- data/ext/lsd.h +0 -283
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/argentina_diputados_voting_record.pdf +0 -0
- data/test/data/bo_page24.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +0 -88
- data/test/data/gre.pdf +0 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +0 -21
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/tabla_subsidios.pdf +0 -0
- data/test/data/vertical_rulings_bug.pdf +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +0 -50
- data/test/test_bin_tabula.sh +0 -7
- data/test/tests.rb +0 -603
data/lib/tabula/extraction.rb
CHANGED
@@ -31,6 +31,8 @@ module Tabula
|
|
31
31
|
:extract_ruling_lines => true
|
32
32
|
}
|
33
33
|
|
34
|
+
# TODO: the +pages+ constructor argument does not make sense
|
35
|
+
# now that we have +extract_page+ and +extract_pages+
|
34
36
|
def initialize(pdf_filename, pages=[1], password='', options={})
|
35
37
|
raise Errno::ENOENT unless File.exists?(pdf_filename)
|
36
38
|
@pdf_filename = pdf_filename
|
@@ -47,39 +49,67 @@ module Tabula
|
|
47
49
|
@transformed_clipping_path = nil
|
48
50
|
self.clipping_paths = []
|
49
51
|
@rulings = []
|
50
|
-
@min_char_width = @min_char_height =
|
52
|
+
@min_char_width = @min_char_height = Float::MAX
|
51
53
|
end
|
52
54
|
|
53
|
-
def
|
55
|
+
def close!
|
56
|
+
self.ensure_open!
|
57
|
+
@pdf_file.close
|
58
|
+
@pdf_file_closed = true
|
59
|
+
end
|
60
|
+
|
61
|
+
def ensure_open!
|
62
|
+
raise "Document is closed" if @pdf_file_closed
|
63
|
+
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# extract objects from a page. Returns an instance of +Tabula::Page+
|
67
|
+
# (+page_number+ is 1-based. i.e., first page is number 1)
|
68
|
+
def extract_page(page_number)
|
69
|
+
self.ensure_open!
|
70
|
+
|
71
|
+
if page_number-1 >= @all_pages.size || (page_number) < 0
|
72
|
+
raise IndexError, "Page #{page_number} doesn't exist. Skipping. Valid pages are 1..#{@all_pages.size}"
|
73
|
+
end
|
74
|
+
|
75
|
+
page = @all_pages.get(page_number-1)
|
76
|
+
contents = page.getContents
|
77
|
+
return nil if contents.nil?
|
78
|
+
|
79
|
+
self.clear!
|
80
|
+
self.drawPage(page)
|
81
|
+
Tabula::Page.new(@pdf_filename,
|
82
|
+
page.findCropBox.width,
|
83
|
+
page.findCropBox.height,
|
84
|
+
page.getRotation.to_i,
|
85
|
+
page_number, #one-indexed, just like +page_number+ is.
|
86
|
+
self.characters,
|
87
|
+
self.rulings,
|
88
|
+
@min_char_width,
|
89
|
+
@min_char_height)
|
90
|
+
end
|
91
|
+
|
92
|
+
def extract(pages=nil)
|
93
|
+
self.ensure_open!
|
94
|
+
pages = if pages == :all
|
95
|
+
(1..@all_pages.size)
|
96
|
+
elsif pages.nil?
|
97
|
+
@pages
|
98
|
+
else
|
99
|
+
pages
|
100
|
+
end
|
101
|
+
|
54
102
|
Enumerator.new do |y|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
contents = page.getContents
|
59
|
-
next if contents.nil?
|
60
|
-
|
61
|
-
self.clear!
|
62
|
-
self.drawPage(page)
|
63
|
-
p = Tabula::Page.new(@pdf_filename,
|
64
|
-
page.findCropBox.width,
|
65
|
-
page.findCropBox.height,
|
66
|
-
page.getRotation.to_i,
|
67
|
-
i, #one-indexed, just like `i` is.
|
68
|
-
self.characters,
|
69
|
-
self.rulings,
|
70
|
-
@min_char_width,
|
71
|
-
@min_char_height)
|
72
|
-
y.yield p
|
73
|
-
end
|
74
|
-
ensure
|
75
|
-
@pdf_file.close
|
76
|
-
end # begin
|
103
|
+
pages.each do |i|
|
104
|
+
y.yield self.extract_page(i)
|
105
|
+
end
|
77
106
|
end
|
78
107
|
end
|
79
108
|
|
80
109
|
def clear!
|
81
110
|
self.characters.clear
|
82
111
|
self.clipping_paths.clear
|
112
|
+
@min_char_width = @min_char_height = Float::MAX
|
83
113
|
@page_transform = nil
|
84
114
|
@rulings.clear
|
85
115
|
end
|
@@ -118,8 +148,14 @@ module Tabula
|
|
118
148
|
|
119
149
|
path = self.pathToList(self.getLinePath)
|
120
150
|
|
151
|
+
# skip paths whose first operation is not a MOVETO
|
152
|
+
# or contains operations other than LINETO, MOVETO or CLOSE
|
121
153
|
if path[0][0] != java.awt.geom.PathIterator::SEG_MOVETO \
|
122
|
-
|
154
|
+
|| path[1..-1].any? { |p|
|
155
|
+
p.first != java.awt.geom.PathIterator::SEG_LINETO \
|
156
|
+
&& p.first != java.awt.geom.PathIterator::SEG_MOVETO \
|
157
|
+
&& p.first != java.awt.geom.PathIterator::SEG_CLOSE
|
158
|
+
}
|
123
159
|
self.getLinePath.reset
|
124
160
|
return
|
125
161
|
end
|
@@ -129,26 +165,57 @@ module Tabula
|
|
129
165
|
strokeColorComps = filter_by_color || self.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
|
130
166
|
color_filter = self.options[:line_color_filter]
|
131
167
|
|
168
|
+
if !color_filter.nil? && !color_filter.call(strokeColorComps)
|
169
|
+
self.getLinePath.reset
|
170
|
+
return
|
171
|
+
end
|
172
|
+
|
173
|
+
# skip the first path operation and save it as the starting position
|
132
174
|
first = path.shift
|
133
|
-
|
175
|
+
# last_move
|
176
|
+
start_pos = last_move = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])
|
177
|
+
|
178
|
+
end_pos = nil
|
134
179
|
|
135
180
|
path.each do |p|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
181
|
+
case p[0]
|
182
|
+
when java.awt.geom.PathIterator::SEG_LINETO
|
183
|
+
end_pos = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
|
184
|
+
line = (start_pos <=> end_pos) == -1 \
|
185
|
+
? java.awt.geom.Line2D::Float.new(start_pos, end_pos) \
|
186
|
+
: java.awt.geom.Line2D::Float.new(end_pos, start_pos)
|
187
|
+
|
188
|
+
if line.intersects(ccp_bounds)
|
189
|
+
# convert line to rectangle for clipping it to the current clippath
|
190
|
+
# sucks, but awt doesn't have methods for this
|
191
|
+
tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
|
192
|
+
@rulings << ::Tabula::Ruling.new(tmp.getY,
|
193
|
+
tmp.getX,
|
194
|
+
tmp.getWidth,
|
195
|
+
tmp.getHeight,
|
196
|
+
filter_by_color.to_a)
|
197
|
+
end
|
198
|
+
when java.awt.geom.PathIterator::SEG_MOVETO
|
199
|
+
last_move = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
|
200
|
+
when java.awt.geom.PathIterator::SEG_CLOSE
|
201
|
+
# according to PathIterator docs:
|
202
|
+
# "the preceding subpath should be closed by appending a line segment
|
203
|
+
# back to the point corresponding to the most recent SEG_MOVETO."
|
204
|
+
|
205
|
+
line = (end_pos <=> last_move) == -1 \
|
206
|
+
? java.awt.geom.Line2D::Float.new(end_pos, last_move) \
|
207
|
+
: java.awt.geom.Line2D::Float.new(last_move, end_pos)
|
208
|
+
|
209
|
+
if line.intersects(ccp_bounds)
|
210
|
+
# convert line to rectangle for clipping it to the current clippath
|
211
|
+
# sucks, but awt doesn't have methods for this
|
212
|
+
tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
|
213
|
+
@rulings << ::Tabula::Ruling.new(tmp.getY,
|
214
|
+
tmp.getX,
|
215
|
+
tmp.getWidth,
|
216
|
+
tmp.getHeight,
|
217
|
+
filter_by_color.to_a)
|
218
|
+
end
|
152
219
|
end
|
153
220
|
start_pos = end_pos
|
154
221
|
end
|
@@ -201,22 +268,21 @@ module Tabula
|
|
201
268
|
c = text.getCharacter
|
202
269
|
h = text.getHeightDir.round(2)
|
203
270
|
|
204
|
-
if c == '
|
271
|
+
if c == ' ' # replace non-breaking space for space
|
205
272
|
c = ' '
|
206
|
-
h = text.getWidth.round(2)
|
207
273
|
end
|
208
274
|
|
209
275
|
te = Tabula::TextElement.new(text.getY.round(2) - h,
|
210
276
|
text.getX.round(2),
|
211
|
-
text.
|
277
|
+
text.getWidthDirAdj,
|
212
278
|
# ugly hack follows: we need spaces to have a height, so we can
|
213
279
|
# test for vertical overlap. height == width seems a safe bet.
|
214
|
-
|
280
|
+
text.getHeightDir,
|
215
281
|
text.getFont,
|
216
|
-
text.getFontSize
|
282
|
+
text.getFontSize,
|
217
283
|
c,
|
218
284
|
# workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
|
219
|
-
text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
|
285
|
+
(text.getWidthOfSpace.nan? || text.getWidthOfSpace == 0) ? self.currentSpaceWidth : text.getWidthOfSpace,
|
220
286
|
text.getDir)
|
221
287
|
|
222
288
|
ccp_bounds = self.currentClippingPath
|
@@ -246,7 +312,6 @@ module Tabula
|
|
246
312
|
end
|
247
313
|
|
248
314
|
def rulings
|
249
|
-
return [] if @rulings.empty?
|
250
315
|
@rulings.reject { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } }
|
251
316
|
end
|
252
317
|
|
@@ -42,10 +42,12 @@ module Tabula
|
|
42
42
|
page = [page]
|
43
43
|
end
|
44
44
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
45
|
+
extractor = Extraction::ObjectExtractor.new(pdf_path,
|
46
|
+
page,
|
47
|
+
options[:password])
|
48
|
+
|
49
|
+
pdf_page = extractor.extract.next
|
50
|
+
extractor.close!
|
49
51
|
|
50
52
|
if ["spreadsheet", "original"].include? options[:extraction_method]
|
51
53
|
use_spreadsheet_extraction_method = options[:extraction_method] == "spreadsheet"
|
@@ -54,39 +56,32 @@ module Tabula
|
|
54
56
|
end
|
55
57
|
|
56
58
|
if use_spreadsheet_extraction_method
|
57
|
-
|
58
|
-
else
|
59
|
-
use_detected_lines = false
|
60
|
-
if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
|
61
|
-
detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
|
62
|
-
area)
|
63
|
-
|
64
|
-
# only use lines if at least 80% of them cover at least 90%
|
65
|
-
# of the height of area of interest
|
66
|
-
|
67
|
-
# TODO this heuristic SUCKS
|
68
|
-
# what if only a couple columns is delimited with vertical rulings?
|
69
|
-
# ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
|
70
|
-
# idea: detect columns without considering rulings, detect vertical rulings
|
71
|
-
# calculate ratio and try to come up with a threshold
|
72
|
-
use_detected_lines = detected_vertical_rulings.size > 2 \
|
73
|
-
&& (detected_vertical_rulings.count { |vl|
|
74
|
-
vl.height / area.height > 0.9
|
75
|
-
} / detected_vertical_rulings.size.to_f) >= 0.8
|
76
|
-
|
77
|
-
end
|
78
|
-
|
79
|
-
table = pdf_page.get_area(area).get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
|
80
|
-
|
81
|
-
# fixes up the table a little bit, replacing nils with empty TextElements
|
82
|
-
# and sorting the lines.
|
83
|
-
table.lines.each do |l|
|
84
|
-
l.text_elements = l.text_elements.map do |te|
|
85
|
-
te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
86
|
-
end
|
87
|
-
end
|
88
|
-
table.lines.sort_by! { |l| l.text_elements.map { |te| te.top or 0 }.max }
|
89
|
-
table
|
59
|
+
return (spreadsheets = pdf_page.get_area(area).spreadsheets).empty? ? Spreadsheet.empty(pdf_page) : spreadsheets.inject(&:+)
|
90
60
|
end
|
61
|
+
|
62
|
+
use_detected_lines = false
|
63
|
+
if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
|
64
|
+
detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
|
65
|
+
area)
|
66
|
+
|
67
|
+
# only use lines if at least 80% of them cover at least 90%
|
68
|
+
# of the height of area of interest
|
69
|
+
|
70
|
+
# TODO this heuristic SUCKS
|
71
|
+
# what if only a couple columns is delimited with vertical rulings?
|
72
|
+
# ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
|
73
|
+
# idea: detect columns without considering rulings, detect vertical rulings
|
74
|
+
# calculate ratio and try to come up with a threshold
|
75
|
+
use_detected_lines = detected_vertical_rulings.size > 2 \
|
76
|
+
&& (detected_vertical_rulings.count { |vl|
|
77
|
+
vl.height / area.height > 0.9
|
78
|
+
} / detected_vertical_rulings.size.to_f) >= 0.8
|
79
|
+
|
80
|
+
end
|
81
|
+
|
82
|
+
pdf_page
|
83
|
+
.get_area(area)
|
84
|
+
.get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
|
85
|
+
|
91
86
|
end
|
92
87
|
end
|
data/lib/tabula/version.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
@@ -15,17 +15,14 @@ Gem::Specification.new do |s|
|
|
15
15
|
|
16
16
|
s.platform = 'java'
|
17
17
|
|
18
|
-
|
19
|
-
s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
|
20
|
-
s.test_files = `git ls-files -- {test,features}/*`.split("\n")
|
18
|
+
s.files = `git ls-files`.split("\n").reject { |f| f =~ /^test\// }
|
21
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
22
20
|
s.require_paths = ["lib"]
|
23
21
|
|
24
|
-
s.add_development_dependency 'minitest'
|
25
22
|
s.add_development_dependency 'bundler', '>= 1.3.4'
|
26
23
|
s.add_development_dependency 'ruby-debug'
|
27
24
|
s.add_development_dependency 'pry'
|
25
|
+
s.add_development_dependency 'minitest'
|
28
26
|
|
29
27
|
s.add_runtime_dependency "trollop", ["~> 2.0"]
|
30
|
-
# s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
|
31
28
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.4
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Manuel Aristarán
|
@@ -10,38 +10,38 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-
|
13
|
+
date: 2014-05-09 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: bundler
|
17
17
|
version_requirements: !ruby/object:Gem::Requirement
|
18
18
|
requirements:
|
19
19
|
- - '>='
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 1.3.4
|
22
22
|
requirement: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 1.3.4
|
27
27
|
prerelease: false
|
28
28
|
type: :development
|
29
29
|
- !ruby/object:Gem::Dependency
|
30
|
-
name:
|
30
|
+
name: ruby-debug
|
31
31
|
version_requirements: !ruby/object:Gem::Requirement
|
32
32
|
requirements:
|
33
33
|
- - '>='
|
34
34
|
- !ruby/object:Gem::Version
|
35
|
-
version:
|
35
|
+
version: '0'
|
36
36
|
requirement: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '0'
|
41
41
|
prerelease: false
|
42
42
|
type: :development
|
43
43
|
- !ruby/object:Gem::Dependency
|
44
|
-
name:
|
44
|
+
name: pry
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
47
|
- - '>='
|
@@ -55,7 +55,7 @@ dependencies:
|
|
55
55
|
prerelease: false
|
56
56
|
type: :development
|
57
57
|
- !ruby/object:Gem::Dependency
|
58
|
-
name:
|
58
|
+
name: minitest
|
59
59
|
version_requirements: !ruby/object:Gem::Requirement
|
60
60
|
requirements:
|
61
61
|
- - '>='
|
@@ -99,21 +99,6 @@ files:
|
|
99
99
|
- README.md
|
100
100
|
- Rakefile
|
101
101
|
- bin/tabula
|
102
|
-
- ext/COPYING
|
103
|
-
- ext/Makefile.OSX
|
104
|
-
- ext/Makefile.defaults
|
105
|
-
- ext/Makefile.linux32
|
106
|
-
- ext/Makefile.linux64
|
107
|
-
- ext/Makefile.mingw
|
108
|
-
- ext/Makefile.mingw64
|
109
|
-
- ext/liblsd-linux32.so
|
110
|
-
- ext/liblsd-linux64.so
|
111
|
-
- ext/liblsd.def
|
112
|
-
- ext/liblsd.dll
|
113
|
-
- ext/liblsd.dylib
|
114
|
-
- ext/liblsd64.dll
|
115
|
-
- ext/lsd.c
|
116
|
-
- ext/lsd.h
|
117
102
|
- lib/tabula.rb
|
118
103
|
- lib/tabula/core_ext.rb
|
119
104
|
- lib/tabula/entities.rb
|
@@ -125,6 +110,7 @@ files:
|
|
125
110
|
- lib/tabula/entities/ruling.rb
|
126
111
|
- lib/tabula/entities/spreadsheet.rb
|
127
112
|
- lib/tabula/entities/table.rb
|
113
|
+
- lib/tabula/entities/tabular.rb
|
128
114
|
- lib/tabula/entities/text_chunk.rb
|
129
115
|
- lib/tabula/entities/text_element.rb
|
130
116
|
- lib/tabula/entities/text_element_index.rb
|
@@ -143,40 +129,6 @@ files:
|
|
143
129
|
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
144
130
|
- target/slf4j-api-1.6.3.jar
|
145
131
|
- target/trove4j-3.0.3.jar
|
146
|
-
- test/data/47008204D_USA.page4.pdf
|
147
|
-
- test/data/560015757GV_China.page1.pdf
|
148
|
-
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
149
|
-
- test/data/GSK_2012_Q4.page437.pdf
|
150
|
-
- test/data/S2MNCEbirdisland.pdf
|
151
|
-
- test/data/argentina_diputados_voting_record.pdf
|
152
|
-
- test/data/bo_page24.pdf
|
153
|
-
- test/data/campaign_donors.pdf
|
154
|
-
- test/data/frx_2012_disclosure.pdf
|
155
|
-
- test/data/frx_2012_disclosure.tsv
|
156
|
-
- test/data/gre.pdf
|
157
|
-
- test/data/no_tables.pdf
|
158
|
-
- test/data/nyc_2013fiscalreporttables.pdf
|
159
|
-
- test/data/puertos1.pdf
|
160
|
-
- test/data/spanning_cells.csv
|
161
|
-
- test/data/spanning_cells.pdf
|
162
|
-
- test/data/strongschools.pdf
|
163
|
-
- test/data/sydney_disclosure_contract.pdf
|
164
|
-
- test/data/tabla_subsidios.pdf
|
165
|
-
- test/data/vertical_rulings_bug.pdf
|
166
|
-
- test/data/vietnam3.pdf
|
167
|
-
- test/data/wc2012.pdf
|
168
|
-
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
169
|
-
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
170
|
-
- test/heuristic-test-set/original/bo_page24.pdf
|
171
|
-
- test/heuristic-test-set/original/campaign_donors.pdf
|
172
|
-
- test/heuristic-test-set/original/cs076pct.pdf
|
173
|
-
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
174
|
-
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
175
|
-
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|
176
|
-
- test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
|
177
|
-
- test/heuristic.rb
|
178
|
-
- test/test_bin_tabula.sh
|
179
|
-
- test/tests.rb
|
180
132
|
homepage: https://github.com/jazzido/tabula-extractor
|
181
133
|
licenses:
|
182
134
|
- MIT
|
@@ -197,42 +149,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
197
149
|
version: '0'
|
198
150
|
requirements: []
|
199
151
|
rubyforge_project:
|
200
|
-
rubygems_version: 2.
|
152
|
+
rubygems_version: 2.2.2
|
201
153
|
signing_key:
|
202
154
|
specification_version: 4
|
203
155
|
summary: extract tables from PDF files
|
204
|
-
test_files:
|
205
|
-
- test/data/47008204D_USA.page4.pdf
|
206
|
-
- test/data/560015757GV_China.page1.pdf
|
207
|
-
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
208
|
-
- test/data/GSK_2012_Q4.page437.pdf
|
209
|
-
- test/data/S2MNCEbirdisland.pdf
|
210
|
-
- test/data/argentina_diputados_voting_record.pdf
|
211
|
-
- test/data/bo_page24.pdf
|
212
|
-
- test/data/campaign_donors.pdf
|
213
|
-
- test/data/frx_2012_disclosure.pdf
|
214
|
-
- test/data/frx_2012_disclosure.tsv
|
215
|
-
- test/data/gre.pdf
|
216
|
-
- test/data/no_tables.pdf
|
217
|
-
- test/data/nyc_2013fiscalreporttables.pdf
|
218
|
-
- test/data/puertos1.pdf
|
219
|
-
- test/data/spanning_cells.csv
|
220
|
-
- test/data/spanning_cells.pdf
|
221
|
-
- test/data/strongschools.pdf
|
222
|
-
- test/data/sydney_disclosure_contract.pdf
|
223
|
-
- test/data/tabla_subsidios.pdf
|
224
|
-
- test/data/vertical_rulings_bug.pdf
|
225
|
-
- test/data/vietnam3.pdf
|
226
|
-
- test/data/wc2012.pdf
|
227
|
-
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
228
|
-
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
229
|
-
- test/heuristic-test-set/original/bo_page24.pdf
|
230
|
-
- test/heuristic-test-set/original/campaign_donors.pdf
|
231
|
-
- test/heuristic-test-set/original/cs076pct.pdf
|
232
|
-
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
233
|
-
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
234
|
-
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|
235
|
-
- test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
|
236
|
-
- test/heuristic.rb
|
237
|
-
- test/test_bin_tabula.sh
|
238
|
-
- test/tests.rb
|
156
|
+
test_files: []
|