tabula-extractor 0.7.0-java → 0.7.1-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/tabula +5 -1
- data/lib/tabula/entities/cell.rb +11 -0
- data/lib/tabula/entities/has_cells.rb +0 -1
- data/lib/tabula/entities/page.rb +7 -8
- data/lib/tabula/entities/spreadsheet.rb +18 -0
- data/lib/tabula/entities/table.rb +20 -5
- data/lib/tabula/extraction.rb +7 -1
- data/lib/tabula/table_extractor.rb +39 -19
- data/lib/tabula/version.rb +1 -1
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/tests.rb +17 -17
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fea5751c9b78705fbfda50be1e29011d40249b04
|
4
|
+
data.tar.gz: 2d7bb156a073636467b34e44b3f8b6a364920354
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8d1f1932397b599b3d724bb4e8ca5fd73f516e198a391625031ab83f8e5f0a2743d51dcc02953ee5a53c0664330c8e8e1f5309f72493e9f2e232068b06085a1
|
7
|
+
data.tar.gz: e292d6320db6a42799418b6651f63a88446f85c6787659597e77b9f7c38e698b69a10c78edbdeea49baedc0d92f77602b847aef90ab1f8f2ab10356493dbb397
|
data/bin/tabula
CHANGED
@@ -6,6 +6,10 @@ require_relative '../lib/tabula'
|
|
6
6
|
FORMATS = ['CSV', 'TSV', 'HTML', 'JSON']
|
7
7
|
|
8
8
|
def parse_pages_arg(pages_arg)
|
9
|
+
if(pages_arg == 'all')
|
10
|
+
return :all
|
11
|
+
end
|
12
|
+
|
9
13
|
ranges = pages_arg.split(',').map(&:strip)
|
10
14
|
pages = []
|
11
15
|
ranges.each do |range|
|
@@ -32,7 +36,7 @@ Usage:
|
|
32
36
|
where [options] are:
|
33
37
|
EOS
|
34
38
|
|
35
|
-
opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages
|
39
|
+
opt :pages, 'Comma separated list of ranges, or all. Examples: --pages 1-3,5-7, --pages 3 or --pages all. Default is --pages 1', :default => '1', :type => String
|
36
40
|
opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
|
37
41
|
opt :columns, 'X coordinates of column boundaries. Example --columns 10.1,20.2,30.3', :default => nil, :type => String
|
38
42
|
opt :password, 'Password to decrypt document. Default is empty', :default => ''
|
data/lib/tabula/entities/cell.rb
CHANGED
data/lib/tabula/entities/page.rb
CHANGED
@@ -83,11 +83,7 @@ module Tabula
|
|
83
83
|
|
84
84
|
#for API backwards-compatibility reasons, this returns an array of arrays.
|
85
85
|
def make_table(options={})
|
86
|
-
get_table(options).
|
87
|
-
l.text_elements.map! do |te|
|
88
|
-
te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
89
|
-
end
|
90
|
-
end.sort_by { |l| l.map { |te| te.top or 0 }.max }
|
86
|
+
get_table(options).rows
|
91
87
|
end
|
92
88
|
|
93
89
|
# returns the Spreadsheets; creating them if they're not memoized
|
@@ -241,13 +237,17 @@ module Tabula
|
|
241
237
|
end
|
242
238
|
|
243
239
|
lines_to_points.each do |l, p1_p2|
|
244
|
-
l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0],
|
245
|
-
p1_p2[1]
|
240
|
+
l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
|
246
241
|
end
|
247
242
|
end
|
248
243
|
|
249
244
|
def collapse_oriented_rulings(lines)
|
250
245
|
# lines must all be of one orientation (i.e. horizontal, vertical)
|
246
|
+
|
247
|
+
if lines.empty?
|
248
|
+
return []
|
249
|
+
end
|
250
|
+
|
251
251
|
lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
|
252
252
|
|
253
253
|
lines = lines.inject([lines.shift]) do |memo, next_line|
|
@@ -262,7 +262,6 @@ module Tabula
|
|
262
262
|
memo << next_line
|
263
263
|
end
|
264
264
|
end
|
265
|
-
lines
|
266
265
|
end
|
267
266
|
end
|
268
267
|
|
@@ -1,9 +1,13 @@
|
|
1
1
|
module Tabula
|
2
2
|
# a counterpart of Table, to be sure.
|
3
3
|
# not sure yet what their relationship ought to be.
|
4
|
+
|
5
|
+
# both should implement `cells`, `rows`, `cols`, `extraction_method`
|
6
|
+
|
4
7
|
class Spreadsheet < ZoneEntity
|
5
8
|
include Tabula::HasCells
|
6
9
|
attr_accessor :cells, :vertical_ruling_lines, :horizontal_ruling_lines, :cells_resolved
|
10
|
+
attr_reader :extraction_method, :page
|
7
11
|
|
8
12
|
def initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) #, lines)
|
9
13
|
super(top, left, width, height)
|
@@ -11,6 +15,7 @@ module Tabula
|
|
11
15
|
@page = page
|
12
16
|
@vertical_ruling_lines = vertical_ruling_lines
|
13
17
|
@horizontal_ruling_lines = horizontal_ruling_lines
|
18
|
+
@extraction_method = "spreadsheet"
|
14
19
|
end
|
15
20
|
|
16
21
|
def ruling_lines
|
@@ -88,5 +93,18 @@ module Tabula
|
|
88
93
|
Tabula::Writers.TSV(rows, out)
|
89
94
|
out.string
|
90
95
|
end
|
96
|
+
|
97
|
+
def to_json(*a)
|
98
|
+
{
|
99
|
+
'json_class' => self.class.name,
|
100
|
+
'extraction_method' => @extraction_method,
|
101
|
+
'data' => rows,
|
102
|
+
}.to_json(*a)
|
103
|
+
end
|
104
|
+
|
105
|
+
def +(other)
|
106
|
+
raise ArgumentError unless other.page == @page
|
107
|
+
Spreadsheet.new(nil, nil, nil, nil, @page, @cells + other.cells, nil, nil )
|
108
|
+
end
|
91
109
|
end
|
92
110
|
end
|
@@ -1,9 +1,11 @@
|
|
1
1
|
module Tabula
|
2
2
|
class Table
|
3
|
-
attr_reader :
|
3
|
+
attr_reader :extraction_method
|
4
|
+
attr_accessor :lines
|
4
5
|
def initialize(line_count, separators)
|
5
6
|
@separators = separators
|
6
7
|
@lines = (0...line_count).inject([]) { |m| m << Line.new }
|
8
|
+
@extraction_method = "original"
|
7
9
|
end
|
8
10
|
|
9
11
|
def add_text_element(text_element, i, j)
|
@@ -28,22 +30,27 @@ module Tabula
|
|
28
30
|
end
|
29
31
|
|
30
32
|
def cols
|
31
|
-
|
32
|
-
lines.map(&:text_elements).transpose
|
33
|
+
rows.transpose
|
33
34
|
end
|
34
35
|
|
35
36
|
def rows
|
36
37
|
self.rpad!
|
37
|
-
lines.map
|
38
|
+
lines.map do |l|
|
39
|
+
l.text_elements.map! do |te|
|
40
|
+
te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
41
|
+
end
|
42
|
+
end.sort_by { |l| l.map { |te| te.top || 0 }.max }
|
38
43
|
end
|
39
44
|
|
40
45
|
# create a new Table object from an array of arrays, representing a list of rows in a spreadsheet
|
41
46
|
# probably only used for testing
|
42
47
|
def self.new_from_array(array_of_rows)
|
43
48
|
t = Table.new(array_of_rows.size, [])
|
49
|
+
@extraction_method = "testing"
|
44
50
|
array_of_rows.each_with_index do |row, index|
|
45
|
-
t.lines[index].text_elements = row.map{|cell| TextElement.new(
|
51
|
+
t.lines[index].text_elements = row.each_with_index.map{|cell, inner_index| TextElement.new(index, inner_index, 1, 1, nil, nil, cell, nil)}
|
46
52
|
end
|
53
|
+
t.rpad!
|
47
54
|
t
|
48
55
|
end
|
49
56
|
|
@@ -77,5 +84,13 @@ module Tabula
|
|
77
84
|
self.lines.zip(other.lines).all? { |my, yours| my == yours }
|
78
85
|
|
79
86
|
end
|
87
|
+
|
88
|
+
def to_json(*a)
|
89
|
+
{
|
90
|
+
'json_class' => self.class.name,
|
91
|
+
'extraction_method' => @extraction_method,
|
92
|
+
'data' => rows,
|
93
|
+
}.to_json(*a)
|
94
|
+
end
|
80
95
|
end
|
81
96
|
end
|
data/lib/tabula/extraction.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
java_import org.apache.pdfbox.pdfparser.PDFParser
|
2
3
|
java_import org.apache.pdfbox.util.TextPosition
|
3
4
|
java_import org.apache.pdfbox.pdmodel.PDDocument
|
@@ -198,7 +199,12 @@ module Tabula
|
|
198
199
|
|
199
200
|
def processTextPosition(text)
|
200
201
|
c = text.getCharacter
|
201
|
-
h =
|
202
|
+
h = text.getHeightDir.round(2)
|
203
|
+
|
204
|
+
if c == ' ' || c == ' ' # replace non-breaking space for space
|
205
|
+
c = ' '
|
206
|
+
h = text.getWidthDirAdj.round(2)
|
207
|
+
end
|
202
208
|
|
203
209
|
te = Tabula::TextElement.new(text.getYDirAdj.round(2) - h,
|
204
210
|
text.getXDirAdj.round(2),
|
@@ -28,7 +28,8 @@ module Tabula
|
|
28
28
|
options = {
|
29
29
|
:password => '',
|
30
30
|
:detect_ruling_lines => true,
|
31
|
-
:vertical_rulings => []
|
31
|
+
:vertical_rulings => [],
|
32
|
+
:extraction_method => "guess",
|
32
33
|
}.merge(options)
|
33
34
|
|
34
35
|
if area.instance_of?(Array)
|
@@ -41,32 +42,51 @@ module Tabula
|
|
41
42
|
page = [page]
|
42
43
|
end
|
43
44
|
|
44
|
-
|
45
|
+
pdf_page = Extraction::ObjectExtractor.new(pdf_path,
|
45
46
|
page,
|
46
47
|
options[:password]) \
|
47
48
|
.extract.next
|
48
49
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
50
|
+
if ["spreadsheet", "original"].include? options[:extraction_method]
|
51
|
+
use_spreadsheet_extraction_method = options[:extraction_method] == "spreadsheet"
|
52
|
+
else
|
53
|
+
use_spreadsheet_extraction_method = pdf_page.is_tabular?
|
54
|
+
end
|
53
55
|
|
54
|
-
|
55
|
-
|
56
|
+
if use_spreadsheet_extraction_method
|
57
|
+
table = pdf_page.get_area(area).spreadsheets.inject(&:+)
|
58
|
+
else
|
59
|
+
use_detected_lines = false
|
60
|
+
if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
|
61
|
+
detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
|
62
|
+
area)
|
56
63
|
|
57
|
-
|
58
|
-
|
59
|
-
# ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
|
60
|
-
# idea: detect columns without considering rulings, detect vertical rulings
|
61
|
-
# calculate ratio and try to come up with a threshold
|
62
|
-
use_detected_lines = detected_vertical_rulings.size > 2 \
|
63
|
-
&& (detected_vertical_rulings.count { |vl|
|
64
|
-
vl.height / area.height > 0.9
|
65
|
-
} / detected_vertical_rulings.size.to_f) >= 0.8
|
64
|
+
# only use lines if at least 80% of them cover at least 90%
|
65
|
+
# of the height of the area of interest
|
66
66
|
|
67
|
-
|
67
|
+
# TODO this heuristic SUCKS
|
68
|
+
# what if only a couple of columns are delimited with vertical rulings?
|
69
|
+
# ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
|
70
|
+
# idea: detect columns without considering rulings, detect vertical rulings
|
71
|
+
# calculate ratio and try to come up with a threshold
|
72
|
+
use_detected_lines = detected_vertical_rulings.size > 2 \
|
73
|
+
&& (detected_vertical_rulings.count { |vl|
|
74
|
+
vl.height / area.height > 0.9
|
75
|
+
} / detected_vertical_rulings.size.to_f) >= 0.8
|
68
76
|
|
69
|
-
|
77
|
+
end
|
70
78
|
|
79
|
+
table = pdf_page.get_area(area).get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
|
80
|
+
|
81
|
+
# fixes up the table a little bit, replacing nils with empty TextElements
|
82
|
+
# and sorting the lines.
|
83
|
+
table.lines.each do |l|
|
84
|
+
l.text_elements = l.text_elements.map do |te|
|
85
|
+
te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
table.lines.sort_by! { |l| l.text_elements.map { |te| te.top or 0 }.max }
|
89
|
+
table
|
90
|
+
end
|
71
91
|
end
|
72
92
|
end
|
data/lib/tabula/version.rb
CHANGED
Binary file
|
data/test/tests.rb
CHANGED
@@ -4,10 +4,14 @@ require 'minitest/autorun'
|
|
4
4
|
|
5
5
|
require_relative '../lib/tabula'
|
6
6
|
|
7
|
+
def table_to_array(table)
|
8
|
+
lines_to_array(table.rows)
|
9
|
+
end
|
10
|
+
|
7
11
|
def lines_to_array(lines)
|
8
|
-
lines.map
|
12
|
+
lines.map do |l|
|
9
13
|
l.map { |te| te.text.strip }
|
10
|
-
|
14
|
+
end
|
11
15
|
end
|
12
16
|
|
13
17
|
def lines_to_table(lines)
|
@@ -15,7 +19,7 @@ def lines_to_table(lines)
|
|
15
19
|
end
|
16
20
|
|
17
21
|
|
18
|
-
# I don't want to pollute the "real"
|
22
|
+
# I don't want to pollute the "real" class with a funny inspect method. Just for testing comparisons.
|
19
23
|
module Tabula
|
20
24
|
class Table
|
21
25
|
def inspect
|
@@ -27,7 +31,7 @@ end
|
|
27
31
|
module Tabula
|
28
32
|
class Line
|
29
33
|
def inspect
|
30
|
-
@text_elements.map
|
34
|
+
@text_elements.map{|te| te.nil? ? '' : te.text}.inspect
|
31
35
|
end
|
32
36
|
end
|
33
37
|
end
|
@@ -173,7 +177,7 @@ end
|
|
173
177
|
class TestExtractor < Minitest::Test
|
174
178
|
|
175
179
|
def test_table_extraction_1
|
176
|
-
table =
|
180
|
+
table = table_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
|
177
181
|
1,
|
178
182
|
[107.1, 57.9214, 394.5214, 290.7],
|
179
183
|
:detect_ruling_lines => false)
|
@@ -184,7 +188,7 @@ class TestExtractor < Minitest::Test
|
|
184
188
|
end
|
185
189
|
|
186
190
|
def test_diputados_voting_record
|
187
|
-
table =
|
191
|
+
table = table_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
|
188
192
|
1,
|
189
193
|
[269.875, 12.75, 790.5, 561])
|
190
194
|
|
@@ -199,14 +203,13 @@ class TestExtractor < Minitest::Test
|
|
199
203
|
# and a solution for half-x-height-offset lines.
|
200
204
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
201
205
|
|
202
|
-
table =
|
206
|
+
table = Tabula.extract_table(pdf_file_path,
|
203
207
|
1,
|
204
208
|
[106.01, 48.09, 227.31, 551.89],
|
205
209
|
:detect_ruling_lines => true)
|
206
210
|
|
207
211
|
expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
|
208
212
|
|
209
|
-
|
210
213
|
assert_equal expected, table
|
211
214
|
end
|
212
215
|
|
@@ -259,7 +262,7 @@ class TestExtractor < Minitest::Test
|
|
259
262
|
|
260
263
|
# TODO Spaces inserted in words - fails
|
261
264
|
def test_bo_page24
|
262
|
-
table =
|
265
|
+
table = table_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
|
263
266
|
1,
|
264
267
|
[425.625, 53.125, 575.714, 810.535],
|
265
268
|
:detect_ruling_lines => false)
|
@@ -312,7 +315,7 @@ class TestExtractor < Minitest::Test
|
|
312
315
|
|
313
316
|
vertical_rulings = [47,147,256,310,375,431,504].map{|n| Tabula::Ruling.new(0, n, 0, 1000)}
|
314
317
|
|
315
|
-
table =
|
318
|
+
table = table_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
|
316
319
|
1,
|
317
320
|
[255.57,40.43,398.76,557.35],
|
318
321
|
:vertical_rulings => vertical_rulings)
|
@@ -321,12 +324,12 @@ class TestExtractor < Minitest::Test
|
|
321
324
|
end
|
322
325
|
|
323
326
|
def test_get_spacing_and_merging_right
|
324
|
-
table =
|
327
|
+
table = table_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
|
325
328
|
1,
|
326
329
|
[52.32857142857143,15.557142857142859,128.70000000000002,767.9571428571429],
|
327
330
|
:detect_ruling_lines => true)
|
328
331
|
|
329
|
-
expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick
|
332
|
+
expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]
|
330
333
|
|
331
334
|
assert_equal expected, table
|
332
335
|
|
@@ -539,7 +542,7 @@ class TestIsTabularHeuristic < Minitest::Test
|
|
539
542
|
extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
|
540
543
|
page = extractor.extract.first
|
541
544
|
page.get_ruling_lines!
|
542
|
-
assert page.is_tabular
|
545
|
+
assert page.is_tabular?, "failed on file #{f}"
|
543
546
|
end
|
544
547
|
end
|
545
548
|
|
@@ -549,11 +552,8 @@ class TestIsTabularHeuristic < Minitest::Test
|
|
549
552
|
extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
|
550
553
|
page = extractor.extract.first
|
551
554
|
page.get_ruling_lines!
|
552
|
-
assert !page.is_tabular
|
555
|
+
assert !page.is_tabular?, "failed on file #{f}"
|
553
556
|
end
|
554
557
|
end
|
555
558
|
|
556
|
-
|
557
|
-
|
558
|
-
|
559
559
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.1
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Manuel Aristarán
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-01-
|
13
|
+
date: 2014-01-18 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: minitest
|
@@ -162,6 +162,7 @@ files:
|
|
162
162
|
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
163
163
|
- test/heuristic-test-set/original/bo_page24.pdf
|
164
164
|
- test/heuristic-test-set/original/campaign_donors.pdf
|
165
|
+
- test/heuristic-test-set/original/cs076pct.pdf
|
165
166
|
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
166
167
|
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
167
168
|
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|
@@ -189,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
189
190
|
version: '0'
|
190
191
|
requirements: []
|
191
192
|
rubyforge_project:
|
192
|
-
rubygems_version: 2.1
|
193
|
+
rubygems_version: 2.2.1
|
193
194
|
signing_key:
|
194
195
|
specification_version: 4
|
195
196
|
summary: extract tables from PDF files
|
@@ -217,6 +218,7 @@ test_files:
|
|
217
218
|
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
218
219
|
- test/heuristic-test-set/original/bo_page24.pdf
|
219
220
|
- test/heuristic-test-set/original/campaign_donors.pdf
|
221
|
+
- test/heuristic-test-set/original/cs076pct.pdf
|
220
222
|
- test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
|
221
223
|
- test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
|
222
224
|
- test/heuristic-test-set/spreadsheet/strongschools.pdf
|