tabula-extractor 0.7.2-java → 0.7.4-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +4 -8
- data/bin/tabula +3 -3
- data/lib/tabula.rb +9 -5
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/cell.rb +6 -4
- data/lib/tabula/entities/has_cells.rb +22 -78
- data/lib/tabula/entities/line.rb +52 -6
- data/lib/tabula/entities/page.rb +43 -50
- data/lib/tabula/entities/ruling.rb +83 -105
- data/lib/tabula/entities/spreadsheet.rb +74 -11
- data/lib/tabula/entities/table.rb +55 -37
- data/lib/tabula/entities/tabular.rb +42 -0
- data/lib/tabula/entities/text_chunk.rb +55 -52
- data/lib/tabula/entities/text_element.rb +129 -62
- data/lib/tabula/entities/zone_entity.rb +15 -6
- data/lib/tabula/extraction.rb +114 -49
- data/lib/tabula/line_segment_detector.rb +0 -5
- data/lib/tabula/table_extractor.rb +32 -37
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -5
- metadata +13 -95
- data/ext/COPYING +0 -661
- data/ext/Makefile.OSX +0 -18
- data/ext/Makefile.defaults +0 -9
- data/ext/Makefile.linux32 +0 -11
- data/ext/Makefile.linux64 +0 -12
- data/ext/Makefile.mingw +0 -10
- data/ext/Makefile.mingw64 +0 -10
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +0 -3
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +0 -2270
- data/ext/lsd.h +0 -283
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/argentina_diputados_voting_record.pdf +0 -0
- data/test/data/bo_page24.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +0 -88
- data/test/data/gre.pdf +0 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +0 -21
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/tabla_subsidios.pdf +0 -0
- data/test/data/vertical_rulings_bug.pdf +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +0 -50
- data/test/test_bin_tabula.sh +0 -7
- data/test/tests.rb +0 -603
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/test/heuristic.rb
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
#a list of filenames and the correct answer
|
2
|
-
# no more bs.
|
3
|
-
require_relative '../lib/tabula'
|
4
|
-
|
5
|
-
|
6
|
-
should_use_spreadsheet = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "spreadsheet/*") ).map{|a| [a, true]}
|
7
|
-
should_use_original = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "original/*") ).map{|a| [a, false]}
|
8
|
-
|
9
|
-
correct = []
|
10
|
-
misclassified_as_original = []
|
11
|
-
misclassified_as_spreadsheet = []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
def heuristic(page)
|
16
|
-
page.is_tabular?
|
17
|
-
end
|
18
|
-
|
19
|
-
(should_use_spreadsheet + should_use_original) .each do |filename, expected_to_be_tabular|
|
20
|
-
extractor = Tabula::Extraction::ObjectExtractor.new(filename, [1])
|
21
|
-
|
22
|
-
page = extractor.extract.first
|
23
|
-
page.get_ruling_lines!
|
24
|
-
# puts "#{File.basename(filename)} | #{expected_to_be_tabular}"
|
25
|
-
page_is_tabular = heuristic(page)
|
26
|
-
# puts ""
|
27
|
-
|
28
|
-
if page_is_tabular && expected_to_be_tabular || !page_is_tabular && !expected_to_be_tabular
|
29
|
-
correct << filename
|
30
|
-
elsif page_is_tabular && !expected_to_be_tabular
|
31
|
-
misclassified_as_spreadsheet << filename
|
32
|
-
elsif !page_is_tabular && expected_to_be_tabular
|
33
|
-
misclassified_as_original << filename
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
puts "#{correct.size} PDFs were correctly classified"
|
38
|
-
puts "#{misclassified_as_original.size + misclassified_as_spreadsheet.size} PDFs were incorrectly classified"
|
39
|
-
unless misclassified_as_spreadsheet.empty?
|
40
|
-
puts "#{misclassified_as_spreadsheet.size} PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"
|
41
|
-
misclassified_as_spreadsheet.each do |filename|
|
42
|
-
puts " - #{File.basename(filename)}"
|
43
|
-
end
|
44
|
-
end
|
45
|
-
unless misclassified_as_original.empty?
|
46
|
-
puts "#{misclassified_as_original.size} PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"
|
47
|
-
misclassified_as_original.each do |filename|
|
48
|
-
puts " - #{File.basename(filename)}"
|
49
|
-
end
|
50
|
-
end
|
data/test/test_bin_tabula.sh
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf --silent -o test.csv
|
2
|
-
bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf -o test.csv
|
3
|
-
bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv
|
4
|
-
bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv --format TSV
|
5
|
-
bin/tabula test/data/campaign_donors.pdf -o test.csv --columns 47,147,256,310,375,431,504 #columns should work
|
6
|
-
bin/tabula test/data/argentina_diputados_voting_record.pdf --guess -o test.csv --format TSV #should exclude guff
|
7
|
-
bin/tabula test/data/vertical_rulings_bug.pdf --area 250,0,325,1700 -o test.csv --format TSV #should be only a few lines
|
data/test/tests.rb
DELETED
@@ -1,603 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
require 'minitest'
|
3
|
-
require 'minitest/autorun'
|
4
|
-
|
5
|
-
require_relative '../lib/tabula'
|
6
|
-
|
7
|
-
def table_to_array(table)
|
8
|
-
lines_to_array(table.rows)
|
9
|
-
end
|
10
|
-
|
11
|
-
def lines_to_array(lines)
|
12
|
-
lines.map do |l|
|
13
|
-
l.map { |te| te.text.strip }
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def lines_to_table(lines)
|
18
|
-
Tabula::Table.new_from_array(lines_to_array(lines))
|
19
|
-
end
|
20
|
-
|
21
|
-
|
22
|
-
# I don't want to pollute the "real" class with a funny inspect method. Just for testing comparisons.
|
23
|
-
module Tabula
|
24
|
-
class Table
|
25
|
-
def inspect
|
26
|
-
"[" + lines.map(&:inspect).join(",") + "]"
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
module Tabula
|
32
|
-
class Line
|
33
|
-
def inspect
|
34
|
-
@text_elements.map{|te| te.nil? ? '' : te.text}.inspect
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
|
40
|
-
class TestEntityComparability < Minitest::Test
|
41
|
-
def test_text_element_comparability
|
42
|
-
base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
|
43
|
-
|
44
|
-
two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
|
45
|
-
three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
|
46
|
-
four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
|
47
|
-
|
48
|
-
five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
|
49
|
-
six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
|
50
|
-
seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
|
51
|
-
assert_equal base, two
|
52
|
-
assert_equal base, three
|
53
|
-
assert_equal base, four
|
54
|
-
|
55
|
-
refute_equal base, five
|
56
|
-
refute_equal base, six
|
57
|
-
refute_equal base, seven
|
58
|
-
end
|
59
|
-
|
60
|
-
def test_line_comparability
|
61
|
-
text_base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
|
62
|
-
|
63
|
-
text_two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
|
64
|
-
text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
|
65
|
-
text_four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
|
66
|
-
|
67
|
-
text_five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
|
68
|
-
text_six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
|
69
|
-
text_seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
|
70
|
-
line_base = Tabula::Line.new
|
71
|
-
line_base.text_elements = [text_base, text_two, text_three]
|
72
|
-
line_equal = Tabula::Line.new
|
73
|
-
line_equal.text_elements = [text_base, text_two, text_three]
|
74
|
-
line_equal_but_longer = Tabula::Line.new
|
75
|
-
line_equal_but_longer.text_elements = [text_base, text_two, text_three, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
|
76
|
-
line_unequal = Tabula::Line.new
|
77
|
-
line_unequal.text_elements = [text_base, text_two, text_three, text_five]
|
78
|
-
line_unequal_and_longer = Tabula::Line.new
|
79
|
-
line_unequal_and_longer.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
|
80
|
-
line_unequal_and_longer_and_different = Tabula::Line.new
|
81
|
-
line_unequal_and_longer_and_different.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, 'whatever']
|
82
|
-
|
83
|
-
assert_equal line_base, line_equal
|
84
|
-
assert_equal line_base, line_equal_but_longer
|
85
|
-
refute_equal line_base, line_unequal
|
86
|
-
refute_equal line_base, line_unequal_and_longer
|
87
|
-
refute_equal line_base, line_unequal_and_longer_and_different
|
88
|
-
end
|
89
|
-
|
90
|
-
def test_table_comparability
|
91
|
-
rows_base = [["a", "b", "c"], ['', 'd', '']]
|
92
|
-
rows_equal = [["a", "b", "c"], ['', 'd']]
|
93
|
-
rows_equal_padded = [['', "a", "b", "c"], ['', '', 'd']]
|
94
|
-
rows_unequal_one = [["a", "b", "c"], ['d']]
|
95
|
-
rows_unequal_two = [["a", "b", "c"], ['d', '']]
|
96
|
-
rows_unequal_three = [["a", "b", "c"], ['d'], ['a','b', 'd']]
|
97
|
-
rows_unequal_four = [["a", "b", "c"]]
|
98
|
-
|
99
|
-
table_base = Tabula::Table.new_from_array(rows_base)
|
100
|
-
table_equal = Tabula::Table.new_from_array(rows_equal)
|
101
|
-
table_equal_column_padded = Tabula::Table.new_from_array(rows_equal_padded)
|
102
|
-
table_unequal_one = Tabula::Table.new_from_array(rows_unequal_one)
|
103
|
-
table_unequal_two = Tabula::Table.new_from_array(rows_unequal_two)
|
104
|
-
table_unequal_three = Tabula::Table.new_from_array(rows_unequal_three)
|
105
|
-
table_unequal_four = Tabula::Table.new_from_array(rows_unequal_four)
|
106
|
-
|
107
|
-
assert_equal table_base, table_equal
|
108
|
-
assert_equal table_base, table_equal_column_padded
|
109
|
-
refute_equal table_base, table_unequal_one
|
110
|
-
refute_equal table_base, table_unequal_two
|
111
|
-
refute_equal table_base, table_unequal_three
|
112
|
-
refute_equal table_base, table_unequal_four
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
class TestPagesInfoExtractor < Minitest::Test
|
117
|
-
def test_pages_info_extractor
|
118
|
-
extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
119
|
-
|
120
|
-
i = 0
|
121
|
-
extractor.pages.each do |page|
|
122
|
-
assert_instance_of Tabula::Page, page
|
123
|
-
i += 1
|
124
|
-
end
|
125
|
-
assert_equal 2, i
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
class TestTableGuesser < Minitest::Test
|
130
|
-
def test_find_rects_from_lines_with_lsd
|
131
|
-
skip "Skipping until we actually use LSD"
|
132
|
-
filename = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
133
|
-
page_index = 0
|
134
|
-
lines = Tabula::Extraction::LineExtractor.lines_in_pdf_page(filename, page_index, :render_pdf => true)
|
135
|
-
|
136
|
-
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
137
|
-
page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
|
138
|
-
expected_page_areas = [[54.087890625, 50.203125, 734.220703125, 550.44140625]]
|
139
|
-
assert_equal expected_page_areas, page_areas
|
140
|
-
end
|
141
|
-
|
142
|
-
end
|
143
|
-
|
144
|
-
class TestDumper < Minitest::Test
|
145
|
-
|
146
|
-
def test_extractor
|
147
|
-
extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
148
|
-
page = extractor.extract.next
|
149
|
-
assert_instance_of Tabula::Page, page
|
150
|
-
end
|
151
|
-
|
152
|
-
def test_get_by_area
|
153
|
-
extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
154
|
-
characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
|
155
|
-
assert_equal characters.size, 206
|
156
|
-
end
|
157
|
-
end
|
158
|
-
|
159
|
-
class TestRulingIntersection < Minitest::Test
|
160
|
-
def test_ruling_intersection
|
161
|
-
horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
|
162
|
-
verticals = [Tabula::Ruling.new(1, 3, 0, 11),
|
163
|
-
Tabula::Ruling.new(1, 4, 0, 11)]
|
164
|
-
ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
|
165
|
-
assert_equal 2, ints.size
|
166
|
-
assert_equal ints[0][0].getX, 3.0
|
167
|
-
assert_equal ints[0][0].getY, 10.0
|
168
|
-
assert_equal ints[1][0].getX, 4.0
|
169
|
-
assert_equal ints[1][0].getY, 10.0
|
170
|
-
|
171
|
-
verticals = [Tabula::Ruling.new(20, 3, 0, 11)]
|
172
|
-
ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
|
173
|
-
assert_equal ints.size, 0
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
class TestExtractor < Minitest::Test
|
178
|
-
|
179
|
-
def test_table_extraction_1
|
180
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
|
181
|
-
1,
|
182
|
-
[107.1, 57.9214, 394.5214, 290.7],
|
183
|
-
:detect_ruling_lines => false)
|
184
|
-
|
185
|
-
expected = [["Prior Scale","New Scale","% Rank*"], ["800","170","99"], ["790","170","99"], ["780","170","99"], ["770","170","99"], ["760","170","99"], ["750","169","99"], ["740","169","99"], ["730","168","98"], ["720","168","98"], ["710","167","97"], ["700","166","96"], ["690","165","95"], ["680","165","95"], ["670","164","93"], ["660","164","93"], ["650","163","91"]]
|
186
|
-
|
187
|
-
assert_equal expected, table
|
188
|
-
end
|
189
|
-
|
190
|
-
def test_diputados_voting_record
|
191
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
|
192
|
-
1,
|
193
|
-
[269.875, 12.75, 790.5, 561])
|
194
|
-
|
195
|
-
expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
|
196
|
-
|
197
|
-
assert_equal expected, table
|
198
|
-
end
|
199
|
-
|
200
|
-
def test_forest_disclosure_report_dont_regress
|
201
|
-
# this is the current state of the expected output. Ideally the output should be like
|
202
|
-
# test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
|
203
|
-
# and a solution for half-x-height-offset lines.
|
204
|
-
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
205
|
-
|
206
|
-
table = Tabula.extract_table(pdf_file_path,
|
207
|
-
1,
|
208
|
-
[106.01, 48.09, 227.31, 551.89],
|
209
|
-
:detect_ruling_lines => true,
|
210
|
-
:extraction_method => "original")
|
211
|
-
|
212
|
-
expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
|
213
|
-
|
214
|
-
assert_equal expected, table
|
215
|
-
end
|
216
|
-
|
217
|
-
def test_missing_spaces_around_an_ampersand
|
218
|
-
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
219
|
-
character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
|
220
|
-
page_obj = character_extractor.extract.next
|
221
|
-
lines = page_obj.ruling_lines
|
222
|
-
vertical_rulings = lines.select(&:vertical?)
|
223
|
-
|
224
|
-
area = [170, 28, 185, 833] #top left bottom right
|
225
|
-
|
226
|
-
expected = Tabula::Table.new_from_array([
|
227
|
-
["", "REGIONAL PULMONARY & SLEEP",],
|
228
|
-
["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
|
229
|
-
["", "MEDICINE", ],
|
230
|
-
])
|
231
|
-
|
232
|
-
assert_equal expected, lines_to_table(page_obj.get_area(area).make_table(:vertical_rulings => vertical_rulings))
|
233
|
-
end
|
234
|
-
|
235
|
-
def test_forest_disclosure_report
|
236
|
-
skip "Skipping until we support multiline cells"
|
237
|
-
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
238
|
-
character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
|
239
|
-
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
240
|
-
vertical_rulings = lines.select(&:vertical?) #.uniq{|line| (line.left / 10).round }
|
241
|
-
|
242
|
-
page_obj = character_extractor.extract.next
|
243
|
-
characters = page_obj.get_text([110, 28, 218, 833])
|
244
|
-
#top left bottom right
|
245
|
-
expected = Tabula::Table.new_from_array([
|
246
|
-
['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '', '$85.00'],
|
247
|
-
['TOTAL', '', '', '','$85.00'],
|
248
|
-
['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '', '$78.80'],
|
249
|
-
['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '', '$392.45'],
|
250
|
-
['TOTAL', '', '', '', '$471.25'],
|
251
|
-
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '', '$20.39'],
|
252
|
-
['TOTAL', '', '', '','$20.39'],
|
253
|
-
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '', '$310.33'],
|
254
|
-
['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '', '$4,700.00'],
|
255
|
-
['TOTAL', '', '', '', '$5,010.33'],
|
256
|
-
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '', '$193.67'],
|
257
|
-
['TOTAL', '', '', '', '$193.67'],
|
258
|
-
['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '', '$19.50']
|
259
|
-
])
|
260
|
-
|
261
|
-
assert_equal expected, lines_to_table(Tabula.make_table(characters, :vertical_rulings => vertical_rulings))
|
262
|
-
end
|
263
|
-
|
264
|
-
# TODO Spaces inserted in words - fails
|
265
|
-
def test_bo_page24
|
266
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
|
267
|
-
1,
|
268
|
-
[425.625, 53.125, 575.714, 810.535],
|
269
|
-
:detect_ruling_lines => false)
|
270
|
-
|
271
|
-
expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
|
272
|
-
|
273
|
-
assert_equal expected, table
|
274
|
-
end
|
275
|
-
|
276
|
-
|
277
|
-
def test_vertical_rulings_splitting_words
|
278
|
-
#if a vertical ruling crosses over a word, the word should be split at that vertical ruling
|
279
|
-
# before, the entire word would end up on one side of the vertical ruling.
|
280
|
-
pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))
|
281
|
-
|
282
|
-
#both of these are semantically "correct"; the difference is in how we handle multi-line cells
|
283
|
-
expected = Tabula::Table.new_from_array([
|
284
|
-
["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
|
285
|
-
["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
|
286
|
-
["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
|
287
|
-
])
|
288
|
-
other_expected = Tabula::Table.new_from_array([
|
289
|
-
["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
|
290
|
-
["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
|
291
|
-
["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH", "", "", "$22.93", "", "", "$22.93"],
|
292
|
-
["", "", "", "ABRAHAMSON"]
|
293
|
-
])
|
294
|
-
|
295
|
-
#N.B. it's "MORGANTOWN", "WV" that we're most interested in here (it used to show up as ["MORGANTOWNWV", "", ""])
|
296
|
-
|
297
|
-
|
298
|
-
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2) #:all ) # 1..2643
|
299
|
-
extractor.extract.each_with_index do |pdf_page, page_index|
|
300
|
-
|
301
|
-
page_areas = [[250, 0, 325, 1700]]
|
302
|
-
|
303
|
-
scale_factor = pdf_page.width / 1700
|
304
|
-
|
305
|
-
vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000)}
|
306
|
-
|
307
|
-
tables = page_areas.map do |page_area|
|
308
|
-
pdf_page.get_area(page_area).make_table(:vertical_rulings => vertical_rulings)
|
309
|
-
end
|
310
|
-
assert_equal expected, lines_to_table(tables.first)
|
311
|
-
end
|
312
|
-
end
|
313
|
-
|
314
|
-
def test_vertical_rulings_prevent_merging_of_columns
|
315
|
-
expected = [["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"], ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]]
|
316
|
-
|
317
|
-
vertical_rulings = [47,147,256,310,375,431,504].map{|n| Tabula::Ruling.new(0, n, 0, 1000)}
|
318
|
-
|
319
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
|
320
|
-
1,
|
321
|
-
[255.57,40.43,398.76,557.35],
|
322
|
-
:vertical_rulings => vertical_rulings)
|
323
|
-
|
324
|
-
assert_equal expected, table
|
325
|
-
end
|
326
|
-
|
327
|
-
def test_get_spacing_and_merging_right
|
328
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
|
329
|
-
1,
|
330
|
-
[52.32857142857143,15.557142857142859,128.70000000000002,767.9571428571429],
|
331
|
-
:detect_ruling_lines => true)
|
332
|
-
|
333
|
-
expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]
|
334
|
-
|
335
|
-
assert_equal expected, table
|
336
|
-
|
337
|
-
end
|
338
|
-
|
339
|
-
|
340
|
-
class SpreadsheetsHasCellsTester
|
341
|
-
include Tabula::HasCells
|
342
|
-
attr_accessor :cells
|
343
|
-
def initialize(cells)
|
344
|
-
@cells = cells
|
345
|
-
end
|
346
|
-
end
|
347
|
-
|
348
|
-
#just tests the algorithm
|
349
|
-
# Feeds a fixed list of cells to HasCells#find_spreadsheets_from_cells and
# checks that the bounds of the returned areas match two expected spreadsheets.
def test_cells_to_spreadsheets
  # Each entry holds the four positional arguments of one Tabula::Cell.new call,
  # kept in the original construction order.
  cell_geometries = [
    [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0],
    [50.0, 18.0, 52.0, 4.0], [54.0, 18.0, 52.0, 6.0],
    [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
    [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0],
    [90.0, 18.0, 52.0, 4.0], [94.0, 18.0, 52.0, 6.0],
    [100.0, 18.0, 52.0, 28.0], [128.0, 18.0, 52.0, 4.0],
    [132.0, 18.0, 52.0, 64.0], [196.0, 18.0, 52.0, 66.0],
    [262.0, 18.0, 52.0, 4.0], [266.0, 18.0, 52.0, 84.0],
    [350.0, 18.0, 52.0, 4.0], [354.0, 18.0, 52.0, 32.0],
    [386.0, 18.0, 52.0, 38.0], [424.0, 18.0, 52.0, 18.0],
    [442.0, 18.0, 52.0, 74.0], [516.0, 18.0, 52.0, 28.0],
    [544.0, 18.0, 52.0, 4.0], [44.0, 70.0, 156.0, 6.0],
    [50.0, 70.0, 156.0, 4.0], [54.0, 70.0, 156.0, 6.0],
    [60.0, 70.0, 156.0, 4.0], [64.0, 70.0, 156.0, 6.0],
    [70.0, 70.0, 156.0, 4.0], [74.0, 70.0, 156.0, 6.0],
    [84.0, 70.0, 2.0, 6.0], [90.0, 70.0, 156.0, 4.0],
    [94.0, 70.0, 156.0, 6.0], [100.0, 70.0, 156.0, 28.0],
    [128.0, 70.0, 156.0, 4.0], [132.0, 70.0, 156.0, 64.0],
    [196.0, 70.0, 156.0, 66.0], [262.0, 70.0, 156.0, 4.0],
    [266.0, 70.0, 156.0, 84.0], [350.0, 70.0, 156.0, 4.0],
    [354.0, 70.0, 156.0, 32.0], [386.0, 70.0, 156.0, 38.0],
    [424.0, 70.0, 156.0, 18.0], [442.0, 70.0, 156.0, 74.0],
    [516.0, 70.0, 156.0, 28.0], [544.0, 70.0, 156.0, 4.0],
    [84.0, 72.0, 446.0, 6.0], [90.0, 226.0, 176.0, 4.0],
    [94.0, 226.0, 176.0, 6.0], [100.0, 226.0, 176.0, 28.0],
    [128.0, 226.0, 176.0, 4.0], [132.0, 226.0, 176.0, 64.0],
    [196.0, 226.0, 176.0, 66.0], [262.0, 226.0, 176.0, 4.0],
    [266.0, 226.0, 176.0, 84.0], [350.0, 226.0, 176.0, 4.0],
    [354.0, 226.0, 176.0, 32.0], [386.0, 226.0, 176.0, 38.0],
    [424.0, 226.0, 176.0, 18.0], [442.0, 226.0, 176.0, 74.0],
    [516.0, 226.0, 176.0, 28.0], [544.0, 226.0, 176.0, 4.0],
    [90.0, 402.0, 116.0, 4.0], [94.0, 402.0, 116.0, 6.0],
    [100.0, 402.0, 116.0, 28.0], [128.0, 402.0, 116.0, 4.0],
    [132.0, 402.0, 116.0, 64.0], [196.0, 402.0, 116.0, 66.0],
    [262.0, 402.0, 116.0, 4.0], [266.0, 402.0, 116.0, 84.0],
    [350.0, 402.0, 116.0, 4.0], [354.0, 402.0, 116.0, 32.0],
    [386.0, 402.0, 116.0, 38.0], [424.0, 402.0, 116.0, 18.0],
    [442.0, 402.0, 116.0, 74.0], [516.0, 402.0, 116.0, 28.0],
    [544.0, 402.0, 116.0, 4.0], [84.0, 518.0, 246.0, 6.0],
    [90.0, 518.0, 186.0, 4.0], [94.0, 518.0, 186.0, 6.0],
    [100.0, 518.0, 186.0, 28.0], [128.0, 518.0, 186.0, 4.0],
    [132.0, 518.0, 186.0, 64.0], [196.0, 518.0, 186.0, 66.0],
    [262.0, 518.0, 186.0, 4.0], [266.0, 518.0, 186.0, 84.0],
    [350.0, 518.0, 186.0, 4.0], [354.0, 518.0, 186.0, 32.0],
    [386.0, 518.0, 186.0, 38.0], [424.0, 518.0, 186.0, 18.0],
    [442.0, 518.0, 186.0, 74.0], [516.0, 518.0, 186.0, 28.0],
    [544.0, 518.0, 186.0, 4.0], [90.0, 704.0, 60.0, 4.0],
    [94.0, 704.0, 60.0, 6.0], [100.0, 704.0, 60.0, 28.0],
    [128.0, 704.0, 60.0, 4.0], [132.0, 704.0, 60.0, 64.0],
    [196.0, 704.0, 60.0, 66.0], [262.0, 704.0, 60.0, 4.0],
    [266.0, 704.0, 60.0, 84.0], [350.0, 704.0, 60.0, 4.0],
    [354.0, 704.0, 60.0, 32.0], [386.0, 704.0, 60.0, 38.0],
    [424.0, 704.0, 60.0, 18.0], [442.0, 704.0, 60.0, 74.0],
    [516.0, 704.0, 60.0, 28.0], [544.0, 704.0, 60.0, 4.0],
    [84.0, 764.0, 216.0, 6.0], [90.0, 764.0, 216.0, 4.0],
    [94.0, 764.0, 216.0, 6.0], [100.0, 764.0, 216.0, 28.0],
    [128.0, 764.0, 216.0, 4.0], [132.0, 764.0, 216.0, 64.0],
    [196.0, 764.0, 216.0, 66.0], [262.0, 764.0, 216.0, 4.0],
    [266.0, 764.0, 216.0, 84.0], [350.0, 764.0, 216.0, 4.0],
    [354.0, 764.0, 216.0, 32.0], [386.0, 764.0, 216.0, 38.0],
    [424.0, 764.0, 216.0, 18.0], [442.0, 764.0, 216.0, 74.0],
    [516.0, 764.0, 216.0, 28.0], [544.0, 764.0, 216.0, 4.0]
  ]
  cells = cell_geometries.map { |geometry| Tabula::Cell.new(*geometry) }

  expected_spreadsheets = [
    Tabula::Spreadsheet.new(40.0, 18.0, 208.0, 40.0, nil, nil, nil, nil),
    Tabula::Spreadsheet.new(84.0, 18.0, 962.0, 464.0, nil, nil, nil, nil)
  ]

  # Spreadsheets are compared on area only: x, y, width and height.
  expected_bounds = expected_spreadsheets.map do |sheet|
    [sheet.x, sheet.y, sheet.width, sheet.height]
  end

  found_areas = SpreadsheetsHasCellsTester.new(cells).find_spreadsheets_from_cells
  actual_bounds = found_areas.map do |found|
    bounds = found.getBounds
    [bounds.x, bounds.y, bounds.width, bounds.height]
  end

  assert_equal expected_bounds, actual_bounds
end
|
424
|
-
|
425
|
-
# Placeholder test: skipped unconditionally until it is written.
def test_add_spanning_cells
  skip("until I write it")
end
|
428
|
-
|
429
|
-
# Placeholder test: skipped unconditionally; the skip message names the PDF
# that motivated it.
def test_add_placeholder_cells_to_funny_shaped_tables
  skip("until I write it, cf 01005787B_Pakistan.pdf")
end
|
432
|
-
|
433
|
-
# Minimal host for the Tabula::HasCells mixin that stores vertical and
# horizontal ruling lines and calls find_cells! on construction, so the
# cell-finding logic can be exercised without a real page.
class CellsHasCellsTester
  include Tabula::HasCells

  attr_accessor :vertical_ruling_lines, :horizontal_ruling_lines, :cells

  # Starts with an empty cell list, records both sets of rulings, then runs
  # find_cells! (provided outside this class, presumably by the mixin).
  def initialize(vertical_ruling_lines, horizontal_ruling_lines)
    @cells = []
    @vertical_ruling_lines = vertical_ruling_lines
    @horizontal_ruling_lines = horizontal_ruling_lines

    find_cells!
  end
end
|
443
|
-
|
444
|
-
#just tests the algorithm
|
445
|
-
# Builds three vertical and nine horizontal rulings, lets CellsHasCellsTester
# derive cells from them, and checks the result against fifteen expected cells.
def test_lines_to_cells
  vertical_ruling_lines = [
    [40.0, 18.0, 0.0, 40.0],
    [44.0, 70.0, 0.0, 36.0],
    [40.0, 226.0, 0.0, 40.0]
  ].map { |geometry| Tabula::Ruling.new(*geometry) }

  # All horizontal rulings share the last three arguments; only the first differs.
  horizontal_ruling_lines = [40.0, 44.0, 50.0, 54.0, 60.0, 64.0, 70.0, 74.0, 80.0].map do |first_arg|
    Tabula::Ruling.new(first_arg, 18.0, 208.0, 0.0)
  end

  expected_cells = [
    [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0],
    [50.0, 18.0, 52.0, 4.0], [54.0, 18.0, 52.0, 6.0],
    [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
    [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0],
    [44.0, 70.0, 156.0, 6.0], [50.0, 70.0, 156.0, 4.0],
    [54.0, 70.0, 156.0, 6.0], [60.0, 70.0, 156.0, 4.0],
    [64.0, 70.0, 156.0, 6.0], [70.0, 70.0, 156.0, 4.0],
    [74.0, 70.0, 156.0, 6.0]
  ].map { |geometry| Tabula::Cell.new(*geometry) }

  tester = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines)
  actual_cells = tester.cells

  # Compared as sets because the order of the cells is irrelevant here.
  assert_equal Set.new(expected_cells), Set.new(actual_cells)
end
|
472
|
-
|
473
|
-
#this is the real deal!!
|
474
|
-
# Extracts every page of data/frx_2012_disclosure.pdf and checks that the
# first spreadsheet of each page serialises to the TSV stored next to it.
#
# Fixture paths are resolved relative to this file, like the other tests here,
# so the test does not depend on the current working directory.
def test_extract_tabular_data_using_lines_and_spreadsheets
  data_dir = File.dirname(__FILE__)
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', data_dir)
  expected_data_path = File.expand_path('data/frx_2012_disclosure.tsv', data_dir)

  # File.read opens, reads and closes the file; the previous open(...).read
  # left the handle open until garbage collection.
  expected = File.read(expected_data_path)

  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all).extract.each do |pdf_page|
    spreadsheet = pdf_page.spreadsheets.first
    assert_equal expected, spreadsheet.to_tsv
  end
end
|
484
|
-
|
485
|
-
# Extracts ./test/data/no_tables.pdf with a :line_color_filter and checks that
# the first extracted page reports zero spreadsheets.
def test_cope_with_a_tableless_page
  # The filter lambda returns true when every colour component is below 0.1.
  color_filter = lambda { |components| components.all? { |c| c < 0.1 } }

  extractor = Tabula::Extraction::ObjectExtractor.new("./test/data/no_tables.pdf", :all, '',
                                                      :line_color_filter => color_filter)
  first_page = extractor.extract.to_a.first
  spreadsheets = first_page.spreadsheets

  assert_equal 0, spreadsheets.size
end
|
494
|
-
|
495
|
-
# Extracts page list [1] of data/spanning_cells.pdf and checks that the first
# spreadsheet of each extracted page serialises to the CSV stored next to it.
#
# Fixture paths are resolved relative to this file, like the other tests here,
# so the test does not depend on the current working directory.
def test_spanning_cells
  data_dir = File.dirname(__FILE__)
  pdf_file_path = File.expand_path('data/spanning_cells.pdf', data_dir)
  expected_data_path = File.expand_path('data/spanning_cells.csv', data_dir)

  # File.read opens, reads and closes the file; the previous open(...).read
  # left the handle open until garbage collection.
  expected = File.read(expected_data_path)

  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
    spreadsheet = pdf_page.spreadsheets.first
    assert_equal expected, spreadsheet.to_csv
  end
end
|
505
|
-
|
506
|
-
# Crops the ruling lines of ./test/data/puertos1.pdf to a fixed area and checks
# how many of the cropped rulings report themselves as vertical.
def test_almost_vertical_lines
  top = 273.9035714285714
  left = 30.32142857142857
  bottom = 554.8821428571429
  right = 546.7964285714286

  # ZoneEntity receives the top/left corner followed by the two extents.
  area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)

  extractor = Tabula::Extraction::ObjectExtractor.new("./test/data/puertos1.pdf", [1])
  extractor.extract.each do |page|
    cropped = Tabula::Ruling.crop_rulings_to_area(page.ruling_lines, area)
    vertical = cropped.select(&:vertical?)
    # TODO assertion not entirely correct, should do the trick for now
    assert_equal 15, vertical.count
  end
end
|
518
|
-
|
519
|
-
# Restricts each extracted page of ./test/data/puertos1.pdf to a fixed area and
# checks the row count, first row and last row of that area's first spreadsheet.
def test_extract_spreadsheet_within_an_area
  top = 273.9035714285714
  left = 30.32142857142857
  bottom = 554.8821428571429
  right = 546.7964285714286

  expected_first_row = ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"]
  expected_last_row = ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"]

  extractor = Tabula::Extraction::ObjectExtractor.new("./test/data/puertos1.pdf", [1])
  extractor.extract.each do |page|
    cropped = page.get_area([top, left, bottom, right])
    rows = cropped.spreadsheets.first.to_a

    assert_equal 15, rows.length
    assert_equal expected_first_row, rows.first
    assert_equal expected_last_row, rows.last
  end
end
|
531
|
-
|
532
|
-
# Extracts a fixed area of data/nyc_2013fiscalreporttables.pdf with ruling-line
# detection off and the 'original' extraction method, then checks two cells of
# the second row of the resulting array.
def test_remove_repeated_text
  top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715

  table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
                               1,
                               [top, left, bottom, right],
                               :detect_ruling_lines => false,
                               :extraction_method => 'original')

  ary = table_to_array(table)

  # assert_equal takes (expected, actual); the literal goes first so failure
  # messages label the two values correctly.
  assert_equal "$ 18,969,610", ary[1][1]
  assert_equal "$ 18,157,722", ary[1][2]
end
|
545
|
-
|
546
|
-
# Extracts a fixed area of data/wc2012.pdf, one of those PDFs that put
# characters on top of one another to make text "bold", and checks the first
# cell of the first row of the resulting array.
def test_remove_overlapping_text
  top, left, bottom, right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571

  table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
                               1,
                               [top, left, bottom, right],
                               :detect_ruling_lines => false,
                               :extraction_method => 'original')

  ary = table_to_array(table)

  # assert_equal takes (expected, actual); the literal goes first so failure
  # messages label the two values correctly.
  assert_equal "Community development", ary.first.first
end
|
558
|
-
|
559
|
-
# Walks every cell of every spreadsheet extracted from
# ./test/data/sydney_disclosure_contract.pdf, assigns the cell its text
# elements and the :use_line_returns option, and checks the collected texts.
def test_cells_including_line_returns
  expected_texts = [
    "1295",
    "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065",
    "N/A",
    "Effective Date: 13 May 2013 \nDuration: 15 Weeks",
    "Supply, Installation and Maintenance of Parking Ticket Machines",
    "$3,148,800.00exgst",
    "N/A",
    "N/A",
    "Open Tender \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"
  ]

  collected_texts = []
  extractor = Tabula::Extraction::ObjectExtractor.new("./test/data/sydney_disclosure_contract.pdf", [1])

  extractor.extract.each do |page|
    page.spreadsheets.each do |sheet|
      sheet.cells.each do |cell|
        cell.text_elements = page.get_cell_text(cell)
        # A fresh options hash is built for every cell.
        cell.options = { :use_line_returns => true, :cell_debug => 0 }
        collected_texts.push(cell.text)
      end
    end
  end

  assert_equal expected_texts, collected_texts
end
|
573
|
-
|
574
|
-
end
|
575
|
-
|
576
|
-
# Checks Page#is_tabular? against two lists of fixture PDFs: files whose first
# extracted page should be reported as tabular, and files where it should not.
class TestIsTabularHeuristic < Minitest::Test

  # Fixture file names (under data/) expected to be detected as spreadsheets.
  EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', 'GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf'].freeze

  # Fixture file names (under data/) expected NOT to be detected as spreadsheets.
  NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf'].freeze

  def test_heuristic_detects_spreadsheets
    EXPECTED_TO_BE_SPREADSHEET.each do |f|
      assert first_page_with_rulings(f).is_tabular?, "failed on file #{f}"
    end
  end

  def test_heuristic_detects_non_spreadsheets
    NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
      refute first_page_with_rulings(f).is_tabular?, "failed on file #{f}"
    end
  end

  private

  # Returns the first page extracted from data/<filename> (resolved relative
  # to this file) after get_ruling_lines! has been called on it.
  def first_page_with_rulings(filename)
    path = File.expand_path('data/' + filename, File.dirname(__FILE__))
    page = Tabula::Extraction::ObjectExtractor.new(path, [1]).extract.first
    page.get_ruling_lines!
    page
  end

end
|