tabula-extractor 0.7.2-java → 0.7.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +4 -8
- data/bin/tabula +3 -3
- data/lib/tabula.rb +9 -5
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/cell.rb +6 -4
- data/lib/tabula/entities/has_cells.rb +22 -78
- data/lib/tabula/entities/line.rb +52 -6
- data/lib/tabula/entities/page.rb +43 -50
- data/lib/tabula/entities/ruling.rb +83 -105
- data/lib/tabula/entities/spreadsheet.rb +74 -11
- data/lib/tabula/entities/table.rb +55 -37
- data/lib/tabula/entities/tabular.rb +42 -0
- data/lib/tabula/entities/text_chunk.rb +55 -52
- data/lib/tabula/entities/text_element.rb +129 -62
- data/lib/tabula/entities/zone_entity.rb +15 -6
- data/lib/tabula/extraction.rb +114 -49
- data/lib/tabula/line_segment_detector.rb +0 -5
- data/lib/tabula/table_extractor.rb +32 -37
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -5
- metadata +13 -95
- data/ext/COPYING +0 -661
- data/ext/Makefile.OSX +0 -18
- data/ext/Makefile.defaults +0 -9
- data/ext/Makefile.linux32 +0 -11
- data/ext/Makefile.linux64 +0 -12
- data/ext/Makefile.mingw +0 -10
- data/ext/Makefile.mingw64 +0 -10
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +0 -3
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +0 -2270
- data/ext/lsd.h +0 -283
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/argentina_diputados_voting_record.pdf +0 -0
- data/test/data/bo_page24.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +0 -88
- data/test/data/gre.pdf +0 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +0 -21
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/tabla_subsidios.pdf +0 -0
- data/test/data/vertical_rulings_bug.pdf +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +0 -50
- data/test/test_bin_tabula.sh +0 -7
- data/test/tests.rb +0 -603
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/test/heuristic.rb
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
#a list of filenames and the correct answer
|
2
|
-
# no more bs.
|
3
|
-
require_relative '../lib/tabula'
|
4
|
-
|
5
|
-
|
6
|
-
should_use_spreadsheet = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "spreadsheet/*") ).map{|a| [a, true]}
|
7
|
-
should_use_original = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "original/*") ).map{|a| [a, false]}
|
8
|
-
|
9
|
-
correct = []
|
10
|
-
misclassified_as_original = []
|
11
|
-
misclassified_as_spreadsheet = []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
def heuristic(page)
|
16
|
-
page.is_tabular?
|
17
|
-
end
|
18
|
-
|
19
|
-
(should_use_spreadsheet + should_use_original) .each do |filename, expected_to_be_tabular|
|
20
|
-
extractor = Tabula::Extraction::ObjectExtractor.new(filename, [1])
|
21
|
-
|
22
|
-
page = extractor.extract.first
|
23
|
-
page.get_ruling_lines!
|
24
|
-
# puts "#{File.basename(filename)} | #{expected_to_be_tabular}"
|
25
|
-
page_is_tabular = heuristic(page)
|
26
|
-
# puts ""
|
27
|
-
|
28
|
-
if page_is_tabular && expected_to_be_tabular || !page_is_tabular && !expected_to_be_tabular
|
29
|
-
correct << filename
|
30
|
-
elsif page_is_tabular && !expected_to_be_tabular
|
31
|
-
misclassified_as_spreadsheet << filename
|
32
|
-
elsif !page_is_tabular && expected_to_be_tabular
|
33
|
-
misclassified_as_original << filename
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
puts "#{correct.size} PDFs were correctly classified"
|
38
|
-
puts "#{misclassified_as_original.size + misclassified_as_spreadsheet.size} PDFs were incorrectly classified"
|
39
|
-
unless misclassified_as_spreadsheet.empty?
|
40
|
-
puts "#{misclassified_as_spreadsheet.size} PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"
|
41
|
-
misclassified_as_spreadsheet.each do |filename|
|
42
|
-
puts " - #{File.basename(filename)}"
|
43
|
-
end
|
44
|
-
end
|
45
|
-
unless misclassified_as_original.empty?
|
46
|
-
puts "#{misclassified_as_original.size} PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"
|
47
|
-
misclassified_as_original.each do |filename|
|
48
|
-
puts " - #{File.basename(filename)}"
|
49
|
-
end
|
50
|
-
end
|
data/test/test_bin_tabula.sh
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf --silent -o test.csv
|
2
|
-
bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf -o test.csv
|
3
|
-
bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv
|
4
|
-
bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv --format TSV
|
5
|
-
bin/tabula test/data/campaign_donors.pdf -o test.csv --columns 47,147,256,310,375,431,504 #columns should work
|
6
|
-
bin/tabula test/data/argentina_diputados_voting_record.pdf --guess -o test.csv --format TSV #should exclude guff
|
7
|
-
bin/tabula test/data/vertical_rulings_bug.pdf --area 250,0,325,1700 -o test.csv --format TSV #should be only a few lines
|
data/test/tests.rb
DELETED
@@ -1,603 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
require 'minitest'
|
3
|
-
require 'minitest/autorun'
|
4
|
-
|
5
|
-
require_relative '../lib/tabula'
|
6
|
-
|
7
|
-
def table_to_array(table)
|
8
|
-
lines_to_array(table.rows)
|
9
|
-
end
|
10
|
-
|
11
|
-
def lines_to_array(lines)
|
12
|
-
lines.map do |l|
|
13
|
-
l.map { |te| te.text.strip }
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def lines_to_table(lines)
|
18
|
-
Tabula::Table.new_from_array(lines_to_array(lines))
|
19
|
-
end
|
20
|
-
|
21
|
-
|
22
|
-
# I don't want to pollute the "real" class with a funny inspect method. Just for testing comparisons.
|
23
|
-
module Tabula
|
24
|
-
class Table
|
25
|
-
def inspect
|
26
|
-
"[" + lines.map(&:inspect).join(",") + "]"
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
module Tabula
|
32
|
-
class Line
|
33
|
-
def inspect
|
34
|
-
@text_elements.map{|te| te.nil? ? '' : te.text}.inspect
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
|
40
|
-
class TestEntityComparability < Minitest::Test
|
41
|
-
def test_text_element_comparability
|
42
|
-
base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
|
43
|
-
|
44
|
-
two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
|
45
|
-
three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
|
46
|
-
four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
|
47
|
-
|
48
|
-
five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
|
49
|
-
six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
|
50
|
-
seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
|
51
|
-
assert_equal base, two
|
52
|
-
assert_equal base, three
|
53
|
-
assert_equal base, four
|
54
|
-
|
55
|
-
refute_equal base, five
|
56
|
-
refute_equal base, six
|
57
|
-
refute_equal base, seven
|
58
|
-
end
|
59
|
-
|
60
|
-
def test_line_comparability
|
61
|
-
text_base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
|
62
|
-
|
63
|
-
text_two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
|
64
|
-
text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
|
65
|
-
text_four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
|
66
|
-
|
67
|
-
text_five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
|
68
|
-
text_six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
|
69
|
-
text_seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
|
70
|
-
line_base = Tabula::Line.new
|
71
|
-
line_base.text_elements = [text_base, text_two, text_three]
|
72
|
-
line_equal = Tabula::Line.new
|
73
|
-
line_equal.text_elements = [text_base, text_two, text_three]
|
74
|
-
line_equal_but_longer = Tabula::Line.new
|
75
|
-
line_equal_but_longer.text_elements = [text_base, text_two, text_three, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
|
76
|
-
line_unequal = Tabula::Line.new
|
77
|
-
line_unequal.text_elements = [text_base, text_two, text_three, text_five]
|
78
|
-
line_unequal_and_longer = Tabula::Line.new
|
79
|
-
line_unequal_and_longer.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
|
80
|
-
line_unequal_and_longer_and_different = Tabula::Line.new
|
81
|
-
line_unequal_and_longer_and_different.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, 'whatever']
|
82
|
-
|
83
|
-
assert_equal line_base, line_equal
|
84
|
-
assert_equal line_base, line_equal_but_longer
|
85
|
-
refute_equal line_base, line_unequal
|
86
|
-
refute_equal line_base, line_unequal_and_longer
|
87
|
-
refute_equal line_base, line_unequal_and_longer_and_different
|
88
|
-
end
|
89
|
-
|
90
|
-
def test_table_comparability
|
91
|
-
rows_base = [["a", "b", "c"], ['', 'd', '']]
|
92
|
-
rows_equal = [["a", "b", "c"], ['', 'd']]
|
93
|
-
rows_equal_padded = [['', "a", "b", "c"], ['', '', 'd']]
|
94
|
-
rows_unequal_one = [["a", "b", "c"], ['d']]
|
95
|
-
rows_unequal_two = [["a", "b", "c"], ['d', '']]
|
96
|
-
rows_unequal_three = [["a", "b", "c"], ['d'], ['a','b', 'd']]
|
97
|
-
rows_unequal_four = [["a", "b", "c"]]
|
98
|
-
|
99
|
-
table_base = Tabula::Table.new_from_array(rows_base)
|
100
|
-
table_equal = Tabula::Table.new_from_array(rows_equal)
|
101
|
-
table_equal_column_padded = Tabula::Table.new_from_array(rows_equal_padded)
|
102
|
-
table_unequal_one = Tabula::Table.new_from_array(rows_unequal_one)
|
103
|
-
table_unequal_two = Tabula::Table.new_from_array(rows_unequal_two)
|
104
|
-
table_unequal_three = Tabula::Table.new_from_array(rows_unequal_three)
|
105
|
-
table_unequal_four = Tabula::Table.new_from_array(rows_unequal_four)
|
106
|
-
|
107
|
-
assert_equal table_base, table_equal
|
108
|
-
assert_equal table_base, table_equal_column_padded
|
109
|
-
refute_equal table_base, table_unequal_one
|
110
|
-
refute_equal table_base, table_unequal_two
|
111
|
-
refute_equal table_base, table_unequal_three
|
112
|
-
refute_equal table_base, table_unequal_four
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
class TestPagesInfoExtractor < Minitest::Test
|
117
|
-
def test_pages_info_extractor
|
118
|
-
extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
119
|
-
|
120
|
-
i = 0
|
121
|
-
extractor.pages.each do |page|
|
122
|
-
assert_instance_of Tabula::Page, page
|
123
|
-
i += 1
|
124
|
-
end
|
125
|
-
assert_equal 2, i
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
class TestTableGuesser < Minitest::Test
|
130
|
-
def test_find_rects_from_lines_with_lsd
|
131
|
-
skip "Skipping until we actually use LSD"
|
132
|
-
filename = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
133
|
-
page_index = 0
|
134
|
-
lines = Tabula::Extraction::LineExtractor.lines_in_pdf_page(filename, page_index, :render_pdf => true)
|
135
|
-
|
136
|
-
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
137
|
-
page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
|
138
|
-
expected_page_areas = [[54.087890625, 50.203125, 734.220703125, 550.44140625]]
|
139
|
-
assert_equal expected_page_areas, page_areas
|
140
|
-
end
|
141
|
-
|
142
|
-
end
|
143
|
-
|
144
|
-
class TestDumper < Minitest::Test
|
145
|
-
|
146
|
-
def test_extractor
|
147
|
-
extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
148
|
-
page = extractor.extract.next
|
149
|
-
assert_instance_of Tabula::Page, page
|
150
|
-
end
|
151
|
-
|
152
|
-
def test_get_by_area
|
153
|
-
extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
154
|
-
characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
|
155
|
-
assert_equal characters.size, 206
|
156
|
-
end
|
157
|
-
end
|
158
|
-
|
159
|
-
class TestRulingIntersection < Minitest::Test
|
160
|
-
def test_ruling_intersection
|
161
|
-
horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
|
162
|
-
verticals = [Tabula::Ruling.new(1, 3, 0, 11),
|
163
|
-
Tabula::Ruling.new(1, 4, 0, 11)]
|
164
|
-
ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
|
165
|
-
assert_equal 2, ints.size
|
166
|
-
assert_equal ints[0][0].getX, 3.0
|
167
|
-
assert_equal ints[0][0].getY, 10.0
|
168
|
-
assert_equal ints[1][0].getX, 4.0
|
169
|
-
assert_equal ints[1][0].getY, 10.0
|
170
|
-
|
171
|
-
verticals = [Tabula::Ruling.new(20, 3, 0, 11)]
|
172
|
-
ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
|
173
|
-
assert_equal ints.size, 0
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
class TestExtractor < Minitest::Test
|
178
|
-
|
179
|
-
def test_table_extraction_1
|
180
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
|
181
|
-
1,
|
182
|
-
[107.1, 57.9214, 394.5214, 290.7],
|
183
|
-
:detect_ruling_lines => false)
|
184
|
-
|
185
|
-
expected = [["Prior Scale","New Scale","% Rank*"], ["800","170","99"], ["790","170","99"], ["780","170","99"], ["770","170","99"], ["760","170","99"], ["750","169","99"], ["740","169","99"], ["730","168","98"], ["720","168","98"], ["710","167","97"], ["700","166","96"], ["690","165","95"], ["680","165","95"], ["670","164","93"], ["660","164","93"], ["650","163","91"]]
|
186
|
-
|
187
|
-
assert_equal expected, table
|
188
|
-
end
|
189
|
-
|
190
|
-
def test_diputados_voting_record
|
191
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
|
192
|
-
1,
|
193
|
-
[269.875, 12.75, 790.5, 561])
|
194
|
-
|
195
|
-
expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
|
196
|
-
|
197
|
-
assert_equal expected, table
|
198
|
-
end
|
199
|
-
|
200
|
-
def test_forest_disclosure_report_dont_regress
|
201
|
-
# this is the current state of the expected output. Ideally the output should be like
|
202
|
-
# test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
|
203
|
-
# and a solution for half-x-height-offset lines.
|
204
|
-
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
205
|
-
|
206
|
-
table = Tabula.extract_table(pdf_file_path,
|
207
|
-
1,
|
208
|
-
[106.01, 48.09, 227.31, 551.89],
|
209
|
-
:detect_ruling_lines => true,
|
210
|
-
:extraction_method => "original")
|
211
|
-
|
212
|
-
expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
|
213
|
-
|
214
|
-
assert_equal expected, table
|
215
|
-
end
|
216
|
-
|
217
|
-
def test_missing_spaces_around_an_ampersand
|
218
|
-
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
219
|
-
character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
|
220
|
-
page_obj = character_extractor.extract.next
|
221
|
-
lines = page_obj.ruling_lines
|
222
|
-
vertical_rulings = lines.select(&:vertical?)
|
223
|
-
|
224
|
-
area = [170, 28, 185, 833] #top left bottom right
|
225
|
-
|
226
|
-
expected = Tabula::Table.new_from_array([
|
227
|
-
["", "REGIONAL PULMONARY & SLEEP",],
|
228
|
-
["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
|
229
|
-
["", "MEDICINE", ],
|
230
|
-
])
|
231
|
-
|
232
|
-
assert_equal expected, lines_to_table(page_obj.get_area(area).make_table(:vertical_rulings => vertical_rulings))
|
233
|
-
end
|
234
|
-
|
235
|
-
def test_forest_disclosure_report
|
236
|
-
skip "Skipping until we support multiline cells"
|
237
|
-
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
238
|
-
character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
|
239
|
-
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
240
|
-
vertical_rulings = lines.select(&:vertical?) #.uniq{|line| (line.left / 10).round }
|
241
|
-
|
242
|
-
page_obj = character_extractor.extract.next
|
243
|
-
characters = page_obj.get_text([110, 28, 218, 833])
|
244
|
-
#top left bottom right
|
245
|
-
expected = Tabula::Table.new_from_array([
|
246
|
-
['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '', '$85.00'],
|
247
|
-
['TOTAL', '', '', '','$85.00'],
|
248
|
-
['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '', '$78.80'],
|
249
|
-
['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '', '$392.45'],
|
250
|
-
['TOTAL', '', '', '', '$471.25'],
|
251
|
-
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '', '$20.39'],
|
252
|
-
['TOTAL', '', '', '','$20.39'],
|
253
|
-
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '', '$310.33'],
|
254
|
-
['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '', '$4,700.00'],
|
255
|
-
['TOTAL', '', '', '', '$5,010.33'],
|
256
|
-
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '', '$193.67'],
|
257
|
-
['TOTAL', '', '', '', '$193.67'],
|
258
|
-
['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '', '$19.50']
|
259
|
-
])
|
260
|
-
|
261
|
-
assert_equal expected, lines_to_table(Tabula.make_table(characters, :vertical_rulings => vertical_rulings))
|
262
|
-
end
|
263
|
-
|
264
|
-
# TODO Spaces inserted in words - fails
|
265
|
-
def test_bo_page24
|
266
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
|
267
|
-
1,
|
268
|
-
[425.625, 53.125, 575.714, 810.535],
|
269
|
-
:detect_ruling_lines => false)
|
270
|
-
|
271
|
-
expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
|
272
|
-
|
273
|
-
assert_equal expected, table
|
274
|
-
end
|
275
|
-
|
276
|
-
|
277
|
-
def test_vertical_rulings_splitting_words
|
278
|
-
#if a vertical ruling crosses over a word, the word should be split at that vertical ruling
|
279
|
-
# before, the entire word would end up on one side of the vertical ruling.
|
280
|
-
pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))
|
281
|
-
|
282
|
-
#both of these are semantically "correct"; the difference is in how we handle multi-line cells
|
283
|
-
expected = Tabula::Table.new_from_array([
|
284
|
-
["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
|
285
|
-
["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
|
286
|
-
["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
|
287
|
-
])
|
288
|
-
other_expected = Tabula::Table.new_from_array([
|
289
|
-
["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
|
290
|
-
["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
|
291
|
-
["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH", "", "", "$22.93", "", "", "$22.93"],
|
292
|
-
["", "", "", "ABRAHAMSON"]
|
293
|
-
])
|
294
|
-
|
295
|
-
#N.B. it's "MORGANTOWN", "WV" that we're most interested in here (it used to show up as ["MORGANTOWNWV", "", ""])
|
296
|
-
|
297
|
-
|
298
|
-
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2) #:all ) # 1..2643
|
299
|
-
extractor.extract.each_with_index do |pdf_page, page_index|
|
300
|
-
|
301
|
-
page_areas = [[250, 0, 325, 1700]]
|
302
|
-
|
303
|
-
scale_factor = pdf_page.width / 1700
|
304
|
-
|
305
|
-
vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000)}
|
306
|
-
|
307
|
-
tables = page_areas.map do |page_area|
|
308
|
-
pdf_page.get_area(page_area).make_table(:vertical_rulings => vertical_rulings)
|
309
|
-
end
|
310
|
-
assert_equal expected, lines_to_table(tables.first)
|
311
|
-
end
|
312
|
-
end
|
313
|
-
|
314
|
-
def test_vertical_rulings_prevent_merging_of_columns
|
315
|
-
expected = [["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"], ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]]
|
316
|
-
|
317
|
-
vertical_rulings = [47,147,256,310,375,431,504].map{|n| Tabula::Ruling.new(0, n, 0, 1000)}
|
318
|
-
|
319
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
|
320
|
-
1,
|
321
|
-
[255.57,40.43,398.76,557.35],
|
322
|
-
:vertical_rulings => vertical_rulings)
|
323
|
-
|
324
|
-
assert_equal expected, table
|
325
|
-
end
|
326
|
-
|
327
|
-
def test_get_spacing_and_merging_right
|
328
|
-
table = table_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
|
329
|
-
1,
|
330
|
-
[52.32857142857143,15.557142857142859,128.70000000000002,767.9571428571429],
|
331
|
-
:detect_ruling_lines => true)
|
332
|
-
|
333
|
-
expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]
|
334
|
-
|
335
|
-
assert_equal expected, table
|
336
|
-
|
337
|
-
end
|
338
|
-
|
339
|
-
|
340
|
-
class SpreadsheetsHasCellsTester
|
341
|
-
include Tabula::HasCells
|
342
|
-
attr_accessor :cells
|
343
|
-
def initialize(cells)
|
344
|
-
@cells = cells
|
345
|
-
end
|
346
|
-
end
|
347
|
-
|
348
|
-
#just tests the algorithm
|
349
|
-
# Feeds a hand-written list of cells to find_spreadsheets_from_cells (via
# SpreadsheetsHasCellsTester) and checks that two spreadsheets come back
# with the expected bounds. Only x, y, width and height are compared.
def test_cells_to_spreadsheets
  # Constructor arguments for every cell, in the order the cells are handed
  # to the algorithm.
  # NOTE(review): presumably (top, left, width, height) -- confirm against
  # Tabula::Cell.
  cell_args = [
    [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0],
    [50.0, 18.0, 52.0, 4.0], [54.0, 18.0, 52.0, 6.0],
    [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
    [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0],
    [90.0, 18.0, 52.0, 4.0], [94.0, 18.0, 52.0, 6.0],
    [100.0, 18.0, 52.0, 28.0], [128.0, 18.0, 52.0, 4.0],
    [132.0, 18.0, 52.0, 64.0], [196.0, 18.0, 52.0, 66.0],
    [262.0, 18.0, 52.0, 4.0], [266.0, 18.0, 52.0, 84.0],
    [350.0, 18.0, 52.0, 4.0], [354.0, 18.0, 52.0, 32.0],
    [386.0, 18.0, 52.0, 38.0], [424.0, 18.0, 52.0, 18.0],
    [442.0, 18.0, 52.0, 74.0], [516.0, 18.0, 52.0, 28.0],
    [544.0, 18.0, 52.0, 4.0], [44.0, 70.0, 156.0, 6.0],
    [50.0, 70.0, 156.0, 4.0], [54.0, 70.0, 156.0, 6.0],
    [60.0, 70.0, 156.0, 4.0], [64.0, 70.0, 156.0, 6.0],
    [70.0, 70.0, 156.0, 4.0], [74.0, 70.0, 156.0, 6.0],
    [84.0, 70.0, 2.0, 6.0], [90.0, 70.0, 156.0, 4.0],
    [94.0, 70.0, 156.0, 6.0], [100.0, 70.0, 156.0, 28.0],
    [128.0, 70.0, 156.0, 4.0], [132.0, 70.0, 156.0, 64.0],
    [196.0, 70.0, 156.0, 66.0], [262.0, 70.0, 156.0, 4.0],
    [266.0, 70.0, 156.0, 84.0], [350.0, 70.0, 156.0, 4.0],
    [354.0, 70.0, 156.0, 32.0], [386.0, 70.0, 156.0, 38.0],
    [424.0, 70.0, 156.0, 18.0], [442.0, 70.0, 156.0, 74.0],
    [516.0, 70.0, 156.0, 28.0], [544.0, 70.0, 156.0, 4.0],
    [84.0, 72.0, 446.0, 6.0], [90.0, 226.0, 176.0, 4.0],
    [94.0, 226.0, 176.0, 6.0], [100.0, 226.0, 176.0, 28.0],
    [128.0, 226.0, 176.0, 4.0], [132.0, 226.0, 176.0, 64.0],
    [196.0, 226.0, 176.0, 66.0], [262.0, 226.0, 176.0, 4.0],
    [266.0, 226.0, 176.0, 84.0], [350.0, 226.0, 176.0, 4.0],
    [354.0, 226.0, 176.0, 32.0], [386.0, 226.0, 176.0, 38.0],
    [424.0, 226.0, 176.0, 18.0], [442.0, 226.0, 176.0, 74.0],
    [516.0, 226.0, 176.0, 28.0], [544.0, 226.0, 176.0, 4.0],
    [90.0, 402.0, 116.0, 4.0], [94.0, 402.0, 116.0, 6.0],
    [100.0, 402.0, 116.0, 28.0], [128.0, 402.0, 116.0, 4.0],
    [132.0, 402.0, 116.0, 64.0], [196.0, 402.0, 116.0, 66.0],
    [262.0, 402.0, 116.0, 4.0], [266.0, 402.0, 116.0, 84.0],
    [350.0, 402.0, 116.0, 4.0], [354.0, 402.0, 116.0, 32.0],
    [386.0, 402.0, 116.0, 38.0], [424.0, 402.0, 116.0, 18.0],
    [442.0, 402.0, 116.0, 74.0], [516.0, 402.0, 116.0, 28.0],
    [544.0, 402.0, 116.0, 4.0], [84.0, 518.0, 246.0, 6.0],
    [90.0, 518.0, 186.0, 4.0], [94.0, 518.0, 186.0, 6.0],
    [100.0, 518.0, 186.0, 28.0], [128.0, 518.0, 186.0, 4.0],
    [132.0, 518.0, 186.0, 64.0], [196.0, 518.0, 186.0, 66.0],
    [262.0, 518.0, 186.0, 4.0], [266.0, 518.0, 186.0, 84.0],
    [350.0, 518.0, 186.0, 4.0], [354.0, 518.0, 186.0, 32.0],
    [386.0, 518.0, 186.0, 38.0], [424.0, 518.0, 186.0, 18.0],
    [442.0, 518.0, 186.0, 74.0], [516.0, 518.0, 186.0, 28.0],
    [544.0, 518.0, 186.0, 4.0], [90.0, 704.0, 60.0, 4.0],
    [94.0, 704.0, 60.0, 6.0], [100.0, 704.0, 60.0, 28.0],
    [128.0, 704.0, 60.0, 4.0], [132.0, 704.0, 60.0, 64.0],
    [196.0, 704.0, 60.0, 66.0], [262.0, 704.0, 60.0, 4.0],
    [266.0, 704.0, 60.0, 84.0], [350.0, 704.0, 60.0, 4.0],
    [354.0, 704.0, 60.0, 32.0], [386.0, 704.0, 60.0, 38.0],
    [424.0, 704.0, 60.0, 18.0], [442.0, 704.0, 60.0, 74.0],
    [516.0, 704.0, 60.0, 28.0], [544.0, 704.0, 60.0, 4.0],
    [84.0, 764.0, 216.0, 6.0], [90.0, 764.0, 216.0, 4.0],
    [94.0, 764.0, 216.0, 6.0], [100.0, 764.0, 216.0, 28.0],
    [128.0, 764.0, 216.0, 4.0], [132.0, 764.0, 216.0, 64.0],
    [196.0, 764.0, 216.0, 66.0], [262.0, 764.0, 216.0, 4.0],
    [266.0, 764.0, 216.0, 84.0], [350.0, 764.0, 216.0, 4.0],
    [354.0, 764.0, 216.0, 32.0], [386.0, 764.0, 216.0, 38.0],
    [424.0, 764.0, 216.0, 18.0], [442.0, 764.0, 216.0, 74.0],
    [516.0, 764.0, 216.0, 28.0], [544.0, 764.0, 216.0, 4.0]
  ]
  cells = cell_args.map { |args| Tabula::Cell.new(*args) }

  expected_spreadsheets = [
    Tabula::Spreadsheet.new(40.0, 18.0, 208.0, 40.0, nil, nil, nil, nil),
    Tabula::Spreadsheet.new(84.0, 18.0, 962.0, 464.0, nil, nil, nil, nil)
  ]

  # Spreadsheets are compared on area only.
  area_of = lambda { |rect| [rect.x, rect.y, rect.width, rect.height] }
  expected_areas = expected_spreadsheets.map { |sheet| area_of.call(sheet) }
  found = SpreadsheetsHasCellsTester.new(cells).find_spreadsheets_from_cells
  actual_areas = found.map { |shape| area_of.call(shape.getBounds) }

  assert_equal expected_areas, actual_areas
end
|
424
|
-
|
425
|
-
# Placeholder: always skipped, because the test body has not been written.
def test_add_spanning_cells
  skip("until I write it")
end
|
428
|
-
|
429
|
-
# Placeholder: always skipped. The skip message names 01005787B_Pakistan.pdf
# as the document to look at when the test is eventually written.
def test_add_placeholder_cells_to_funny_shaped_tables
  skip("until I write it, cf 01005787B_Pakistan.pdf")
end
|
432
|
-
|
433
|
-
# Minimal host for the Tabula::HasCells mixin: it holds two lists of ruling
# lines and an initially empty cell list, then calls find_cells! as soon as
# it is constructed.
class CellsHasCellsTester
  include Tabula::HasCells

  attr_accessor :vertical_ruling_lines, :horizontal_ruling_lines, :cells

  # vertical_ruling_lines   - rulings exposed through #vertical_ruling_lines.
  # horizontal_ruling_lines - rulings exposed through #horizontal_ruling_lines.
  def initialize(vertical_ruling_lines, horizontal_ruling_lines)
    @cells = []
    @vertical_ruling_lines = vertical_ruling_lines
    @horizontal_ruling_lines = horizontal_ruling_lines
    # NOTE(review): find_cells! presumably fills @cells from the rulings;
    # it comes from the mixin and is not visible here -- confirm.
    find_cells!
  end
end
|
443
|
-
|
444
|
-
#just tests the algorithm
|
445
|
-
# Builds three vertical and nine horizontal rulings by hand, derives cells
# from them through CellsHasCellsTester, and compares the result with the
# expected cells as a Set, so ordering does not matter.
def test_lines_to_cells
  vertical_ruling_lines = [
    [40.0, 18.0, 0.0, 40.0],
    [44.0, 70.0, 0.0, 36.0],
    [40.0, 226.0, 0.0, 40.0]
  ].map { |args| Tabula::Ruling.new(*args) }

  # Every horizontal ruling shares the last three arguments; only the first
  # one varies.
  horizontal_ruling_lines = [40.0, 44.0, 50.0, 54.0, 60.0, 64.0, 70.0, 74.0, 80.0].map do |first_arg|
    Tabula::Ruling.new(first_arg, 18.0, 208.0, 0.0)
  end

  expected_cells = [
    [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0],
    [50.0, 18.0, 52.0, 4.0], [54.0, 18.0, 52.0, 6.0],
    [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
    [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0],
    [44.0, 70.0, 156.0, 6.0], [50.0, 70.0, 156.0, 4.0],
    [54.0, 70.0, 156.0, 6.0], [60.0, 70.0, 156.0, 4.0],
    [64.0, 70.0, 156.0, 6.0], [70.0, 70.0, 156.0, 4.0],
    [74.0, 70.0, 156.0, 6.0]
  ].map { |args| Tabula::Cell.new(*args) }

  tester = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines)
  actual_cells = tester.cells

  # Order is irrelevant, hence the Set comparison.
  assert_equal Set.new(expected_cells), Set.new(actual_cells)
end
|
472
|
-
|
473
|
-
#this is the real deal!!
|
474
|
-
# End-to-end check: for every page of frx_2012_disclosure.pdf, the first
# spreadsheet found on the page must serialize (to_tsv) to exactly the
# contents of frx_2012_disclosure.tsv.
def test_extract_tabular_data_using_lines_and_spreadsheets
  pdf_file_path = "./test/data/frx_2012_disclosure.pdf"
  expected_data_path = "./test/data/frx_2012_disclosure.tsv"
  # File.read opens, reads and closes the file in one call. The previous
  # `open(path, 'r').read` never closed the handle it opened.
  expected = File.read(expected_data_path)

  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all).extract.each do |pdf_page|
    spreadsheet = pdf_page.spreadsheets.first
    assert_equal expected, spreadsheet.to_tsv
  end
end
|
484
|
-
|
485
|
-
# A page without tables must yield zero spreadsheets rather than failing.
def test_cope_with_a_tableless_page
  pdf_file_path = "./test/data/no_tables.pdf"

  # True only when every color component is below 0.1.
  # NOTE(review): presumably this restricts ruling detection to near-black
  # lines -- confirm how ObjectExtractor applies :line_color_filter.
  dark_components_only = lambda { |components| components.all? { |c| c < 0.1 } }

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all, '',
                                                      :line_color_filter => dark_components_only)
  first_page = extractor.extract.to_a.first
  spreadsheets = first_page.spreadsheets

  assert_equal 0, spreadsheets.size
end
|
494
|
-
|
495
|
-
# The first spreadsheet on page 1 of spanning_cells.pdf must serialize
# (to_csv) to exactly the contents of spanning_cells.csv.
def test_spanning_cells
  pdf_file_path = "./test/data/spanning_cells.pdf"
  expected_data_path = "./test/data/spanning_cells.csv"
  # File.read opens, reads and closes the file in one call. The previous
  # `open(path, 'r').read` never closed the handle it opened.
  expected = File.read(expected_data_path)

  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
    spreadsheet = pdf_page.spreadsheets.first
    assert_equal expected, spreadsheet.to_csv
  end
end
|
505
|
-
|
506
|
-
# Crops the ruling lines of page 1 of puertos1.pdf to a fixed area and
# expects exactly 15 of the cropped rulings to report vertical?.
def test_almost_vertical_lines
  pdf_file_path = "./test/data/puertos1.pdf"
  top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286

  width = right - left
  height = bottom - top
  area = Tabula::ZoneEntity.new(top, left, width, height)

  pages = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract
  pages.each do |pdf_page|
    rulings = Tabula::Ruling.crop_rulings_to_area(pdf_page.ruling_lines, area)
    vertical_rulings = rulings.select(&:vertical?)
    # TODO assertion not entirely correct, should do the trick for now
    assert_equal 15, vertical_rulings.count
  end
end
|
518
|
-
|
519
|
-
# Restricts page 1 of puertos1.pdf to a fixed area, takes the first
# spreadsheet found there, and checks its row count plus its first and
# last rows.
def test_extract_spreadsheet_within_an_area
  pdf_file_path = "./test/data/puertos1.pdf"
  top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286

  first_row = ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"]
  last_row = ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"]

  pages = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract
  pages.each do |pdf_page|
    area = pdf_page.get_area([top, left, bottom, right])
    table = area.spreadsheets.first.to_a

    assert_equal 15, table.length
    assert_equal first_row, table.first
    assert_equal last_row, table.last
  end
end
|
531
|
-
|
532
|
-
# Extracts a fixed region of page 1 of nyc_2013fiscalreporttables.pdf with
# the 'original' extraction method and ruling detection off, then checks
# two cells of the second row. The test name indicates the PDF contains
# repeated text that must not show up twice in those cells.
def test_remove_repeated_text
  top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715

  table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
                               1,
                               [top, left, bottom, right],
                               :detect_ruling_lines => false,
                               :extraction_method => 'original')

  ary = table_to_array(table)
  # Minitest's signature is assert_equal(expected, actual). The arguments
  # used to be reversed, which mislabels the values in failure messages.
  assert_equal "$ 18,969,610", ary[1][1]
  assert_equal "$ 18,157,722", ary[1][2]
end
|
545
|
-
|
546
|
-
# Extracts a fixed region of page 1 of wc2012.pdf with the 'original'
# extraction method and ruling detection off, and checks that the first
# cell of the first row reads "Community development" once.
def test_remove_overlapping_text
  # one of those PDFs that put characters on top of another to make text "bold"
  top, left, bottom, right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
  table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
                               1,
                               [top, left, bottom, right],
                               :detect_ruling_lines => false,
                               :extraction_method => 'original')

  ary = table_to_array(table)
  # Minitest's signature is assert_equal(expected, actual). The arguments
  # used to be reversed, which mislabels the values in failure messages.
  assert_equal "Community development", ary.first.first
end
|
558
|
-
|
559
|
-
# Collects the text of every cell of every spreadsheet on page 1 of
# sydney_disclosure_contract.pdf with :use_line_returns switched on, and
# checks that multi-line cells keep their "\n" separators.
def test_cells_including_line_returns
  data = []
  pdf_file_path = "./test/data/sydney_disclosure_contract.pdf"
  cell_options = { :use_line_returns => true, :cell_debug => 0 }

  pages = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract
  pages.each do |pdf_page|
    pdf_page.spreadsheets.each do |spreadsheet|
      spreadsheet.cells.each do |cell|
        # The cell receives its text elements and options before its text
        # is read.
        cell.text_elements = pdf_page.get_cell_text(cell)
        cell.options = cell_options
        data.push(cell.text)
      end
    end
  end

  expected = [
    "1295",
    "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065",
    "N/A",
    "Effective Date: 13 May 2013 \nDuration: 15 Weeks",
    "Supply, Installation and Maintenance of Parking Ticket Machines",
    "$3,148,800.00exgst",
    "N/A",
    "N/A",
    "Open Tender \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"
  ]
  assert_equal expected, data
end
|
573
|
-
|
574
|
-
end
|
575
|
-
|
576
|
-
# Exercises Page#is_tabular? on the first page of several sample PDFs:
# the files in EXPECTED_TO_BE_SPREADSHEET must be reported as tabular and
# the files in NOT_EXPECTED_TO_BE_SPREADSHEET must not.
class TestIsTabularHeuristic < Minitest::Test

  # File names are relative to the data/ directory next to this test file.
  EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', 'GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf'].freeze
  NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf'].freeze

  # A bare `File.expand_path('data/frx_2012_disclosure.pdf', ...)` statement
  # used to sit here. Its result was discarded, so it has been removed.

  def test_heuristic_detects_spreadsheets
    EXPECTED_TO_BE_SPREADSHEET.each do |f|
      assert first_page_with_rulings(f).is_tabular?, "failed on file #{f}"
    end
  end

  def test_heuristic_detects_non_spreadsheets
    NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
      refute first_page_with_rulings(f).is_tabular?, "failed on file #{f}"
    end
  end

  private

  # Loads page 1 of data/<file_name>, calls get_ruling_lines! on it (as the
  # original tests did before querying the heuristic) and returns the page.
  def first_page_with_rulings(file_name)
    path = File.expand_path('data/' + file_name, File.dirname(__FILE__))
    extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
    page = extractor.extract.first
    page.get_ruling_lines!
    page
  end
end
|