tabula-extractor 0.7.2-java → 0.7.4-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +4 -8
  4. data/bin/tabula +3 -3
  5. data/lib/tabula.rb +9 -5
  6. data/lib/tabula/entities.rb +1 -0
  7. data/lib/tabula/entities/cell.rb +6 -4
  8. data/lib/tabula/entities/has_cells.rb +22 -78
  9. data/lib/tabula/entities/line.rb +52 -6
  10. data/lib/tabula/entities/page.rb +43 -50
  11. data/lib/tabula/entities/ruling.rb +83 -105
  12. data/lib/tabula/entities/spreadsheet.rb +74 -11
  13. data/lib/tabula/entities/table.rb +55 -37
  14. data/lib/tabula/entities/tabular.rb +42 -0
  15. data/lib/tabula/entities/text_chunk.rb +55 -52
  16. data/lib/tabula/entities/text_element.rb +129 -62
  17. data/lib/tabula/entities/zone_entity.rb +15 -6
  18. data/lib/tabula/extraction.rb +114 -49
  19. data/lib/tabula/line_segment_detector.rb +0 -5
  20. data/lib/tabula/table_extractor.rb +32 -37
  21. data/lib/tabula/version.rb +1 -1
  22. data/tabula-extractor.gemspec +2 -5
  23. metadata +13 -95
  24. data/ext/COPYING +0 -661
  25. data/ext/Makefile.OSX +0 -18
  26. data/ext/Makefile.defaults +0 -9
  27. data/ext/Makefile.linux32 +0 -11
  28. data/ext/Makefile.linux64 +0 -12
  29. data/ext/Makefile.mingw +0 -10
  30. data/ext/Makefile.mingw64 +0 -10
  31. data/ext/liblsd-linux32.so +0 -0
  32. data/ext/liblsd-linux64.so +0 -0
  33. data/ext/liblsd.def +0 -3
  34. data/ext/liblsd.dll +0 -0
  35. data/ext/liblsd.dylib +0 -0
  36. data/ext/liblsd64.dll +0 -0
  37. data/ext/lsd.c +0 -2270
  38. data/ext/lsd.h +0 -283
  39. data/test/data/47008204D_USA.page4.pdf +0 -0
  40. data/test/data/560015757GV_China.page1.pdf +0 -0
  41. data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
  42. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  43. data/test/data/S2MNCEbirdisland.pdf +0 -0
  44. data/test/data/argentina_diputados_voting_record.pdf +0 -0
  45. data/test/data/bo_page24.pdf +0 -0
  46. data/test/data/campaign_donors.pdf +0 -0
  47. data/test/data/frx_2012_disclosure.pdf +0 -0
  48. data/test/data/frx_2012_disclosure.tsv +0 -88
  49. data/test/data/gre.pdf +0 -0
  50. data/test/data/no_tables.pdf +0 -0
  51. data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
  52. data/test/data/puertos1.pdf +0 -0
  53. data/test/data/spanning_cells.csv +0 -21
  54. data/test/data/spanning_cells.pdf +0 -0
  55. data/test/data/strongschools.pdf +0 -0
  56. data/test/data/sydney_disclosure_contract.pdf +0 -0
  57. data/test/data/tabla_subsidios.pdf +0 -0
  58. data/test/data/vertical_rulings_bug.pdf +0 -0
  59. data/test/data/vietnam3.pdf +0 -0
  60. data/test/data/wc2012.pdf +0 -0
  61. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  62. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  63. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  64. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  65. data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
  66. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  67. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  68. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  69. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  70. data/test/heuristic.rb +0 -50
  71. data/test/test_bin_tabula.sh +0 -7
  72. data/test/tests.rb +0 -603
data/test/heuristic.rb DELETED
@@ -1,50 +0,0 @@
1
- #a list of filenames and the correct answer
2
- # no more bs.
3
- require_relative '../lib/tabula'
4
-
5
-
6
- should_use_spreadsheet = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "spreadsheet/*") ).map{|a| [a, true]}
7
- should_use_original = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "original/*") ).map{|a| [a, false]}
8
-
9
- correct = []
10
- misclassified_as_original = []
11
- misclassified_as_spreadsheet = []
12
-
13
-
14
-
15
- def heuristic(page)
16
- page.is_tabular?
17
- end
18
-
19
- (should_use_spreadsheet + should_use_original) .each do |filename, expected_to_be_tabular|
20
- extractor = Tabula::Extraction::ObjectExtractor.new(filename, [1])
21
-
22
- page = extractor.extract.first
23
- page.get_ruling_lines!
24
- # puts "#{File.basename(filename)} | #{expected_to_be_tabular}"
25
- page_is_tabular = heuristic(page)
26
- # puts ""
27
-
28
- if page_is_tabular && expected_to_be_tabular || !page_is_tabular && !expected_to_be_tabular
29
- correct << filename
30
- elsif page_is_tabular && !expected_to_be_tabular
31
- misclassified_as_spreadsheet << filename
32
- elsif !page_is_tabular && expected_to_be_tabular
33
- misclassified_as_original << filename
34
- end
35
- end
36
-
37
- puts "#{correct.size} PDFs were correctly classified"
38
- puts "#{misclassified_as_original.size + misclassified_as_spreadsheet.size} PDFs were incorrectly classified"
39
- unless misclassified_as_spreadsheet.empty?
40
- puts "#{misclassified_as_spreadsheet.size} PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"
41
- misclassified_as_spreadsheet.each do |filename|
42
- puts " - #{File.basename(filename)}"
43
- end
44
- end
45
- unless misclassified_as_original.empty?
46
- puts "#{misclassified_as_original.size} PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"
47
- misclassified_as_original.each do |filename|
48
- puts " - #{File.basename(filename)}"
49
- end
50
- end
data/test/test_bin_tabula.sh DELETED
@@ -1,7 +0,0 @@
1
- bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf --silent -o test.csv
2
- bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf -o test.csv
3
- bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv
4
- bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv --format TSV
5
- bin/tabula test/data/campaign_donors.pdf -o test.csv --columns 47,147,256,310,375,431,504 #columns should work
6
- bin/tabula test/data/argentina_diputados_voting_record.pdf --guess -o test.csv --format TSV #should exclude guff
7
- bin/tabula test/data/vertical_rulings_bug.pdf --area 250,0,325,1700 -o test.csv --format TSV #should be only a few lines
data/test/tests.rb DELETED
@@ -1,603 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- require 'minitest'
3
- require 'minitest/autorun'
4
-
5
- require_relative '../lib/tabula'
6
-
7
- def table_to_array(table)
8
- lines_to_array(table.rows)
9
- end
10
-
11
- def lines_to_array(lines)
12
- lines.map do |l|
13
- l.map { |te| te.text.strip }
14
- end
15
- end
16
-
17
- def lines_to_table(lines)
18
- Tabula::Table.new_from_array(lines_to_array(lines))
19
- end
20
-
21
-
22
- # I don't want to pollute the "real" class with a funny inspect method. Just for testing comparisons.
23
- module Tabula
24
- class Table
25
- def inspect
26
- "[" + lines.map(&:inspect).join(",") + "]"
27
- end
28
- end
29
- end
30
-
31
- module Tabula
32
- class Line
33
- def inspect
34
- @text_elements.map{|te| te.nil? ? '' : te.text}.inspect
35
- end
36
- end
37
- end
38
-
39
-
40
- class TestEntityComparability < Minitest::Test
41
- def test_text_element_comparability
42
- base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
43
-
44
- two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
45
- three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
46
- four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
47
-
48
- five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
49
- six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
50
- seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
51
- assert_equal base, two
52
- assert_equal base, three
53
- assert_equal base, four
54
-
55
- refute_equal base, five
56
- refute_equal base, six
57
- refute_equal base, seven
58
- end
59
-
60
- def test_line_comparability
61
- text_base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
62
-
63
- text_two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
64
- text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
65
- text_four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
66
-
67
- text_five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
68
- text_six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
69
- text_seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
70
- line_base = Tabula::Line.new
71
- line_base.text_elements = [text_base, text_two, text_three]
72
- line_equal = Tabula::Line.new
73
- line_equal.text_elements = [text_base, text_two, text_three]
74
- line_equal_but_longer = Tabula::Line.new
75
- line_equal_but_longer.text_elements = [text_base, text_two, text_three, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
76
- line_unequal = Tabula::Line.new
77
- line_unequal.text_elements = [text_base, text_two, text_three, text_five]
78
- line_unequal_and_longer = Tabula::Line.new
79
- line_unequal_and_longer.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
80
- line_unequal_and_longer_and_different = Tabula::Line.new
81
- line_unequal_and_longer_and_different.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, 'whatever']
82
-
83
- assert_equal line_base, line_equal
84
- assert_equal line_base, line_equal_but_longer
85
- refute_equal line_base, line_unequal
86
- refute_equal line_base, line_unequal_and_longer
87
- refute_equal line_base, line_unequal_and_longer_and_different
88
- end
89
-
90
- def test_table_comparability
91
- rows_base = [["a", "b", "c"], ['', 'd', '']]
92
- rows_equal = [["a", "b", "c"], ['', 'd']]
93
- rows_equal_padded = [['', "a", "b", "c"], ['', '', 'd']]
94
- rows_unequal_one = [["a", "b", "c"], ['d']]
95
- rows_unequal_two = [["a", "b", "c"], ['d', '']]
96
- rows_unequal_three = [["a", "b", "c"], ['d'], ['a','b', 'd']]
97
- rows_unequal_four = [["a", "b", "c"]]
98
-
99
- table_base = Tabula::Table.new_from_array(rows_base)
100
- table_equal = Tabula::Table.new_from_array(rows_equal)
101
- table_equal_column_padded = Tabula::Table.new_from_array(rows_equal_padded)
102
- table_unequal_one = Tabula::Table.new_from_array(rows_unequal_one)
103
- table_unequal_two = Tabula::Table.new_from_array(rows_unequal_two)
104
- table_unequal_three = Tabula::Table.new_from_array(rows_unequal_three)
105
- table_unequal_four = Tabula::Table.new_from_array(rows_unequal_four)
106
-
107
- assert_equal table_base, table_equal
108
- assert_equal table_base, table_equal_column_padded
109
- refute_equal table_base, table_unequal_one
110
- refute_equal table_base, table_unequal_two
111
- refute_equal table_base, table_unequal_three
112
- refute_equal table_base, table_unequal_four
113
- end
114
- end
115
-
116
- class TestPagesInfoExtractor < Minitest::Test
117
- def test_pages_info_extractor
118
- extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
119
-
120
- i = 0
121
- extractor.pages.each do |page|
122
- assert_instance_of Tabula::Page, page
123
- i += 1
124
- end
125
- assert_equal 2, i
126
- end
127
- end
128
-
129
- class TestTableGuesser < Minitest::Test
130
- def test_find_rects_from_lines_with_lsd
131
- skip "Skipping until we actually use LSD"
132
- filename = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
133
- page_index = 0
134
- lines = Tabula::Extraction::LineExtractor.lines_in_pdf_page(filename, page_index, :render_pdf => true)
135
-
136
- page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
137
- page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
138
- expected_page_areas = [[54.087890625, 50.203125, 734.220703125, 550.44140625]]
139
- assert_equal expected_page_areas, page_areas
140
- end
141
-
142
- end
143
-
144
- class TestDumper < Minitest::Test
145
-
146
- def test_extractor
147
- extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
148
- page = extractor.extract.next
149
- assert_instance_of Tabula::Page, page
150
- end
151
-
152
- def test_get_by_area
153
- extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
154
- characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
155
- assert_equal characters.size, 206
156
- end
157
- end
158
-
159
- class TestRulingIntersection < Minitest::Test
160
- def test_ruling_intersection
161
- horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
162
- verticals = [Tabula::Ruling.new(1, 3, 0, 11),
163
- Tabula::Ruling.new(1, 4, 0, 11)]
164
- ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
165
- assert_equal 2, ints.size
166
- assert_equal ints[0][0].getX, 3.0
167
- assert_equal ints[0][0].getY, 10.0
168
- assert_equal ints[1][0].getX, 4.0
169
- assert_equal ints[1][0].getY, 10.0
170
-
171
- verticals = [Tabula::Ruling.new(20, 3, 0, 11)]
172
- ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
173
- assert_equal ints.size, 0
174
- end
175
- end
176
-
177
- class TestExtractor < Minitest::Test
178
-
179
- def test_table_extraction_1
180
- table = table_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
181
- 1,
182
- [107.1, 57.9214, 394.5214, 290.7],
183
- :detect_ruling_lines => false)
184
-
185
- expected = [["Prior Scale","New Scale","% Rank*"], ["800","170","99"], ["790","170","99"], ["780","170","99"], ["770","170","99"], ["760","170","99"], ["750","169","99"], ["740","169","99"], ["730","168","98"], ["720","168","98"], ["710","167","97"], ["700","166","96"], ["690","165","95"], ["680","165","95"], ["670","164","93"], ["660","164","93"], ["650","163","91"]]
186
-
187
- assert_equal expected, table
188
- end
189
-
190
- def test_diputados_voting_record
191
- table = table_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
192
- 1,
193
- [269.875, 12.75, 790.5, 561])
194
-
195
- expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
196
-
197
- assert_equal expected, table
198
- end
199
-
200
- def test_forest_disclosure_report_dont_regress
201
- # this is the current state of the expected output. Ideally the output should be like
202
- # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
203
- # and a solution for half-x-height-offset lines.
204
- pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
205
-
206
- table = Tabula.extract_table(pdf_file_path,
207
- 1,
208
- [106.01, 48.09, 227.31, 551.89],
209
- :detect_ruling_lines => true,
210
- :extraction_method => "original")
211
-
212
- expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
213
-
214
- assert_equal expected, table
215
- end
216
-
217
- def test_missing_spaces_around_an_ampersand
218
- pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
219
- character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
220
- page_obj = character_extractor.extract.next
221
- lines = page_obj.ruling_lines
222
- vertical_rulings = lines.select(&:vertical?)
223
-
224
- area = [170, 28, 185, 833] #top left bottom right
225
-
226
- expected = Tabula::Table.new_from_array([
227
- ["", "REGIONAL PULMONARY & SLEEP",],
228
- ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
229
- ["", "MEDICINE", ],
230
- ])
231
-
232
- assert_equal expected, lines_to_table(page_obj.get_area(area).make_table(:vertical_rulings => vertical_rulings))
233
- end
234
-
235
- def test_forest_disclosure_report
236
- skip "Skipping until we support multiline cells"
237
- pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
238
- character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
239
- lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
240
- vertical_rulings = lines.select(&:vertical?) #.uniq{|line| (line.left / 10).round }
241
-
242
- page_obj = character_extractor.extract.next
243
- characters = page_obj.get_text([110, 28, 218, 833])
244
- #top left bottom right
245
- expected = Tabula::Table.new_from_array([
246
- ['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '', '$85.00'],
247
- ['TOTAL', '', '', '','$85.00'],
248
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '', '$78.80'],
249
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '', '$392.45'],
250
- ['TOTAL', '', '', '', '$471.25'],
251
- ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '', '$20.39'],
252
- ['TOTAL', '', '', '','$20.39'],
253
- ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '', '$310.33'],
254
- ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '', '$4,700.00'],
255
- ['TOTAL', '', '', '', '$5,010.33'],
256
- ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '', '$193.67'],
257
- ['TOTAL', '', '', '', '$193.67'],
258
- ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '', '$19.50']
259
- ])
260
-
261
- assert_equal expected, lines_to_table(Tabula.make_table(characters, :vertical_rulings => vertical_rulings))
262
- end
263
-
264
- # TODO Spaces inserted in words - fails
265
- def test_bo_page24
266
- table = table_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
267
- 1,
268
- [425.625, 53.125, 575.714, 810.535],
269
- :detect_ruling_lines => false)
270
-
271
- expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
272
-
273
- assert_equal expected, table
274
- end
275
-
276
-
277
- def test_vertical_rulings_splitting_words
278
- #if a vertical ruling crosses over a word, the word should be split at that vertical ruling
279
- # before, the entire word would end up on one side of the vertical ruling.
280
- pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))
281
-
282
- #both of these are semantically "correct"; the difference is in how we handle multi-line cells
283
- expected = Tabula::Table.new_from_array([
284
- ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
285
- ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
286
- ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
287
- ])
288
- other_expected = Tabula::Table.new_from_array([
289
- ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
290
- ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
291
- ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH", "", "", "$22.93", "", "", "$22.93"],
292
- ["", "", "", "ABRAHAMSON"]
293
- ])
294
-
295
- #N.B. it's "MORGANTOWN", "WV" that we're most interested in here (it used to show up as ["MORGANTOWNWV", "", ""])
296
-
297
-
298
- extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2) #:all ) # 1..2643
299
- extractor.extract.each_with_index do |pdf_page, page_index|
300
-
301
- page_areas = [[250, 0, 325, 1700]]
302
-
303
- scale_factor = pdf_page.width / 1700
304
-
305
- vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000)}
306
-
307
- tables = page_areas.map do |page_area|
308
- pdf_page.get_area(page_area).make_table(:vertical_rulings => vertical_rulings)
309
- end
310
- assert_equal expected, lines_to_table(tables.first)
311
- end
312
- end
313
-
314
- def test_vertical_rulings_prevent_merging_of_columns
315
- expected = [["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"], ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]]
316
-
317
- vertical_rulings = [47,147,256,310,375,431,504].map{|n| Tabula::Ruling.new(0, n, 0, 1000)}
318
-
319
- table = table_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
320
- 1,
321
- [255.57,40.43,398.76,557.35],
322
- :vertical_rulings => vertical_rulings)
323
-
324
- assert_equal expected, table
325
- end
326
-
327
- def test_get_spacing_and_merging_right
328
- table = table_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
329
- 1,
330
- [52.32857142857143,15.557142857142859,128.70000000000002,767.9571428571429],
331
- :detect_ruling_lines => true)
332
-
333
- expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]
334
-
335
- assert_equal expected, table
336
-
337
- end
338
-
339
-
340
- class SpreadsheetsHasCellsTester
341
- include Tabula::HasCells
342
- attr_accessor :cells
343
- def initialize(cells)
344
- @cells = cells
345
- end
346
- end
347
-
348
- #just tests the algorithm
349
- def test_cells_to_spreadsheets
350
-
351
- cells = [Tabula::Cell.new(40.0, 18.0, 208.0, 4.0), Tabula::Cell.new(44.0, 18.0, 52.0, 6.0),
352
- Tabula::Cell.new(50.0, 18.0, 52.0, 4.0), Tabula::Cell.new(54.0, 18.0, 52.0, 6.0),
353
- Tabula::Cell.new(60.0, 18.0, 52.0, 4.0), Tabula::Cell.new(64.0, 18.0, 52.0, 6.0),
354
- Tabula::Cell.new(70.0, 18.0, 52.0, 4.0), Tabula::Cell.new(74.0, 18.0, 52.0, 6.0),
355
- Tabula::Cell.new(90.0, 18.0, 52.0, 4.0), Tabula::Cell.new(94.0, 18.0, 52.0, 6.0),
356
- Tabula::Cell.new(100.0, 18.0, 52.0, 28.0), Tabula::Cell.new(128.0, 18.0, 52.0, 4.0),
357
- Tabula::Cell.new(132.0, 18.0, 52.0, 64.0), Tabula::Cell.new(196.0, 18.0, 52.0, 66.0),
358
- Tabula::Cell.new(262.0, 18.0, 52.0, 4.0), Tabula::Cell.new(266.0, 18.0, 52.0, 84.0),
359
- Tabula::Cell.new(350.0, 18.0, 52.0, 4.0), Tabula::Cell.new(354.0, 18.0, 52.0, 32.0),
360
- Tabula::Cell.new(386.0, 18.0, 52.0, 38.0), Tabula::Cell.new(424.0, 18.0, 52.0, 18.0),
361
- Tabula::Cell.new(442.0, 18.0, 52.0, 74.0), Tabula::Cell.new(516.0, 18.0, 52.0, 28.0),
362
- Tabula::Cell.new(544.0, 18.0, 52.0, 4.0), Tabula::Cell.new(44.0, 70.0, 156.0, 6.0),
363
- Tabula::Cell.new(50.0, 70.0, 156.0, 4.0), Tabula::Cell.new(54.0, 70.0, 156.0, 6.0),
364
- Tabula::Cell.new(60.0, 70.0, 156.0, 4.0), Tabula::Cell.new(64.0, 70.0, 156.0, 6.0),
365
- Tabula::Cell.new(70.0, 70.0, 156.0, 4.0), Tabula::Cell.new(74.0, 70.0, 156.0, 6.0),
366
- Tabula::Cell.new(84.0, 70.0, 2.0, 6.0), Tabula::Cell.new(90.0, 70.0, 156.0, 4.0),
367
- Tabula::Cell.new(94.0, 70.0, 156.0, 6.0), Tabula::Cell.new(100.0, 70.0, 156.0, 28.0),
368
- Tabula::Cell.new(128.0, 70.0, 156.0, 4.0), Tabula::Cell.new(132.0, 70.0, 156.0, 64.0),
369
- Tabula::Cell.new(196.0, 70.0, 156.0, 66.0), Tabula::Cell.new(262.0, 70.0, 156.0, 4.0),
370
- Tabula::Cell.new(266.0, 70.0, 156.0, 84.0), Tabula::Cell.new(350.0, 70.0, 156.0, 4.0),
371
- Tabula::Cell.new(354.0, 70.0, 156.0, 32.0), Tabula::Cell.new(386.0, 70.0, 156.0, 38.0),
372
- Tabula::Cell.new(424.0, 70.0, 156.0, 18.0), Tabula::Cell.new(442.0, 70.0, 156.0, 74.0),
373
- Tabula::Cell.new(516.0, 70.0, 156.0, 28.0), Tabula::Cell.new(544.0, 70.0, 156.0, 4.0),
374
- Tabula::Cell.new(84.0, 72.0, 446.0, 6.0), Tabula::Cell.new(90.0, 226.0, 176.0, 4.0),
375
- Tabula::Cell.new(94.0, 226.0, 176.0, 6.0), Tabula::Cell.new(100.0, 226.0, 176.0, 28.0),
376
- Tabula::Cell.new(128.0, 226.0, 176.0, 4.0), Tabula::Cell.new(132.0, 226.0, 176.0, 64.0),
377
- Tabula::Cell.new(196.0, 226.0, 176.0, 66.0), Tabula::Cell.new(262.0, 226.0, 176.0, 4.0),
378
- Tabula::Cell.new(266.0, 226.0, 176.0, 84.0), Tabula::Cell.new(350.0, 226.0, 176.0, 4.0),
379
- Tabula::Cell.new(354.0, 226.0, 176.0, 32.0), Tabula::Cell.new(386.0, 226.0, 176.0, 38.0),
380
- Tabula::Cell.new(424.0, 226.0, 176.0, 18.0), Tabula::Cell.new(442.0, 226.0, 176.0, 74.0),
381
- Tabula::Cell.new(516.0, 226.0, 176.0, 28.0), Tabula::Cell.new(544.0, 226.0, 176.0, 4.0),
382
- Tabula::Cell.new(90.0, 402.0, 116.0, 4.0), Tabula::Cell.new(94.0, 402.0, 116.0, 6.0),
383
- Tabula::Cell.new(100.0, 402.0, 116.0, 28.0), Tabula::Cell.new(128.0, 402.0, 116.0, 4.0),
384
- Tabula::Cell.new(132.0, 402.0, 116.0, 64.0), Tabula::Cell.new(196.0, 402.0, 116.0, 66.0),
385
- Tabula::Cell.new(262.0, 402.0, 116.0, 4.0), Tabula::Cell.new(266.0, 402.0, 116.0, 84.0),
386
- Tabula::Cell.new(350.0, 402.0, 116.0, 4.0), Tabula::Cell.new(354.0, 402.0, 116.0, 32.0),
387
- Tabula::Cell.new(386.0, 402.0, 116.0, 38.0), Tabula::Cell.new(424.0, 402.0, 116.0, 18.0),
388
- Tabula::Cell.new(442.0, 402.0, 116.0, 74.0), Tabula::Cell.new(516.0, 402.0, 116.0, 28.0),
389
- Tabula::Cell.new(544.0, 402.0, 116.0, 4.0), Tabula::Cell.new(84.0, 518.0, 246.0, 6.0),
390
- Tabula::Cell.new(90.0, 518.0, 186.0, 4.0), Tabula::Cell.new(94.0, 518.0, 186.0, 6.0),
391
- Tabula::Cell.new(100.0, 518.0, 186.0, 28.0), Tabula::Cell.new(128.0, 518.0, 186.0, 4.0),
392
- Tabula::Cell.new(132.0, 518.0, 186.0, 64.0), Tabula::Cell.new(196.0, 518.0, 186.0, 66.0),
393
- Tabula::Cell.new(262.0, 518.0, 186.0, 4.0), Tabula::Cell.new(266.0, 518.0, 186.0, 84.0),
394
- Tabula::Cell.new(350.0, 518.0, 186.0, 4.0), Tabula::Cell.new(354.0, 518.0, 186.0, 32.0),
395
- Tabula::Cell.new(386.0, 518.0, 186.0, 38.0), Tabula::Cell.new(424.0, 518.0, 186.0, 18.0),
396
- Tabula::Cell.new(442.0, 518.0, 186.0, 74.0), Tabula::Cell.new(516.0, 518.0, 186.0, 28.0),
397
- Tabula::Cell.new(544.0, 518.0, 186.0, 4.0), Tabula::Cell.new(90.0, 704.0, 60.0, 4.0),
398
- Tabula::Cell.new(94.0, 704.0, 60.0, 6.0), Tabula::Cell.new(100.0, 704.0, 60.0, 28.0),
399
- Tabula::Cell.new(128.0, 704.0, 60.0, 4.0), Tabula::Cell.new(132.0, 704.0, 60.0, 64.0),
400
- Tabula::Cell.new(196.0, 704.0, 60.0, 66.0), Tabula::Cell.new(262.0, 704.0, 60.0, 4.0),
401
- Tabula::Cell.new(266.0, 704.0, 60.0, 84.0), Tabula::Cell.new(350.0, 704.0, 60.0, 4.0),
402
- Tabula::Cell.new(354.0, 704.0, 60.0, 32.0), Tabula::Cell.new(386.0, 704.0, 60.0, 38.0),
403
- Tabula::Cell.new(424.0, 704.0, 60.0, 18.0), Tabula::Cell.new(442.0, 704.0, 60.0, 74.0),
404
- Tabula::Cell.new(516.0, 704.0, 60.0, 28.0), Tabula::Cell.new(544.0, 704.0, 60.0, 4.0),
405
- Tabula::Cell.new(84.0, 764.0, 216.0, 6.0), Tabula::Cell.new(90.0, 764.0, 216.0, 4.0),
406
- Tabula::Cell.new(94.0, 764.0, 216.0, 6.0), Tabula::Cell.new(100.0, 764.0, 216.0, 28.0),
407
- Tabula::Cell.new(128.0, 764.0, 216.0, 4.0), Tabula::Cell.new(132.0, 764.0, 216.0, 64.0),
408
- Tabula::Cell.new(196.0, 764.0, 216.0, 66.0), Tabula::Cell.new(262.0, 764.0, 216.0, 4.0),
409
- Tabula::Cell.new(266.0, 764.0, 216.0, 84.0), Tabula::Cell.new(350.0, 764.0, 216.0, 4.0),
410
- Tabula::Cell.new(354.0, 764.0, 216.0, 32.0), Tabula::Cell.new(386.0, 764.0, 216.0, 38.0),
411
- Tabula::Cell.new(424.0, 764.0, 216.0, 18.0), Tabula::Cell.new(442.0, 764.0, 216.0, 74.0),
412
- Tabula::Cell.new(516.0, 764.0, 216.0, 28.0), Tabula::Cell.new(544.0, 764.0, 216.0, 4.0)]
413
-
414
-
415
- expected_spreadsheets = [Tabula::Spreadsheet.new(40.0, 18.0, 208.0, 40.0, nil, nil, nil, nil),
416
- Tabula::Spreadsheet.new(84.0, 18.0, 962.0, 464.0,nil, nil, nil, nil)]
417
-
418
- #compares spreadsheets on area only.
419
- assert_equal expected_spreadsheets.map{|s| [s.x, s.y, s.width, s.height] },
420
- SpreadsheetsHasCellsTester.new(cells).find_spreadsheets_from_cells.map{|a| s = a.getBounds; [s.x, s.y, s.width, s.height] }
421
-
422
-
423
- end
424
-
425
- def test_add_spanning_cells
426
- skip "until I write it"
427
- end
428
-
429
- def test_add_placeholder_cells_to_funny_shaped_tables
430
- skip "until I write it, cf 01005787B_Pakistan.pdf"
431
- end
432
-
433
- class CellsHasCellsTester
434
- include Tabula::HasCells
435
- attr_accessor :vertical_ruling_lines, :horizontal_ruling_lines, :cells
436
- def initialize(vertical_ruling_lines, horizontal_ruling_lines)
437
- @cells = []
438
- @vertical_ruling_lines = vertical_ruling_lines
439
- @horizontal_ruling_lines = horizontal_ruling_lines
440
- find_cells!
441
- end
442
- end
443
-
444
- #just tests the algorithm
445
- def test_lines_to_cells
446
- vertical_ruling_lines = [ Tabula::Ruling.new(40.0, 18.0, 0.0, 40.0),
447
- Tabula::Ruling.new(44.0, 70.0, 0.0, 36.0),
448
- Tabula::Ruling.new(40.0, 226.0, 0.0, 40.0)]
449
-
450
- horizontal_ruling_lines = [ Tabula::Ruling.new(40.0, 18.0, 208.0, 0.0),
451
- Tabula::Ruling.new(44.0, 18.0, 208.0, 0.0),
452
- Tabula::Ruling.new(50.0, 18.0, 208.0, 0.0),
453
- Tabula::Ruling.new(54.0, 18.0, 208.0, 0.0),
454
- Tabula::Ruling.new(60.0, 18.0, 208.0, 0.0),
455
- Tabula::Ruling.new(64.0, 18.0, 208.0, 0.0),
456
- Tabula::Ruling.new(70.0, 18.0, 208.0, 0.0),
457
- Tabula::Ruling.new(74.0, 18.0, 208.0, 0.0),
458
- Tabula::Ruling.new(80.0, 18.0, 208.0, 0.0)]
459
-
460
- expected_cells = [Tabula::Cell.new(40.0, 18.0, 208.0, 4.0), Tabula::Cell.new(44.0, 18.0, 52.0, 6.0),
461
- Tabula::Cell.new(50.0, 18.0, 52.0, 4.0), Tabula::Cell.new(54.0, 18.0, 52.0, 6.0),
462
- Tabula::Cell.new(60.0, 18.0, 52.0, 4.0), Tabula::Cell.new(64.0, 18.0, 52.0, 6.0),
463
- Tabula::Cell.new(70.0, 18.0, 52.0, 4.0), Tabula::Cell.new(74.0, 18.0, 52.0, 6.0),
464
- Tabula::Cell.new(44.0, 70.0, 156.0, 6.0), Tabula::Cell.new(50.0, 70.0, 156.0, 4.0),
465
- Tabula::Cell.new(54.0, 70.0, 156.0, 6.0), Tabula::Cell.new(60.0, 70.0, 156.0, 4.0),
466
- Tabula::Cell.new(64.0, 70.0, 156.0, 6.0), Tabula::Cell.new(70.0, 70.0, 156.0, 4.0),
467
- Tabula::Cell.new(74.0, 70.0, 156.0, 6.0), ]
468
-
469
- actual_cells = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines).cells
470
- assert_equal Set.new(expected_cells), Set.new(actual_cells) #I don't care about order
471
- end
472
-
473
- #this is the real deal!!
474
- def test_extract_tabular_data_using_lines_and_spreadsheets
475
- pdf_file_path = "./test/data/frx_2012_disclosure.pdf"
476
- expected_data_path = "./test/data/frx_2012_disclosure.tsv"
477
- expected = open(expected_data_path, 'r').read #.split("\n").map{|line| line.split("\t")}
478
-
479
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all).extract.each do |pdf_page|
480
- spreadsheet = pdf_page.spreadsheets.first
481
- assert_equal expected, spreadsheet.to_tsv
482
- end
483
- end
484
-
485
- def test_cope_with_a_tableless_page
486
- pdf_file_path = "./test/data/no_tables.pdf"
487
-
488
- spreadsheets = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all, '',
489
- :line_color_filter => lambda{|components| components.all?{|c| c < 0.1}}
490
- ).extract.to_a.first.spreadsheets
491
-
492
- assert_equal 0, spreadsheets.size
493
- end
494
-
495
- def test_spanning_cells
496
- pdf_file_path = "./test/data/spanning_cells.pdf"
497
- expected_data_path = "./test/data/spanning_cells.csv"
498
- expected = open(expected_data_path, 'r').read
499
-
500
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
501
- spreadsheet = pdf_page.spreadsheets.first
502
- assert_equal expected, spreadsheet.to_csv
503
- end
504
- end
505
-
506
- def test_almost_vertical_lines
507
- pdf_file_path = "./test/data/puertos1.pdf"
508
- top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
509
- area = Tabula::ZoneEntity.new(top, left,
510
- right - left, bottom - top)
511
-
512
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
513
- rulings = Tabula::Ruling.crop_rulings_to_area(pdf_page.ruling_lines, area)
514
- # TODO assertion not entirely correct, should do the trick for now
515
- assert_equal 15, rulings.select(&:vertical?).count
516
- end
517
- end
518
-
519
- def test_extract_spreadsheet_within_an_area
520
- pdf_file_path = "./test/data/puertos1.pdf"
521
- top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
522
-
523
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
524
- area = pdf_page.get_area([top, left, bottom, right])
525
- table = area.spreadsheets.first.to_a
526
- assert_equal 15, table.length
527
- assert_equal ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"], table.first
528
- assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
529
- end
530
- end
531
-
532
- def test_remove_repeated_text
533
- top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715
534
-
535
- table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
536
- 1,
537
- [top,left,bottom,right],
538
- :detect_ruling_lines => false,
539
- :extraction_method => 'original')
540
-
541
- ary = table_to_array(table)
542
- assert_equal ary[1][1], "$ 18,969,610"
543
- assert_equal ary[1][2], "$ 18,157,722"
544
- end
545
-
546
- def test_remove_overlapping_text
547
- # one of those PDFs that put characters on top of another to make text "bold"
548
- top,left,bottom,right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
549
- table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
550
- 1,
551
- [top,left,bottom,right],
552
- :detect_ruling_lines => false,
553
- :extraction_method => 'original')
554
-
555
- ary = table_to_array(table)
556
- assert_equal ary.first.first, "Community development"
557
- end
558
-
559
- def test_cells_including_line_returns
560
- data = []
561
- pdf_file_path = "./test/data/sydney_disclosure_contract.pdf"
562
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
563
- pdf_page.spreadsheets.each do |spreadsheet|
564
- spreadsheet.cells.each do |cell|
565
- cell.text_elements = pdf_page.get_cell_text(cell)
566
- cell.options = ({:use_line_returns => true, :cell_debug => 0})
567
- data << cell.text
568
- end
569
- end
570
- end
571
- assert_equal ["1295", "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065", "N/A", "Effective Date: 13 May 2013 \nDuration: 15 Weeks", "Supply, Installation and Maintenance of Parking Ticket Machines", "$3,148,800.00exgst", "N/A", "N/A", "Open Tender \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"], data
572
- end
573
-
574
- end
575
-
576
- class TestIsTabularHeuristic < Minitest::Test
577
-
578
- EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', 'GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf']
579
- NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf']
580
-
581
- File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
582
-
583
- def test_heuristic_detects_spreadsheets
584
- EXPECTED_TO_BE_SPREADSHEET.each do |f|
585
- path = File.expand_path('data/' + f, File.dirname(__FILE__))
586
- extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
587
- page = extractor.extract.first
588
- page.get_ruling_lines!
589
- assert page.is_tabular?, "failed on file #{f}"
590
- end
591
- end
592
-
593
- def test_heuristic_detects_non_spreadsheets
594
- NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
595
- path = File.expand_path('data/' + f, File.dirname(__FILE__))
596
- extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
597
- page = extractor.extract.first
598
- page.get_ruling_lines!
599
- assert !page.is_tabular?, "failed on file #{f}"
600
- end
601
- end
602
-
603
- end