tabula-extractor 0.7.2-java → 0.7.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +4 -8
  4. data/bin/tabula +3 -3
  5. data/lib/tabula.rb +9 -5
  6. data/lib/tabula/entities.rb +1 -0
  7. data/lib/tabula/entities/cell.rb +6 -4
  8. data/lib/tabula/entities/has_cells.rb +22 -78
  9. data/lib/tabula/entities/line.rb +52 -6
  10. data/lib/tabula/entities/page.rb +43 -50
  11. data/lib/tabula/entities/ruling.rb +83 -105
  12. data/lib/tabula/entities/spreadsheet.rb +74 -11
  13. data/lib/tabula/entities/table.rb +55 -37
  14. data/lib/tabula/entities/tabular.rb +42 -0
  15. data/lib/tabula/entities/text_chunk.rb +55 -52
  16. data/lib/tabula/entities/text_element.rb +129 -62
  17. data/lib/tabula/entities/zone_entity.rb +15 -6
  18. data/lib/tabula/extraction.rb +114 -49
  19. data/lib/tabula/line_segment_detector.rb +0 -5
  20. data/lib/tabula/table_extractor.rb +32 -37
  21. data/lib/tabula/version.rb +1 -1
  22. data/tabula-extractor.gemspec +2 -5
  23. metadata +13 -95
  24. data/ext/COPYING +0 -661
  25. data/ext/Makefile.OSX +0 -18
  26. data/ext/Makefile.defaults +0 -9
  27. data/ext/Makefile.linux32 +0 -11
  28. data/ext/Makefile.linux64 +0 -12
  29. data/ext/Makefile.mingw +0 -10
  30. data/ext/Makefile.mingw64 +0 -10
  31. data/ext/liblsd-linux32.so +0 -0
  32. data/ext/liblsd-linux64.so +0 -0
  33. data/ext/liblsd.def +0 -3
  34. data/ext/liblsd.dll +0 -0
  35. data/ext/liblsd.dylib +0 -0
  36. data/ext/liblsd64.dll +0 -0
  37. data/ext/lsd.c +0 -2270
  38. data/ext/lsd.h +0 -283
  39. data/test/data/47008204D_USA.page4.pdf +0 -0
  40. data/test/data/560015757GV_China.page1.pdf +0 -0
  41. data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
  42. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  43. data/test/data/S2MNCEbirdisland.pdf +0 -0
  44. data/test/data/argentina_diputados_voting_record.pdf +0 -0
  45. data/test/data/bo_page24.pdf +0 -0
  46. data/test/data/campaign_donors.pdf +0 -0
  47. data/test/data/frx_2012_disclosure.pdf +0 -0
  48. data/test/data/frx_2012_disclosure.tsv +0 -88
  49. data/test/data/gre.pdf +0 -0
  50. data/test/data/no_tables.pdf +0 -0
  51. data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
  52. data/test/data/puertos1.pdf +0 -0
  53. data/test/data/spanning_cells.csv +0 -21
  54. data/test/data/spanning_cells.pdf +0 -0
  55. data/test/data/strongschools.pdf +0 -0
  56. data/test/data/sydney_disclosure_contract.pdf +0 -0
  57. data/test/data/tabla_subsidios.pdf +0 -0
  58. data/test/data/vertical_rulings_bug.pdf +0 -0
  59. data/test/data/vietnam3.pdf +0 -0
  60. data/test/data/wc2012.pdf +0 -0
  61. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  62. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  63. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  64. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  65. data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
  66. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  67. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  68. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  69. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  70. data/test/heuristic.rb +0 -50
  71. data/test/test_bin_tabula.sh +0 -7
  72. data/test/tests.rb +0 -603
data/test/heuristic.rb DELETED
@@ -1,50 +0,0 @@
1
- #a list of filenames and the correct answer
2
- # no more bs.
3
- require_relative '../lib/tabula'
4
-
5
-
6
- should_use_spreadsheet = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "spreadsheet/*") ).map{|a| [a, true]}
7
- should_use_original = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "original/*") ).map{|a| [a, false]}
8
-
9
- correct = []
10
- misclassified_as_original = []
11
- misclassified_as_spreadsheet = []
12
-
13
-
14
-
15
- def heuristic(page)
16
- page.is_tabular?
17
- end
18
-
19
- (should_use_spreadsheet + should_use_original) .each do |filename, expected_to_be_tabular|
20
- extractor = Tabula::Extraction::ObjectExtractor.new(filename, [1])
21
-
22
- page = extractor.extract.first
23
- page.get_ruling_lines!
24
- # puts "#{File.basename(filename)} | #{expected_to_be_tabular}"
25
- page_is_tabular = heuristic(page)
26
- # puts ""
27
-
28
- if page_is_tabular && expected_to_be_tabular || !page_is_tabular && !expected_to_be_tabular
29
- correct << filename
30
- elsif page_is_tabular && !expected_to_be_tabular
31
- misclassified_as_spreadsheet << filename
32
- elsif !page_is_tabular && expected_to_be_tabular
33
- misclassified_as_original << filename
34
- end
35
- end
36
-
37
- puts "#{correct.size} PDFs were correctly classified"
38
- puts "#{misclassified_as_original.size + misclassified_as_spreadsheet.size} PDFs were incorrectly classified"
39
- unless misclassified_as_spreadsheet.empty?
40
- puts "#{misclassified_as_spreadsheet.size} PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"
41
- misclassified_as_spreadsheet.each do |filename|
42
- puts " - #{File.basename(filename)}"
43
- end
44
- end
45
- unless misclassified_as_original.empty?
46
- puts "#{misclassified_as_original.size} PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"
47
- misclassified_as_original.each do |filename|
48
- puts " - #{File.basename(filename)}"
49
- end
50
- end
@@ -1,7 +0,0 @@
1
- bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf --silent -o test.csv
2
- bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf -o test.csv
3
- bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv
4
- bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv --format TSV
5
- bin/tabula test/data/campaign_donors.pdf -o test.csv --columns 47,147,256,310,375,431,504 #columns should work
6
- bin/tabula test/data/argentina_diputados_voting_record.pdf --guess -o test.csv --format TSV #should exclude guff
7
- bin/tabula test/data/vertical_rulings_bug.pdf --area 250,0,325,1700 -o test.csv --format TSV #should be only a few lines
data/test/tests.rb DELETED
@@ -1,603 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- require 'minitest'
3
- require 'minitest/autorun'
4
-
5
- require_relative '../lib/tabula'
6
-
7
- def table_to_array(table)
8
- lines_to_array(table.rows)
9
- end
10
-
11
- def lines_to_array(lines)
12
- lines.map do |l|
13
- l.map { |te| te.text.strip }
14
- end
15
- end
16
-
17
- def lines_to_table(lines)
18
- Tabula::Table.new_from_array(lines_to_array(lines))
19
- end
20
-
21
-
22
- # I don't want to pollute the "real" class with a funny inspect method. Just for testing comparisons.
23
- module Tabula
24
- class Table
25
- def inspect
26
- "[" + lines.map(&:inspect).join(",") + "]"
27
- end
28
- end
29
- end
30
-
31
- module Tabula
32
- class Line
33
- def inspect
34
- @text_elements.map{|te| te.nil? ? '' : te.text}.inspect
35
- end
36
- end
37
- end
38
-
39
-
40
- class TestEntityComparability < Minitest::Test
41
- def test_text_element_comparability
42
- base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
43
-
44
- two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
45
- three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
46
- four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
47
-
48
- five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
49
- six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
50
- seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
51
- assert_equal base, two
52
- assert_equal base, three
53
- assert_equal base, four
54
-
55
- refute_equal base, five
56
- refute_equal base, six
57
- refute_equal base, seven
58
- end
59
-
60
- def test_line_comparability
61
- text_base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
62
-
63
- text_two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
64
- text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
65
- text_four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
66
-
67
- text_five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
68
- text_six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
69
- text_seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
70
- line_base = Tabula::Line.new
71
- line_base.text_elements = [text_base, text_two, text_three]
72
- line_equal = Tabula::Line.new
73
- line_equal.text_elements = [text_base, text_two, text_three]
74
- line_equal_but_longer = Tabula::Line.new
75
- line_equal_but_longer.text_elements = [text_base, text_two, text_three, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
76
- line_unequal = Tabula::Line.new
77
- line_unequal.text_elements = [text_base, text_two, text_three, text_five]
78
- line_unequal_and_longer = Tabula::Line.new
79
- line_unequal_and_longer.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
80
- line_unequal_and_longer_and_different = Tabula::Line.new
81
- line_unequal_and_longer_and_different.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, 'whatever']
82
-
83
- assert_equal line_base, line_equal
84
- assert_equal line_base, line_equal_but_longer
85
- refute_equal line_base, line_unequal
86
- refute_equal line_base, line_unequal_and_longer
87
- refute_equal line_base, line_unequal_and_longer_and_different
88
- end
89
-
90
- def test_table_comparability
91
- rows_base = [["a", "b", "c"], ['', 'd', '']]
92
- rows_equal = [["a", "b", "c"], ['', 'd']]
93
- rows_equal_padded = [['', "a", "b", "c"], ['', '', 'd']]
94
- rows_unequal_one = [["a", "b", "c"], ['d']]
95
- rows_unequal_two = [["a", "b", "c"], ['d', '']]
96
- rows_unequal_three = [["a", "b", "c"], ['d'], ['a','b', 'd']]
97
- rows_unequal_four = [["a", "b", "c"]]
98
-
99
- table_base = Tabula::Table.new_from_array(rows_base)
100
- table_equal = Tabula::Table.new_from_array(rows_equal)
101
- table_equal_column_padded = Tabula::Table.new_from_array(rows_equal_padded)
102
- table_unequal_one = Tabula::Table.new_from_array(rows_unequal_one)
103
- table_unequal_two = Tabula::Table.new_from_array(rows_unequal_two)
104
- table_unequal_three = Tabula::Table.new_from_array(rows_unequal_three)
105
- table_unequal_four = Tabula::Table.new_from_array(rows_unequal_four)
106
-
107
- assert_equal table_base, table_equal
108
- assert_equal table_base, table_equal_column_padded
109
- refute_equal table_base, table_unequal_one
110
- refute_equal table_base, table_unequal_two
111
- refute_equal table_base, table_unequal_three
112
- refute_equal table_base, table_unequal_four
113
- end
114
- end
115
-
116
- class TestPagesInfoExtractor < Minitest::Test
117
- def test_pages_info_extractor
118
- extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
119
-
120
- i = 0
121
- extractor.pages.each do |page|
122
- assert_instance_of Tabula::Page, page
123
- i += 1
124
- end
125
- assert_equal 2, i
126
- end
127
- end
128
-
129
- class TestTableGuesser < Minitest::Test
130
- def test_find_rects_from_lines_with_lsd
131
- skip "Skipping until we actually use LSD"
132
- filename = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
133
- page_index = 0
134
- lines = Tabula::Extraction::LineExtractor.lines_in_pdf_page(filename, page_index, :render_pdf => true)
135
-
136
- page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
137
- page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
138
- expected_page_areas = [[54.087890625, 50.203125, 734.220703125, 550.44140625]]
139
- assert_equal expected_page_areas, page_areas
140
- end
141
-
142
- end
143
-
144
- class TestDumper < Minitest::Test
145
-
146
- def test_extractor
147
- extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
148
- page = extractor.extract.next
149
- assert_instance_of Tabula::Page, page
150
- end
151
-
152
- def test_get_by_area
153
- extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
154
- characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
155
- assert_equal characters.size, 206
156
- end
157
- end
158
-
159
- class TestRulingIntersection < Minitest::Test
160
- def test_ruling_intersection
161
- horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
162
- verticals = [Tabula::Ruling.new(1, 3, 0, 11),
163
- Tabula::Ruling.new(1, 4, 0, 11)]
164
- ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
165
- assert_equal 2, ints.size
166
- assert_equal ints[0][0].getX, 3.0
167
- assert_equal ints[0][0].getY, 10.0
168
- assert_equal ints[1][0].getX, 4.0
169
- assert_equal ints[1][0].getY, 10.0
170
-
171
- verticals = [Tabula::Ruling.new(20, 3, 0, 11)]
172
- ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
173
- assert_equal ints.size, 0
174
- end
175
- end
176
-
177
- class TestExtractor < Minitest::Test
178
-
179
- def test_table_extraction_1
180
- table = table_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
181
- 1,
182
- [107.1, 57.9214, 394.5214, 290.7],
183
- :detect_ruling_lines => false)
184
-
185
- expected = [["Prior Scale","New Scale","% Rank*"], ["800","170","99"], ["790","170","99"], ["780","170","99"], ["770","170","99"], ["760","170","99"], ["750","169","99"], ["740","169","99"], ["730","168","98"], ["720","168","98"], ["710","167","97"], ["700","166","96"], ["690","165","95"], ["680","165","95"], ["670","164","93"], ["660","164","93"], ["650","163","91"]]
186
-
187
- assert_equal expected, table
188
- end
189
-
190
- def test_diputados_voting_record
191
- table = table_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
192
- 1,
193
- [269.875, 12.75, 790.5, 561])
194
-
195
- expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
196
-
197
- assert_equal expected, table
198
- end
199
-
200
- def test_forest_disclosure_report_dont_regress
201
- # this is the current state of the expected output. Ideally the output should be like
202
- # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
203
- # and a solution for half-x-height-offset lines.
204
- pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
205
-
206
- table = Tabula.extract_table(pdf_file_path,
207
- 1,
208
- [106.01, 48.09, 227.31, 551.89],
209
- :detect_ruling_lines => true,
210
- :extraction_method => "original")
211
-
212
- expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
213
-
214
- assert_equal expected, table
215
- end
216
-
217
- def test_missing_spaces_around_an_ampersand
218
- pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
219
- character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
220
- page_obj = character_extractor.extract.next
221
- lines = page_obj.ruling_lines
222
- vertical_rulings = lines.select(&:vertical?)
223
-
224
- area = [170, 28, 185, 833] #top left bottom right
225
-
226
- expected = Tabula::Table.new_from_array([
227
- ["", "REGIONAL PULMONARY & SLEEP",],
228
- ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
229
- ["", "MEDICINE", ],
230
- ])
231
-
232
- assert_equal expected, lines_to_table(page_obj.get_area(area).make_table(:vertical_rulings => vertical_rulings))
233
- end
234
-
235
- def test_forest_disclosure_report
236
- skip "Skipping until we support multiline cells"
237
- pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
238
- character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
239
- lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
240
- vertical_rulings = lines.select(&:vertical?) #.uniq{|line| (line.left / 10).round }
241
-
242
- page_obj = character_extractor.extract.next
243
- characters = page_obj.get_text([110, 28, 218, 833])
244
- #top left bottom right
245
- expected = Tabula::Table.new_from_array([
246
- ['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '', '$85.00'],
247
- ['TOTAL', '', '', '','$85.00'],
248
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '', '$78.80'],
249
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '', '$392.45'],
250
- ['TOTAL', '', '', '', '$471.25'],
251
- ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '', '$20.39'],
252
- ['TOTAL', '', '', '','$20.39'],
253
- ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '', '$310.33'],
254
- ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '', '$4,700.00'],
255
- ['TOTAL', '', '', '', '$5,010.33'],
256
- ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '', '$193.67'],
257
- ['TOTAL', '', '', '', '$193.67'],
258
- ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '', '$19.50']
259
- ])
260
-
261
- assert_equal expected, lines_to_table(Tabula.make_table(characters, :vertical_rulings => vertical_rulings))
262
- end
263
-
264
- # TODO Spaces inserted in words - fails
265
- def test_bo_page24
266
- table = table_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
267
- 1,
268
- [425.625, 53.125, 575.714, 810.535],
269
- :detect_ruling_lines => false)
270
-
271
- expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
272
-
273
- assert_equal expected, table
274
- end
275
-
276
-
277
- def test_vertical_rulings_splitting_words
278
- #if a vertical ruling crosses over a word, the word should be split at that vertical ruling
279
- # before, the entire word would end up on one side of the vertical ruling.
280
- pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))
281
-
282
- #both of these are semantically "correct"; the difference is in how we handle multi-line cells
283
- expected = Tabula::Table.new_from_array([
284
- ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
285
- ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
286
- ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
287
- ])
288
- other_expected = Tabula::Table.new_from_array([
289
- ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
290
- ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
291
- ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH", "", "", "$22.93", "", "", "$22.93"],
292
- ["", "", "", "ABRAHAMSON"]
293
- ])
294
-
295
- #N.B. it's "MORGANTOWN", "WV" that we're most interested in here (it used to show up as ["MORGANTOWNWV", "", ""])
296
-
297
-
298
- extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2) #:all ) # 1..2643
299
- extractor.extract.each_with_index do |pdf_page, page_index|
300
-
301
- page_areas = [[250, 0, 325, 1700]]
302
-
303
- scale_factor = pdf_page.width / 1700
304
-
305
- vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000)}
306
-
307
- tables = page_areas.map do |page_area|
308
- pdf_page.get_area(page_area).make_table(:vertical_rulings => vertical_rulings)
309
- end
310
- assert_equal expected, lines_to_table(tables.first)
311
- end
312
- end
313
-
314
- def test_vertical_rulings_prevent_merging_of_columns
315
- expected = [["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"], ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]]
316
-
317
- vertical_rulings = [47,147,256,310,375,431,504].map{|n| Tabula::Ruling.new(0, n, 0, 1000)}
318
-
319
- table = table_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
320
- 1,
321
- [255.57,40.43,398.76,557.35],
322
- :vertical_rulings => vertical_rulings)
323
-
324
- assert_equal expected, table
325
- end
326
-
327
- def test_get_spacing_and_merging_right
328
- table = table_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
329
- 1,
330
- [52.32857142857143,15.557142857142859,128.70000000000002,767.9571428571429],
331
- :detect_ruling_lines => true)
332
-
333
- expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]
334
-
335
- assert_equal expected, table
336
-
337
- end
338
-
339
-
340
- class SpreadsheetsHasCellsTester
341
- include Tabula::HasCells
342
- attr_accessor :cells
343
- def initialize(cells)
344
- @cells = cells
345
- end
346
- end
347
-
348
- #just tests the algorithm
349
- def test_cells_to_spreadsheets
350
-
351
- cells = [Tabula::Cell.new(40.0, 18.0, 208.0, 4.0), Tabula::Cell.new(44.0, 18.0, 52.0, 6.0),
352
- Tabula::Cell.new(50.0, 18.0, 52.0, 4.0), Tabula::Cell.new(54.0, 18.0, 52.0, 6.0),
353
- Tabula::Cell.new(60.0, 18.0, 52.0, 4.0), Tabula::Cell.new(64.0, 18.0, 52.0, 6.0),
354
- Tabula::Cell.new(70.0, 18.0, 52.0, 4.0), Tabula::Cell.new(74.0, 18.0, 52.0, 6.0),
355
- Tabula::Cell.new(90.0, 18.0, 52.0, 4.0), Tabula::Cell.new(94.0, 18.0, 52.0, 6.0),
356
- Tabula::Cell.new(100.0, 18.0, 52.0, 28.0), Tabula::Cell.new(128.0, 18.0, 52.0, 4.0),
357
- Tabula::Cell.new(132.0, 18.0, 52.0, 64.0), Tabula::Cell.new(196.0, 18.0, 52.0, 66.0),
358
- Tabula::Cell.new(262.0, 18.0, 52.0, 4.0), Tabula::Cell.new(266.0, 18.0, 52.0, 84.0),
359
- Tabula::Cell.new(350.0, 18.0, 52.0, 4.0), Tabula::Cell.new(354.0, 18.0, 52.0, 32.0),
360
- Tabula::Cell.new(386.0, 18.0, 52.0, 38.0), Tabula::Cell.new(424.0, 18.0, 52.0, 18.0),
361
- Tabula::Cell.new(442.0, 18.0, 52.0, 74.0), Tabula::Cell.new(516.0, 18.0, 52.0, 28.0),
362
- Tabula::Cell.new(544.0, 18.0, 52.0, 4.0), Tabula::Cell.new(44.0, 70.0, 156.0, 6.0),
363
- Tabula::Cell.new(50.0, 70.0, 156.0, 4.0), Tabula::Cell.new(54.0, 70.0, 156.0, 6.0),
364
- Tabula::Cell.new(60.0, 70.0, 156.0, 4.0), Tabula::Cell.new(64.0, 70.0, 156.0, 6.0),
365
- Tabula::Cell.new(70.0, 70.0, 156.0, 4.0), Tabula::Cell.new(74.0, 70.0, 156.0, 6.0),
366
- Tabula::Cell.new(84.0, 70.0, 2.0, 6.0), Tabula::Cell.new(90.0, 70.0, 156.0, 4.0),
367
- Tabula::Cell.new(94.0, 70.0, 156.0, 6.0), Tabula::Cell.new(100.0, 70.0, 156.0, 28.0),
368
- Tabula::Cell.new(128.0, 70.0, 156.0, 4.0), Tabula::Cell.new(132.0, 70.0, 156.0, 64.0),
369
- Tabula::Cell.new(196.0, 70.0, 156.0, 66.0), Tabula::Cell.new(262.0, 70.0, 156.0, 4.0),
370
- Tabula::Cell.new(266.0, 70.0, 156.0, 84.0), Tabula::Cell.new(350.0, 70.0, 156.0, 4.0),
371
- Tabula::Cell.new(354.0, 70.0, 156.0, 32.0), Tabula::Cell.new(386.0, 70.0, 156.0, 38.0),
372
- Tabula::Cell.new(424.0, 70.0, 156.0, 18.0), Tabula::Cell.new(442.0, 70.0, 156.0, 74.0),
373
- Tabula::Cell.new(516.0, 70.0, 156.0, 28.0), Tabula::Cell.new(544.0, 70.0, 156.0, 4.0),
374
- Tabula::Cell.new(84.0, 72.0, 446.0, 6.0), Tabula::Cell.new(90.0, 226.0, 176.0, 4.0),
375
- Tabula::Cell.new(94.0, 226.0, 176.0, 6.0), Tabula::Cell.new(100.0, 226.0, 176.0, 28.0),
376
- Tabula::Cell.new(128.0, 226.0, 176.0, 4.0), Tabula::Cell.new(132.0, 226.0, 176.0, 64.0),
377
- Tabula::Cell.new(196.0, 226.0, 176.0, 66.0), Tabula::Cell.new(262.0, 226.0, 176.0, 4.0),
378
- Tabula::Cell.new(266.0, 226.0, 176.0, 84.0), Tabula::Cell.new(350.0, 226.0, 176.0, 4.0),
379
- Tabula::Cell.new(354.0, 226.0, 176.0, 32.0), Tabula::Cell.new(386.0, 226.0, 176.0, 38.0),
380
- Tabula::Cell.new(424.0, 226.0, 176.0, 18.0), Tabula::Cell.new(442.0, 226.0, 176.0, 74.0),
381
- Tabula::Cell.new(516.0, 226.0, 176.0, 28.0), Tabula::Cell.new(544.0, 226.0, 176.0, 4.0),
382
- Tabula::Cell.new(90.0, 402.0, 116.0, 4.0), Tabula::Cell.new(94.0, 402.0, 116.0, 6.0),
383
- Tabula::Cell.new(100.0, 402.0, 116.0, 28.0), Tabula::Cell.new(128.0, 402.0, 116.0, 4.0),
384
- Tabula::Cell.new(132.0, 402.0, 116.0, 64.0), Tabula::Cell.new(196.0, 402.0, 116.0, 66.0),
385
- Tabula::Cell.new(262.0, 402.0, 116.0, 4.0), Tabula::Cell.new(266.0, 402.0, 116.0, 84.0),
386
- Tabula::Cell.new(350.0, 402.0, 116.0, 4.0), Tabula::Cell.new(354.0, 402.0, 116.0, 32.0),
387
- Tabula::Cell.new(386.0, 402.0, 116.0, 38.0), Tabula::Cell.new(424.0, 402.0, 116.0, 18.0),
388
- Tabula::Cell.new(442.0, 402.0, 116.0, 74.0), Tabula::Cell.new(516.0, 402.0, 116.0, 28.0),
389
- Tabula::Cell.new(544.0, 402.0, 116.0, 4.0), Tabula::Cell.new(84.0, 518.0, 246.0, 6.0),
390
- Tabula::Cell.new(90.0, 518.0, 186.0, 4.0), Tabula::Cell.new(94.0, 518.0, 186.0, 6.0),
391
- Tabula::Cell.new(100.0, 518.0, 186.0, 28.0), Tabula::Cell.new(128.0, 518.0, 186.0, 4.0),
392
- Tabula::Cell.new(132.0, 518.0, 186.0, 64.0), Tabula::Cell.new(196.0, 518.0, 186.0, 66.0),
393
- Tabula::Cell.new(262.0, 518.0, 186.0, 4.0), Tabula::Cell.new(266.0, 518.0, 186.0, 84.0),
394
- Tabula::Cell.new(350.0, 518.0, 186.0, 4.0), Tabula::Cell.new(354.0, 518.0, 186.0, 32.0),
395
- Tabula::Cell.new(386.0, 518.0, 186.0, 38.0), Tabula::Cell.new(424.0, 518.0, 186.0, 18.0),
396
- Tabula::Cell.new(442.0, 518.0, 186.0, 74.0), Tabula::Cell.new(516.0, 518.0, 186.0, 28.0),
397
- Tabula::Cell.new(544.0, 518.0, 186.0, 4.0), Tabula::Cell.new(90.0, 704.0, 60.0, 4.0),
398
- Tabula::Cell.new(94.0, 704.0, 60.0, 6.0), Tabula::Cell.new(100.0, 704.0, 60.0, 28.0),
399
- Tabula::Cell.new(128.0, 704.0, 60.0, 4.0), Tabula::Cell.new(132.0, 704.0, 60.0, 64.0),
400
- Tabula::Cell.new(196.0, 704.0, 60.0, 66.0), Tabula::Cell.new(262.0, 704.0, 60.0, 4.0),
401
- Tabula::Cell.new(266.0, 704.0, 60.0, 84.0), Tabula::Cell.new(350.0, 704.0, 60.0, 4.0),
402
- Tabula::Cell.new(354.0, 704.0, 60.0, 32.0), Tabula::Cell.new(386.0, 704.0, 60.0, 38.0),
403
- Tabula::Cell.new(424.0, 704.0, 60.0, 18.0), Tabula::Cell.new(442.0, 704.0, 60.0, 74.0),
404
- Tabula::Cell.new(516.0, 704.0, 60.0, 28.0), Tabula::Cell.new(544.0, 704.0, 60.0, 4.0),
405
- Tabula::Cell.new(84.0, 764.0, 216.0, 6.0), Tabula::Cell.new(90.0, 764.0, 216.0, 4.0),
406
- Tabula::Cell.new(94.0, 764.0, 216.0, 6.0), Tabula::Cell.new(100.0, 764.0, 216.0, 28.0),
407
- Tabula::Cell.new(128.0, 764.0, 216.0, 4.0), Tabula::Cell.new(132.0, 764.0, 216.0, 64.0),
408
- Tabula::Cell.new(196.0, 764.0, 216.0, 66.0), Tabula::Cell.new(262.0, 764.0, 216.0, 4.0),
409
- Tabula::Cell.new(266.0, 764.0, 216.0, 84.0), Tabula::Cell.new(350.0, 764.0, 216.0, 4.0),
410
- Tabula::Cell.new(354.0, 764.0, 216.0, 32.0), Tabula::Cell.new(386.0, 764.0, 216.0, 38.0),
411
- Tabula::Cell.new(424.0, 764.0, 216.0, 18.0), Tabula::Cell.new(442.0, 764.0, 216.0, 74.0),
412
- Tabula::Cell.new(516.0, 764.0, 216.0, 28.0), Tabula::Cell.new(544.0, 764.0, 216.0, 4.0)]
413
-
414
-
415
- expected_spreadsheets = [Tabula::Spreadsheet.new(40.0, 18.0, 208.0, 40.0, nil, nil, nil, nil),
416
- Tabula::Spreadsheet.new(84.0, 18.0, 962.0, 464.0,nil, nil, nil, nil)]
417
-
418
- #compares spreadsheets on area only.
419
- assert_equal expected_spreadsheets.map{|s| [s.x, s.y, s.width, s.height] },
420
- SpreadsheetsHasCellsTester.new(cells).find_spreadsheets_from_cells.map{|a| s = a.getBounds; [s.x, s.y, s.width, s.height] }
421
-
422
-
423
- end
424
-
425
- def test_add_spanning_cells
426
- skip "until I write it"
427
- end
428
-
429
- def test_add_placeholder_cells_to_funny_shaped_tables
430
- skip "until I write it, cf 01005787B_Pakistan.pdf"
431
- end
432
-
433
- class CellsHasCellsTester
434
- include Tabula::HasCells
435
- attr_accessor :vertical_ruling_lines, :horizontal_ruling_lines, :cells
436
- def initialize(vertical_ruling_lines, horizontal_ruling_lines)
437
- @cells = []
438
- @vertical_ruling_lines = vertical_ruling_lines
439
- @horizontal_ruling_lines = horizontal_ruling_lines
440
- find_cells!
441
- end
442
- end
443
-
444
- #just tests the algorithm
445
- def test_lines_to_cells
446
- vertical_ruling_lines = [ Tabula::Ruling.new(40.0, 18.0, 0.0, 40.0),
447
- Tabula::Ruling.new(44.0, 70.0, 0.0, 36.0),
448
- Tabula::Ruling.new(40.0, 226.0, 0.0, 40.0)]
449
-
450
- horizontal_ruling_lines = [ Tabula::Ruling.new(40.0, 18.0, 208.0, 0.0),
451
- Tabula::Ruling.new(44.0, 18.0, 208.0, 0.0),
452
- Tabula::Ruling.new(50.0, 18.0, 208.0, 0.0),
453
- Tabula::Ruling.new(54.0, 18.0, 208.0, 0.0),
454
- Tabula::Ruling.new(60.0, 18.0, 208.0, 0.0),
455
- Tabula::Ruling.new(64.0, 18.0, 208.0, 0.0),
456
- Tabula::Ruling.new(70.0, 18.0, 208.0, 0.0),
457
- Tabula::Ruling.new(74.0, 18.0, 208.0, 0.0),
458
- Tabula::Ruling.new(80.0, 18.0, 208.0, 0.0)]
459
-
460
- expected_cells = [Tabula::Cell.new(40.0, 18.0, 208.0, 4.0), Tabula::Cell.new(44.0, 18.0, 52.0, 6.0),
461
- Tabula::Cell.new(50.0, 18.0, 52.0, 4.0), Tabula::Cell.new(54.0, 18.0, 52.0, 6.0),
462
- Tabula::Cell.new(60.0, 18.0, 52.0, 4.0), Tabula::Cell.new(64.0, 18.0, 52.0, 6.0),
463
- Tabula::Cell.new(70.0, 18.0, 52.0, 4.0), Tabula::Cell.new(74.0, 18.0, 52.0, 6.0),
464
- Tabula::Cell.new(44.0, 70.0, 156.0, 6.0), Tabula::Cell.new(50.0, 70.0, 156.0, 4.0),
465
- Tabula::Cell.new(54.0, 70.0, 156.0, 6.0), Tabula::Cell.new(60.0, 70.0, 156.0, 4.0),
466
- Tabula::Cell.new(64.0, 70.0, 156.0, 6.0), Tabula::Cell.new(70.0, 70.0, 156.0, 4.0),
467
- Tabula::Cell.new(74.0, 70.0, 156.0, 6.0), ]
468
-
469
- actual_cells = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines).cells
470
- assert_equal Set.new(expected_cells), Set.new(actual_cells) #I don't care about order
471
- end
472
-
473
- #this is the real deal!!
474
- def test_extract_tabular_data_using_lines_and_spreadsheets
475
- pdf_file_path = "./test/data/frx_2012_disclosure.pdf"
476
- expected_data_path = "./test/data/frx_2012_disclosure.tsv"
477
- expected = open(expected_data_path, 'r').read #.split("\n").map{|line| line.split("\t")}
478
-
479
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all).extract.each do |pdf_page|
480
- spreadsheet = pdf_page.spreadsheets.first
481
- assert_equal expected, spreadsheet.to_tsv
482
- end
483
- end
484
-
485
- def test_cope_with_a_tableless_page
486
- pdf_file_path = "./test/data/no_tables.pdf"
487
-
488
- spreadsheets = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all, '',
489
- :line_color_filter => lambda{|components| components.all?{|c| c < 0.1}}
490
- ).extract.to_a.first.spreadsheets
491
-
492
- assert_equal 0, spreadsheets.size
493
- end
494
-
495
- def test_spanning_cells
496
- pdf_file_path = "./test/data/spanning_cells.pdf"
497
- expected_data_path = "./test/data/spanning_cells.csv"
498
- expected = open(expected_data_path, 'r').read
499
-
500
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
501
- spreadsheet = pdf_page.spreadsheets.first
502
- assert_equal expected, spreadsheet.to_csv
503
- end
504
- end
505
-
506
- def test_almost_vertical_lines
507
- pdf_file_path = "./test/data/puertos1.pdf"
508
- top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
509
- area = Tabula::ZoneEntity.new(top, left,
510
- right - left, bottom - top)
511
-
512
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
513
- rulings = Tabula::Ruling.crop_rulings_to_area(pdf_page.ruling_lines, area)
514
- # TODO assertion not entirely correct, should do the trick for now
515
- assert_equal 15, rulings.select(&:vertical?).count
516
- end
517
- end
518
-
519
- def test_extract_spreadsheet_within_an_area
520
- pdf_file_path = "./test/data/puertos1.pdf"
521
- top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
522
-
523
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
524
- area = pdf_page.get_area([top, left, bottom, right])
525
- table = area.spreadsheets.first.to_a
526
- assert_equal 15, table.length
527
- assert_equal ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"], table.first
528
- assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
529
- end
530
- end
531
-
532
- def test_remove_repeated_text
533
- top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715
534
-
535
- table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
536
- 1,
537
- [top,left,bottom,right],
538
- :detect_ruling_lines => false,
539
- :extraction_method => 'original')
540
-
541
- ary = table_to_array(table)
542
- assert_equal ary[1][1], "$ 18,969,610"
543
- assert_equal ary[1][2], "$ 18,157,722"
544
- end
545
-
546
- def test_remove_overlapping_text
547
- # one of those PDFs that put characters on top of another to make text "bold"
548
- top,left,bottom,right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
549
- table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
550
- 1,
551
- [top,left,bottom,right],
552
- :detect_ruling_lines => false,
553
- :extraction_method => 'original')
554
-
555
- ary = table_to_array(table)
556
- assert_equal ary.first.first, "Community development"
557
- end
558
-
559
- def test_cells_including_line_returns
560
- data = []
561
- pdf_file_path = "./test/data/sydney_disclosure_contract.pdf"
562
- Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
563
- pdf_page.spreadsheets.each do |spreadsheet|
564
- spreadsheet.cells.each do |cell|
565
- cell.text_elements = pdf_page.get_cell_text(cell)
566
- cell.options = ({:use_line_returns => true, :cell_debug => 0})
567
- data << cell.text
568
- end
569
- end
570
- end
571
- assert_equal ["1295", "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065", "N/A", "Effective Date: 13 May 2013 \nDuration: 15 Weeks", "Supply, Installation and Maintenance of Parking Ticket Machines", "$3,148,800.00exgst", "N/A", "N/A", "Open Tender \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"], data
572
- end
573
-
574
- end
575
-
576
- class TestIsTabularHeuristic < Minitest::Test
577
-
578
- EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', 'GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf']
579
- NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf']
580
-
581
- File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
582
-
583
- def test_heuristic_detects_spreadsheets
584
- EXPECTED_TO_BE_SPREADSHEET.each do |f|
585
- path = File.expand_path('data/' + f, File.dirname(__FILE__))
586
- extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
587
- page = extractor.extract.first
588
- page.get_ruling_lines!
589
- assert page.is_tabular?, "failed on file #{f}"
590
- end
591
- end
592
-
593
- def test_heuristic_detects_non_spreadsheets
594
- NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
595
- path = File.expand_path('data/' + f, File.dirname(__FILE__))
596
- extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
597
- page = extractor.extract.first
598
- page.get_ruling_lines!
599
- assert !page.is_tabular?, "failed on file #{f}"
600
- end
601
- end
602
-
603
- end