tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
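
At the API level, the file list above amounts to a rewrite of the extraction pipeline: entities.rb is split into per-entity files under data/lib/tabula/entities/, extraction.rb and spreadsheet_extractor.rb are new, and the old CharacterExtractor/pdf_dump path is removed. A rough sketch of how 0.7.0 is driven, pieced together from the tests added in this diff (the path and page area below are placeholders, not part of the release):

require 'tabula'

pdf_path = 'test/data/gre.pdf' # placeholder

# One-shot extraction of a page area (top, left, bottom, right),
# as exercised by test_table_extraction_1 in the new tests.rb:
table = Tabula.extract_table(pdf_path, 1, [107.1, 57.9214, 394.5214, 290.7],
                             :detect_ruling_lines => false)
table.each { |row| puts row.map(&:text).join(', ') }

# Page-by-page extraction with the new ObjectExtractor, as in the
# spreadsheet and heuristic tests:
Tabula::Extraction::ObjectExtractor.new(pdf_path, [1]).extract.each do |page|
  page.get_ruling_lines!
  puts page.spreadsheets.first.to_csv if page.is_tabular?
end
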
data/lib/tabula/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Tabula
- VERSION = '0.6.6'
+ VERSION = '0.7.0'
  end
data/lib/tabula/writers.rb CHANGED
@@ -5,9 +5,9 @@ module Tabula
  module Writers

  def Writers.CSV(lines, output=$stdout)
- lines.each { |l|
+ lines.each do |l|
  output.write CSV.generate_line(l.map(&:text), row_sep: "\r\n")
- }
+ end
  end

  def Writers.JSON(lines, output=$stdout)
@@ -15,12 +15,11 @@ module Tabula
  end

  def Writers.TSV(lines, output=$stdout)
- lines.each { |l|
- output.write(l.map(&:text).join("\t") + "\n")
- }
+ lines.each do |l|
+ output.write CSV.generate_line(l.map(&:text), col_sep: "\t", row_sep: "\r\n")
+ end
  end

-
  def Writers.HTML(lines, output=$stdout)
  raise "not implemented"
  end
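
The Writers.TSV change above replaces a bare join("\t") with CSV.generate_line using a tab column separator and a CRLF row separator, so TSV output now gets the same quoting rules as the CSV writer. A small illustration of the difference in plain Ruby (the sample row is invented):

require 'csv'

# Invented row of cell texts; the second field contains a tab.
cells = ["AARON, JOSHUA, N", "WEST\tGROVE, PA", "$310.33"]

# 0.6.6 behaviour: naive join, the embedded tab silently adds a column.
old_row = cells.join("\t") + "\n"
# => "AARON, JOSHUA, N\tWEST\tGROVE, PA\t$310.33\n"

# 0.7.0 behaviour: CSV-style quoting keeps the field intact, rows end in CRLF.
new_row = CSV.generate_line(cells, col_sep: "\t", row_sep: "\r\n")
# => "AARON, JOSHUA, N\t\"WEST\tGROVE, PA\"\t$310.33\r\n"
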
data/tabula-extractor.gemspec CHANGED
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
  s.add_development_dependency 'minitest'
  s.add_development_dependency 'bundler', '>= 1.3.4'
  s.add_development_dependency 'ruby-debug'
+ s.add_development_dependency 'pry'

  s.add_runtime_dependency "trollop", ["~> 2.0"]
  # s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
Binary file
Binary file
Binary file
Binary file
Binary file
data/test/data/frx_2012_disclosure.tsv ADDED
@@ -0,0 +1,88 @@
+ FOREST LABORATORIES, INC. DISCLOSURE REPORT "" "" "" ""
+ Calendar Year - 2012 "" "" "" ""
+ Physician Related Entity (if applicable) City / State Purpose of Payment Amount ($USD) * **
+ AALAEI, BEHZAD "" HIGHLAND, IN MEALS $51.24
+ TOTAL "" "" "" $51.24
+ AAMODT, DENISE, E "" ALBUQUERQUE, NM MEALS $66.12
+ TOTAL "" "" "" $66.12
+ AANONSEN, DEBORAH, A "" STATEN ISLAND, NY MEALS $85.00
+ TOTAL "" "" "" $85.00
+ AARON, CAREN, T "" RICHMOND, VA EDUCATIONAL ITEMS $78.80
+ AARON, CAREN, T "" RICHMOND, VA MEALS $392.45
+ TOTAL "" "" "" $471.25
+ AARON, JOHN "" CLARKSVILLE, TN MEALS $20.39
+ TOTAL "" "" "" $20.39
+ AARON, JOSHUA, N "" WEST GROVE, PA MEALS $310.33
+ AARON, JOSHUA, N REGIONAL PULMONARY & SLEEP MEDICINE WEST GROVE, PA SPEAKING FEES $4,700.00
+ TOTAL "" "" "" $5,010.33
+ AARON, MAUREEN, M "" MARTINSVILLE, VA MEALS $193.67
+ TOTAL "" "" "" $193.67
+ AARON, MICHAEL, L "" WEST ISLIP, NY MEALS $19.50
+ TOTAL "" "" "" $19.50
+ AARON, MICHAEL, R "" BROOKLYN, NY MEALS $65.92
+ TOTAL "" "" "" $65.92
+ AARONS, MARK, G "" PINEHURST, NC MEALS $154.19
+ TOTAL "" "" "" $154.19
+ AARONSON, GARY, A "" PHILADELPHIA, PA MEALS $205.17
+ TOTAL "" "" "" $205.17
+ AARONSON, ROBERT, M "" TUCSON, AZ MEALS $24.38
+ TOTAL "" "" "" $24.38
+ AASHEIM, RICHARD, J "" GREENEVILLE, TN EDUCATIONAL ITEMS $2.27
+ AASHEIM, RICHARD, J "" GREENEVILLE, TN MEALS $100.76
+ TOTAL "" "" "" $103.03
+ AASMAA, SIRIKE, T "" MONTVILLE, NJ MEALS $53.33
+ TOTAL "" "" "" $53.33
+ AAZAMI, HESSAM "" GRANADA HILLS, CA MEALS $402.90
+ TOTAL "" "" "" $402.90
+ ABAABA, ABIEDU, C "" JACKSONVILLE, FL MEALS $13.49
+ TOTAL "" "" "" $13.49
+ ABABNEH, ALAELDIN, A "" KANSAS CITY, KS MEALS $10.31
+ TOTAL "" "" "" $10.31
+ ABAD, ANTONIO, A "" CORAL SPRINGS, FL MEALS $516.29
+ TOTAL "" "" "" $516.29
+ ABADEER, PETER, S "" NORMAL, IL MEALS $200.38
+ TOTAL "" "" "" $200.38
+ ABAD, ENZO, L "" MIAMI, FL MEALS $67.61
+ TOTAL "" "" "" $67.61
+ ABADIAN SHARIFABAD, MANOOCHEHR "" GRANADA HILLS, CA MEALS $12.37
+ TOTAL "" "" "" $12.37
+ ABADI, CHRISTOPHER, A "" WARWICK, RI MEALS $157.42
+ TOTAL "" "" "" $157.42
+ ABADIE, MARCUS, G "" ATHENS, TX MEALS $361.89
+ TOTAL "" "" "" $361.89
+ ABADI, JAMSHEED, S "" BROOKLYN, NY MEALS $363.40
+ TOTAL "" "" "" $363.40
+ ABADILLA, JUNE, E "" JACKSON, KY MEALS $105.33
+ TOTAL "" "" "" $105.33
+ ABAD, JOHN, P "" NEWARK, OH MEALS $347.64
+ TOTAL "" "" "" $347.64
+ ABAD, JOSE, F "" FOLSOM, CA MEALS $30.28
+ TOTAL "" "" "" $30.28
+ ABAD, REMEDIOS, D "" WILNINGTON, DE MEALS $26.85
+ TOTAL "" "" "" $26.85
+ ABAD, SO KIM, F "" WICHITA FALLS, TX MEALS $136.52
+ TOTAL "" "" "" $136.52
+ ABAD, ZOILO, R "" MIAMI, FL MEALS $93.83
+ TOTAL "" "" "" $93.83
+ ABALIHI, CAROL, N "" EL PASO, TX MEALS $88.48
+ TOTAL "" "" "" $88.48
+ ABALOS, ANNA, T "" ROSEVILLE, CA MEALS $178.60
+ TOTAL "" "" "" $178.60
+ ABALOS, ARTURO, Z "" DELANO, CA MEALS $48.06
+ TOTAL "" "" "" $48.06
+ ABALOS, JOSEPH, M "" SENECA, PA MEALS $39.03
+ TOTAL "" "" "" $39.03
+ ABANDO, JOSE, R "" DAYTONA BEACH, FL MEALS $83.44
+ TOTAL "" "" "" $83.44
+ ABANG, ANTHONY, E "" ELIZABETHTOWN, KY MEALS $12.62
+ TOTAL "" "" "" $12.62
+ ABAN, KENRIC, T "" SAN DIEGO, CA MEALS $11.91
+ TOTAL "" "" "" $11.91
+ ABAQUETA, ALVIN, Y "" CHARLOTTE, NC MEALS $233.71
+ TOTAL "" "" "" $233.71
+ ABARCA, SERGIO, O "" TOOELE, UT MEALS $159.58
+ TOTAL "" "" "" $159.58
+ ABARIKWU, CONSTANTIA, A "" PHOENIX, AZ MEALS $153.57
+ TOTAL "" "" "" $153.57
+ ABASHIDZE, TEAH, A "" CLEVELAND, OH MEALS $153.59
+ TOTAL "" "" "" $153.59
Binary file
Binary file
data/test/data/spanning_cells.csv ADDED
@@ -0,0 +1,21 @@
+ Improved operation scenario,"","","","",""
+ Volume servers in:,2007,2008,2009,2010,2011
+ Server closets,"1,505","1,580","1,643","1,673","1,689"
+ Server rooms,"1,512","1,586","1,646","1,677","1,693"
+ Localized data centers,"1,512","1,586","1,646","1,677","1,693"
+ Mid-tier data centers,"1,512","1,586","1,646","1,677","1,693"
+ Enterprise-class data centers,"1,512","1,586","1,646","1,677","1,693"
+ Best practice scenario,"","","","",""
+ Volume servers in:,2007,2008,2009,2010,2011
+ Server closets,"1,456","1,439","1,386","1,296","1,326"
+ Server rooms,"1,465","1,472","1,427","1,334","1,371"
+ Localized data centers,"1,465","1,471","1,426","1,334","1,371"
+ Mid-tier data centers,"1,465","1,471","1,426","1,334","1,371"
+ Enterprise-class data centers,"1,465","1,471","1,426","1,334","1,371"
+ State-of-the-art scenario,"","","","",""
+ Volume servers in:,2007,2008,2009,2010,2011
+ Server closets,"1,485","1,471","1,424","1,315","1,349"
+ Server rooms,"1,495","1,573","1,586","1,424","1,485"
+ Localized data centers,"1,495","1,572","1,585","1,424","1,485"
+ Mid-tier data centers,"1,495","1,572","1,585","1,424","1,485"
+ Enterprise-class data centers,"1,495","1,572","1,585","1,424","1,485"
Binary file
Binary file
Binary file
data/test/heuristic.rb ADDED
@@ -0,0 +1,50 @@
+ #a list of filenames and the correct answer
+ # no more bs.
+ require_relative '../lib/tabula'
+
+
+ should_use_spreadsheet = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "spreadsheet/*") ).map{|a| [a, true]}
+ should_use_original = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "original/*") ).map{|a| [a, false]}
+
+ correct = []
+ misclassified_as_original = []
+ misclassified_as_spreadsheet = []
+
+
+
+ def heuristic(page)
+ page.is_tabular?
+ end
+
+ (should_use_spreadsheet + should_use_original) .each do |filename, expected_to_be_tabular|
+ extractor = Tabula::Extraction::ObjectExtractor.new(filename, [1])
+
+ page = extractor.extract.first
+ page.get_ruling_lines!
+ # puts "#{File.basename(filename)} | #{expected_to_be_tabular}"
+ page_is_tabular = heuristic(page)
+ # puts ""
+
+ if page_is_tabular && expected_to_be_tabular || !page_is_tabular && !expected_to_be_tabular
+ correct << filename
+ elsif page_is_tabular && !expected_to_be_tabular
+ misclassified_as_spreadsheet << filename
+ elsif !page_is_tabular && expected_to_be_tabular
+ misclassified_as_original << filename
+ end
+ end
+
+ puts "#{correct.size} PDFs were correctly classified"
+ puts "#{misclassified_as_original.size + misclassified_as_spreadsheet.size} PDFs were incorrectly classified"
+ unless misclassified_as_spreadsheet.empty?
+ puts "#{misclassified_as_spreadsheet.size} PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"
+ misclassified_as_spreadsheet.each do |filename|
+ puts " - #{File.basename(filename)}"
+ end
+ end
+ unless misclassified_as_original.empty?
+ puts "#{misclassified_as_original.size} PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"
+ misclassified_as_original.each do |filename|
+ puts " - #{File.basename(filename)}"
+ end
+ end
data/test/test_bin_tabula.sh ADDED
@@ -0,0 +1,7 @@
+ bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf --silent -o test.csv
+ bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf -o test.csv
+ bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv
+ bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv --format TSV
+ bin/tabula test/data/campaign_donors.pdf -o test.csv --columns 47,147,256,310,375,431,504 #columns should work
+ bin/tabula test/data/argentina_diputados_voting_record.pdf --guess -o test.csv --format TSV #should exclude guff
+ bin/tabula test/data/vertical_rulings_bug.pdf --area 250,0,325,1700 -o test.csv --format TSV #should be only a few lines
data/test/tests.rb CHANGED
@@ -6,10 +6,109 @@ require_relative '../lib/tabula'

  def lines_to_array(lines)
  lines.map { |l|
- l.map { |te| te.text }
+ l.map { |te| te.text.strip }
  }
  end

+ def lines_to_table(lines)
+ Tabula::Table.new_from_array(lines_to_array(lines))
+ end
+
+
+ # I don't want to pollute the "real" classes and add a funny inspect method. Just for testing comparisons.
+ module Tabula
+ class Table
+ def inspect
+ "[" + lines.map(&:inspect).join(",") + "]"
+ end
+ end
+ end
+
+ module Tabula
+ class Line
+ def inspect
+ @text_elements.map(&:text).inspect
+ end
+ end
+ end
+
+
+ class TestEntityComparability < Minitest::Test
+ def test_text_element_comparability
+ base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
+
+ two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
+ three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
+ four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
+
+ five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
+ six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
+ seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
+ assert_equal base, two
+ assert_equal base, three
+ assert_equal base, four
+
+ refute_equal base, five
+ refute_equal base, six
+ refute_equal base, seven
+ end
+
+ def test_line_comparability
+ text_base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
+
+ text_two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
+ text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
+ text_four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
+
+ text_five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
+ text_six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
+ text_seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
+ line_base = Tabula::Line.new
+ line_base.text_elements = [text_base, text_two, text_three]
+ line_equal = Tabula::Line.new
+ line_equal.text_elements = [text_base, text_two, text_three]
+ line_equal_but_longer = Tabula::Line.new
+ line_equal_but_longer.text_elements = [text_base, text_two, text_three, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
+ line_unequal = Tabula::Line.new
+ line_unequal.text_elements = [text_base, text_two, text_three, text_five]
+ line_unequal_and_longer = Tabula::Line.new
+ line_unequal_and_longer.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
+ line_unequal_and_longer_and_different = Tabula::Line.new
+ line_unequal_and_longer_and_different.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, 'whatever']
+
+ assert_equal line_base, line_equal
+ assert_equal line_base, line_equal_but_longer
+ refute_equal line_base, line_unequal
+ refute_equal line_base, line_unequal_and_longer
+ refute_equal line_base, line_unequal_and_longer_and_different
+ end
+
+ def test_table_comparability
+ rows_base = [["a", "b", "c"], ['', 'd', '']]
+ rows_equal = [["a", "b", "c"], ['', 'd']]
+ rows_equal_padded = [['', "a", "b", "c"], ['', '', 'd']]
+ rows_unequal_one = [["a", "b", "c"], ['d']]
+ rows_unequal_two = [["a", "b", "c"], ['d', '']]
+ rows_unequal_three = [["a", "b", "c"], ['d'], ['a','b', 'd']]
+ rows_unequal_four = [["a", "b", "c"]]
+
+ table_base = Tabula::Table.new_from_array(rows_base)
+ table_equal = Tabula::Table.new_from_array(rows_equal)
+ table_equal_column_padded = Tabula::Table.new_from_array(rows_equal_padded)
+ table_unequal_one = Tabula::Table.new_from_array(rows_unequal_one)
+ table_unequal_two = Tabula::Table.new_from_array(rows_unequal_two)
+ table_unequal_three = Tabula::Table.new_from_array(rows_unequal_three)
+ table_unequal_four = Tabula::Table.new_from_array(rows_unequal_four)
+
+ assert_equal table_base, table_equal
+ assert_equal table_base, table_equal_column_padded
+ refute_equal table_base, table_unequal_one
+ refute_equal table_base, table_unequal_two
+ refute_equal table_base, table_unequal_three
+ refute_equal table_base, table_unequal_four
+ end
+ end
+
  class TestPagesInfoExtractor < Minitest::Test
  def test_pages_info_extractor
  extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -24,42 +123,74 @@ class TestPagesInfoExtractor < Minitest::Test
  end

  class TestTableGuesser < Minitest::Test
+ def test_find_rects_from_lines_with_lsd
+ skip "Skipping until we actually use LSD"
+ filename = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
+ page_index = 0
+ lines = Tabula::Extraction::LineExtractor.lines_in_pdf_page(filename, page_index, :render_pdf => true)
+
+ page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
+ page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
+ expected_page_areas = [[54.087890625, 50.203125, 734.220703125, 550.44140625]]
+ assert_equal expected_page_areas, page_areas
+ end
+
  end

  class TestDumper < Minitest::Test

  def test_extractor
- extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
- page = extractor.extract.first
+ extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
+ page = extractor.extract.next
  assert_instance_of Tabula::Page, page
  end

  def test_get_by_area
-
- # http://localhost:8080/debug/418b1d5698e5c7b724551d9610c071ab3063275c/characters?x1=57.921428571428564&x2=290.7&y1=107.1&y2=394.52142857142854&page=1&use_lines=false
- extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
+ extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
  characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
  assert_equal characters.size, 206
  end
  end

+ class TestRulingIntersection < Minitest::Test
+ def test_ruling_intersection
+ horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
+ verticals = [Tabula::Ruling.new(1, 3, 0, 11),
+ Tabula::Ruling.new(1, 4, 0, 11)]
+ ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
+ assert_equal 2, ints.size
+ assert_equal ints[0][0].getX, 3.0
+ assert_equal ints[0][0].getY, 10.0
+ assert_equal ints[1][0].getX, 4.0
+ assert_equal ints[1][0].getY, 10.0
+
+ verticals = [Tabula::Ruling.new(20, 3, 0, 11)]
+ ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
+ assert_equal ints.size, 0
+ end
+ end
+
  class TestExtractor < Minitest::Test

  def test_table_extraction_1
- character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
- characters = character_extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
- table = lines_to_array Tabula.make_table(characters)
- expected = [["Prior Scale ", "New Scale ", "% Rank* "], ["800 ", "170 ", "99 "], ["790 ", "170 ", "99 "], ["780 ", "170 ", "99 "], ["770 ", "170 ", "99 "], ["760 ", "170 ", "99 "], ["750 ", "169 ", "99 "], ["740 ", "169 ", "99 "], ["730 ", "168 ", "98 "], ["720 ", "168 ", "98 "], ["710 ", "167 ", "97 "], ["700 ", "166 ", "96 "], ["690 ", "165 ", "95 "], ["680 ", "165 ", "95 "], ["670 ", "164 ", "93 "], ["660 ", "164 ", "93 "], ["650 ", "163 ", "91 "]]
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
+ 1,
+ [107.1, 57.9214, 394.5214, 290.7],
+ :detect_ruling_lines => false)
+
+ expected = [["Prior Scale","New Scale","% Rank*"], ["800","170","99"], ["790","170","99"], ["780","170","99"], ["770","170","99"], ["760","170","99"], ["750","169","99"], ["740","169","99"], ["730","168","98"], ["720","168","98"], ["710","167","97"], ["700","166","96"], ["690","165","95"], ["680","165","95"], ["670","164","93"], ["660","164","93"], ["650","163","91"]]
+
  assert_equal expected, table
  end

  def test_diputados_voting_record
- character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
- characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
+ 1,
+ [269.875, 12.75, 790.5, 561])

  expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]

- assert_equal expected, lines_to_array(Tabula.make_table(characters))
+ assert_equal expected, table
  end

  def test_forest_disclosure_report_dont_regress
@@ -67,80 +198,362 @@ class TestExtractor < Minitest::Test
  # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
  # and a solution for half-x-height-offset lines.
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
- character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
- lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
- vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }

+ table = lines_to_table Tabula.extract_table(pdf_file_path,
+ 1,
+ [106.01, 48.09, 227.31, 551.89],
+ :detect_ruling_lines => true)

- characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
- #top left bottom right
- expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
- ['TOTAL', '', '', '','$85.00'],
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
- ['TOTAL', '', '', '', '$471.25'],
- ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
- ['TOTAL', '', '', '','$20.39'],
- ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
- ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
- ['TOTAL', '', '', '', '$5,010.33'],
- ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
- ['TOTAL', '', '', '', '$193.67'],
- ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
-
- assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
+ expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
+
+
+ assert_equal expected, table
  end

  def test_missing_spaces_around_an_ampersand
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
- character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
- lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
- vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
+ character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
+ page_obj = character_extractor.extract.next
+ lines = page_obj.ruling_lines
+ vertical_rulings = lines.select(&:vertical?)

+ area = [170, 28, 185, 833] #top left bottom right

- characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
- #top left bottom right
- expected = [
- ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
- ]
+ expected = Tabula::Table.new_from_array([
+ ["", "REGIONAL PULMONARY & SLEEP",],
+ ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
+ ["", "MEDICINE", ],
+ ])

- assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
+ assert_equal expected, lines_to_table(page_obj.get_area(area).make_table(:vertical_rulings => vertical_rulings))
  end

  def test_forest_disclosure_report
  skip "Skipping until we support multiline cells"
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
- character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
+ character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
  lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
- vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
+ vertical_rulings = lines.select(&:vertical?) #.uniq{|line| (line.left / 10).round }

- characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
+ page_obj = character_extractor.extract.next
+ characters = page_obj.get_text([110, 28, 218, 833])
  #top left bottom right
- expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
- ['TOTAL', '', '', '','$85.00'],
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
- ['TOTAL', '', '', '', '$471.25'],
- ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
- ['TOTAL', '', '', '','$20.39'],
- ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
- ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '$4,700.00'],
- ['TOTAL', '', '', '', '$5,010.33'],
- ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
- ['TOTAL', '', '', '', '$193.67'],
- ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
-
- assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
+ expected = Tabula::Table.new_from_array([
+ ['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '', '$85.00'],
+ ['TOTAL', '', '', '','$85.00'],
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '', '$78.80'],
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '', '$392.45'],
+ ['TOTAL', '', '', '', '$471.25'],
+ ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '', '$20.39'],
+ ['TOTAL', '', '', '','$20.39'],
+ ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '', '$310.33'],
+ ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '', '$4,700.00'],
+ ['TOTAL', '', '', '', '$5,010.33'],
+ ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '', '$193.67'],
+ ['TOTAL', '', '', '', '$193.67'],
+ ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '', '$19.50']
+ ])
+
+ assert_equal expected, lines_to_table(Tabula.make_table(characters, :vertical_rulings => vertical_rulings))
  end

  # TODO Spaces inserted in words - fails
  def test_bo_page24
- character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
- characters = character_extractor.extract.next.get_text([435.625, 53.125, 585.7142857142857, 810.5357142857142])
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
+ 1,
+ [425.625, 53.125, 575.714, 810.535],
+ :detect_ruling_lines => false)

  expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
- assert_equal expected, lines_to_array(Tabula.make_table(characters))
+
+ assert_equal expected, table
+ end
+
+
+ def test_vertical_rulings_splitting_words
+ #if a vertical ruling crosses over a word, the word should be split at that vertical ruling
+ # before, the entire word would end up on one side of the vertical ruling.
+ pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))
+
+ #both of these are semantically "correct"; the difference is in how we handle multi-line cells
+ expected = Tabula::Table.new_from_array([
+ ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
+ ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
+ ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
+ ])
+ other_expected = Tabula::Table.new_from_array([
+ ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
+ ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
+ ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH", "", "", "$22.93", "", "", "$22.93"],
+ ["", "", "", "ABRAHAMSON"]
+ ])
+
+ #N.B. it's "MORGANTOWN", "WV" that we're most interested in here (it used to show up as ["MORGANTOWNWV", "", ""])
+
+
+ extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2) #:all ) # 1..2643
+ extractor.extract.each_with_index do |pdf_page, page_index|
+
+ page_areas = [[250, 0, 325, 1700]]
+
+ scale_factor = pdf_page.width / 1700
+
+ vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000)}
+
+ tables = page_areas.map do |page_area|
+ pdf_page.get_area(page_area).make_table(:vertical_rulings => vertical_rulings)
+ end
+ assert_equal expected, lines_to_table(tables.first)
+ end
+ end
+
+ def test_vertical_rulings_prevent_merging_of_columns
+ expected = [["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"], ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]]
+
+ vertical_rulings = [47,147,256,310,375,431,504].map{|n| Tabula::Ruling.new(0, n, 0, 1000)}
+
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
+ 1,
+ [255.57,40.43,398.76,557.35],
+ :vertical_rulings => vertical_rulings)
+
+ assert_equal expected, table
+ end
+
+ def test_get_spacing_and_merging_right
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
+ 1,
+ [52.32857142857143,15.557142857142859,128.70000000000002,767.9571428571429],
+ :detect_ruling_lines => true)
+
+ expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia ", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]
+
+ assert_equal expected, table
+
+ end
+
+
+ class SpreadsheetsHasCellsTester
+ include Tabula::HasCells
+ attr_accessor :cells
+ def initialize(cells)
+ @cells = cells
+ end
+ end
+
+ #just tests the algorithm
+ def test_cells_to_spreadsheets
+
+ cells = [Tabula::Cell.new(40.0, 18.0, 208.0, 4.0), Tabula::Cell.new(44.0, 18.0, 52.0, 6.0),
+ Tabula::Cell.new(50.0, 18.0, 52.0, 4.0), Tabula::Cell.new(54.0, 18.0, 52.0, 6.0),
+ Tabula::Cell.new(60.0, 18.0, 52.0, 4.0), Tabula::Cell.new(64.0, 18.0, 52.0, 6.0),
+ Tabula::Cell.new(70.0, 18.0, 52.0, 4.0), Tabula::Cell.new(74.0, 18.0, 52.0, 6.0),
+ Tabula::Cell.new(90.0, 18.0, 52.0, 4.0), Tabula::Cell.new(94.0, 18.0, 52.0, 6.0),
+ Tabula::Cell.new(100.0, 18.0, 52.0, 28.0), Tabula::Cell.new(128.0, 18.0, 52.0, 4.0),
+ Tabula::Cell.new(132.0, 18.0, 52.0, 64.0), Tabula::Cell.new(196.0, 18.0, 52.0, 66.0),
+ Tabula::Cell.new(262.0, 18.0, 52.0, 4.0), Tabula::Cell.new(266.0, 18.0, 52.0, 84.0),
+ Tabula::Cell.new(350.0, 18.0, 52.0, 4.0), Tabula::Cell.new(354.0, 18.0, 52.0, 32.0),
+ Tabula::Cell.new(386.0, 18.0, 52.0, 38.0), Tabula::Cell.new(424.0, 18.0, 52.0, 18.0),
+ Tabula::Cell.new(442.0, 18.0, 52.0, 74.0), Tabula::Cell.new(516.0, 18.0, 52.0, 28.0),
+ Tabula::Cell.new(544.0, 18.0, 52.0, 4.0), Tabula::Cell.new(44.0, 70.0, 156.0, 6.0),
+ Tabula::Cell.new(50.0, 70.0, 156.0, 4.0), Tabula::Cell.new(54.0, 70.0, 156.0, 6.0),
+ Tabula::Cell.new(60.0, 70.0, 156.0, 4.0), Tabula::Cell.new(64.0, 70.0, 156.0, 6.0),
+ Tabula::Cell.new(70.0, 70.0, 156.0, 4.0), Tabula::Cell.new(74.0, 70.0, 156.0, 6.0),
+ Tabula::Cell.new(84.0, 70.0, 2.0, 6.0), Tabula::Cell.new(90.0, 70.0, 156.0, 4.0),
+ Tabula::Cell.new(94.0, 70.0, 156.0, 6.0), Tabula::Cell.new(100.0, 70.0, 156.0, 28.0),
+ Tabula::Cell.new(128.0, 70.0, 156.0, 4.0), Tabula::Cell.new(132.0, 70.0, 156.0, 64.0),
+ Tabula::Cell.new(196.0, 70.0, 156.0, 66.0), Tabula::Cell.new(262.0, 70.0, 156.0, 4.0),
+ Tabula::Cell.new(266.0, 70.0, 156.0, 84.0), Tabula::Cell.new(350.0, 70.0, 156.0, 4.0),
+ Tabula::Cell.new(354.0, 70.0, 156.0, 32.0), Tabula::Cell.new(386.0, 70.0, 156.0, 38.0),
+ Tabula::Cell.new(424.0, 70.0, 156.0, 18.0), Tabula::Cell.new(442.0, 70.0, 156.0, 74.0),
+ Tabula::Cell.new(516.0, 70.0, 156.0, 28.0), Tabula::Cell.new(544.0, 70.0, 156.0, 4.0),
+ Tabula::Cell.new(84.0, 72.0, 446.0, 6.0), Tabula::Cell.new(90.0, 226.0, 176.0, 4.0),
+ Tabula::Cell.new(94.0, 226.0, 176.0, 6.0), Tabula::Cell.new(100.0, 226.0, 176.0, 28.0),
+ Tabula::Cell.new(128.0, 226.0, 176.0, 4.0), Tabula::Cell.new(132.0, 226.0, 176.0, 64.0),
+ Tabula::Cell.new(196.0, 226.0, 176.0, 66.0), Tabula::Cell.new(262.0, 226.0, 176.0, 4.0),
+ Tabula::Cell.new(266.0, 226.0, 176.0, 84.0), Tabula::Cell.new(350.0, 226.0, 176.0, 4.0),
+ Tabula::Cell.new(354.0, 226.0, 176.0, 32.0), Tabula::Cell.new(386.0, 226.0, 176.0, 38.0),
+ Tabula::Cell.new(424.0, 226.0, 176.0, 18.0), Tabula::Cell.new(442.0, 226.0, 176.0, 74.0),
+ Tabula::Cell.new(516.0, 226.0, 176.0, 28.0), Tabula::Cell.new(544.0, 226.0, 176.0, 4.0),
+ Tabula::Cell.new(90.0, 402.0, 116.0, 4.0), Tabula::Cell.new(94.0, 402.0, 116.0, 6.0),
+ Tabula::Cell.new(100.0, 402.0, 116.0, 28.0), Tabula::Cell.new(128.0, 402.0, 116.0, 4.0),
+ Tabula::Cell.new(132.0, 402.0, 116.0, 64.0), Tabula::Cell.new(196.0, 402.0, 116.0, 66.0),
+ Tabula::Cell.new(262.0, 402.0, 116.0, 4.0), Tabula::Cell.new(266.0, 402.0, 116.0, 84.0),
+ Tabula::Cell.new(350.0, 402.0, 116.0, 4.0), Tabula::Cell.new(354.0, 402.0, 116.0, 32.0),
+ Tabula::Cell.new(386.0, 402.0, 116.0, 38.0), Tabula::Cell.new(424.0, 402.0, 116.0, 18.0),
+ Tabula::Cell.new(442.0, 402.0, 116.0, 74.0), Tabula::Cell.new(516.0, 402.0, 116.0, 28.0),
+ Tabula::Cell.new(544.0, 402.0, 116.0, 4.0), Tabula::Cell.new(84.0, 518.0, 246.0, 6.0),
+ Tabula::Cell.new(90.0, 518.0, 186.0, 4.0), Tabula::Cell.new(94.0, 518.0, 186.0, 6.0),
+ Tabula::Cell.new(100.0, 518.0, 186.0, 28.0), Tabula::Cell.new(128.0, 518.0, 186.0, 4.0),
+ Tabula::Cell.new(132.0, 518.0, 186.0, 64.0), Tabula::Cell.new(196.0, 518.0, 186.0, 66.0),
+ Tabula::Cell.new(262.0, 518.0, 186.0, 4.0), Tabula::Cell.new(266.0, 518.0, 186.0, 84.0),
+ Tabula::Cell.new(350.0, 518.0, 186.0, 4.0), Tabula::Cell.new(354.0, 518.0, 186.0, 32.0),
+ Tabula::Cell.new(386.0, 518.0, 186.0, 38.0), Tabula::Cell.new(424.0, 518.0, 186.0, 18.0),
+ Tabula::Cell.new(442.0, 518.0, 186.0, 74.0), Tabula::Cell.new(516.0, 518.0, 186.0, 28.0),
+ Tabula::Cell.new(544.0, 518.0, 186.0, 4.0), Tabula::Cell.new(90.0, 704.0, 60.0, 4.0),
+ Tabula::Cell.new(94.0, 704.0, 60.0, 6.0), Tabula::Cell.new(100.0, 704.0, 60.0, 28.0),
+ Tabula::Cell.new(128.0, 704.0, 60.0, 4.0), Tabula::Cell.new(132.0, 704.0, 60.0, 64.0),
+ Tabula::Cell.new(196.0, 704.0, 60.0, 66.0), Tabula::Cell.new(262.0, 704.0, 60.0, 4.0),
+ Tabula::Cell.new(266.0, 704.0, 60.0, 84.0), Tabula::Cell.new(350.0, 704.0, 60.0, 4.0),
+ Tabula::Cell.new(354.0, 704.0, 60.0, 32.0), Tabula::Cell.new(386.0, 704.0, 60.0, 38.0),
+ Tabula::Cell.new(424.0, 704.0, 60.0, 18.0), Tabula::Cell.new(442.0, 704.0, 60.0, 74.0),
+ Tabula::Cell.new(516.0, 704.0, 60.0, 28.0), Tabula::Cell.new(544.0, 704.0, 60.0, 4.0),
+ Tabula::Cell.new(84.0, 764.0, 216.0, 6.0), Tabula::Cell.new(90.0, 764.0, 216.0, 4.0),
+ Tabula::Cell.new(94.0, 764.0, 216.0, 6.0), Tabula::Cell.new(100.0, 764.0, 216.0, 28.0),
+ Tabula::Cell.new(128.0, 764.0, 216.0, 4.0), Tabula::Cell.new(132.0, 764.0, 216.0, 64.0),
+ Tabula::Cell.new(196.0, 764.0, 216.0, 66.0), Tabula::Cell.new(262.0, 764.0, 216.0, 4.0),
+ Tabula::Cell.new(266.0, 764.0, 216.0, 84.0), Tabula::Cell.new(350.0, 764.0, 216.0, 4.0),
+ Tabula::Cell.new(354.0, 764.0, 216.0, 32.0), Tabula::Cell.new(386.0, 764.0, 216.0, 38.0),
+ Tabula::Cell.new(424.0, 764.0, 216.0, 18.0), Tabula::Cell.new(442.0, 764.0, 216.0, 74.0),
+ Tabula::Cell.new(516.0, 764.0, 216.0, 28.0), Tabula::Cell.new(544.0, 764.0, 216.0, 4.0)]
+
+
+ expected_spreadsheets = [Tabula::Spreadsheet.new(40.0, 18.0, 208.0, 40.0, nil, nil, nil, nil),
+ Tabula::Spreadsheet.new(84.0, 18.0, 962.0, 464.0,nil, nil, nil, nil)]
+
+ #compares spreadsheets on area only.
+ assert_equal expected_spreadsheets.map{|s| [s.x, s.y, s.width, s.height] },
+ SpreadsheetsHasCellsTester.new(cells).find_spreadsheets_from_cells.map{|a| s = a.getBounds; [s.x, s.y, s.width, s.height] }
+
+
+ end
+
+ def test_add_spanning_cells
+ skip "until I write it"
+ end
+
+ def test_add_placeholder_cells_to_funny_shaped_tables
+ skip "until I write it, cf 01005787B_Pakistan.pdf"
+ end
+
+ class CellsHasCellsTester
+ include Tabula::HasCells
+ attr_accessor :vertical_ruling_lines, :horizontal_ruling_lines, :cells
+ def initialize(vertical_ruling_lines, horizontal_ruling_lines)
+ @cells = []
+ @vertical_ruling_lines = vertical_ruling_lines
+ @horizontal_ruling_lines = horizontal_ruling_lines
+ find_cells!
+ end
+ end
+
+ #just tests the algorithm
+ def test_lines_to_cells
+ vertical_ruling_lines = [ Tabula::Ruling.new(40.0, 18.0, 0.0, 40.0),
+ Tabula::Ruling.new(44.0, 70.0, 0.0, 36.0),
+ Tabula::Ruling.new(40.0, 226.0, 0.0, 40.0)]
+
+ horizontal_ruling_lines = [ Tabula::Ruling.new(40.0, 18.0, 208.0, 0.0),
+ Tabula::Ruling.new(44.0, 18.0, 208.0, 0.0),
+ Tabula::Ruling.new(50.0, 18.0, 208.0, 0.0),
+ Tabula::Ruling.new(54.0, 18.0, 208.0, 0.0),
+ Tabula::Ruling.new(60.0, 18.0, 208.0, 0.0),
+ Tabula::Ruling.new(64.0, 18.0, 208.0, 0.0),
+ Tabula::Ruling.new(70.0, 18.0, 208.0, 0.0),
+ Tabula::Ruling.new(74.0, 18.0, 208.0, 0.0),
+ Tabula::Ruling.new(80.0, 18.0, 208.0, 0.0)]
+
+ expected_cells = [Tabula::Cell.new(40.0, 18.0, 208.0, 4.0), Tabula::Cell.new(44.0, 18.0, 52.0, 6.0),
+ Tabula::Cell.new(50.0, 18.0, 52.0, 4.0), Tabula::Cell.new(54.0, 18.0, 52.0, 6.0),
+ Tabula::Cell.new(60.0, 18.0, 52.0, 4.0), Tabula::Cell.new(64.0, 18.0, 52.0, 6.0),
+ Tabula::Cell.new(70.0, 18.0, 52.0, 4.0), Tabula::Cell.new(74.0, 18.0, 52.0, 6.0),
+ Tabula::Cell.new(44.0, 70.0, 156.0, 6.0), Tabula::Cell.new(50.0, 70.0, 156.0, 4.0),
+ Tabula::Cell.new(54.0, 70.0, 156.0, 6.0), Tabula::Cell.new(60.0, 70.0, 156.0, 4.0),
+ Tabula::Cell.new(64.0, 70.0, 156.0, 6.0), Tabula::Cell.new(70.0, 70.0, 156.0, 4.0),
+ Tabula::Cell.new(74.0, 70.0, 156.0, 6.0), ]
+
+ actual_cells = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines).cells
+ assert_equal Set.new(expected_cells), Set.new(actual_cells) #I don't care about order
  end

+ #this is the real deal!!
+ def test_extract_tabular_data_using_lines_and_spreadsheets
+ pdf_file_path = "./test/data/frx_2012_disclosure.pdf"
+ expected_data_path = "./test/data/frx_2012_disclosure.tsv"
+ expected = open(expected_data_path, 'r').read #.split("\n").map{|line| line.split("\t")}
+
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all).extract.each do |pdf_page|
+ spreadsheet = pdf_page.spreadsheets.first
+ assert_equal expected, spreadsheet.to_tsv
+ end
+ end
+
+ def test_cope_with_a_tableless_page
+ pdf_file_path = "./test/data/no_tables.pdf"
+
+ spreadsheets = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all, '',
+ :line_color_filter => lambda{|components| components.all?{|c| c < 0.1}}
+ ).extract.to_a.first.spreadsheets
+
+ assert_equal 0, spreadsheets.size
+ end
+
+ def test_spanning_cells
+ pdf_file_path = "./test/data/spanning_cells.pdf"
+ expected_data_path = "./test/data/spanning_cells.csv"
+ expected = open(expected_data_path, 'r').read
+
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
+ spreadsheet = pdf_page.spreadsheets.first
+ assert_equal expected, spreadsheet.to_csv
+ end
+ end
+
+ def test_almost_vertical_lines
+ pdf_file_path = "./test/data/puertos1.pdf"
+ top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
+ area = Tabula::ZoneEntity.new(top, left,
+ right - left, bottom - top)
+
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
+ rulings = Tabula::Ruling.crop_rulings_to_area(pdf_page.ruling_lines, area)
+ # TODO assertion not entirely correct, should do the trick for now
+ assert_equal 15, rulings.select(&:vertical?).count
+ end
+ end
+
+ def test_extract_spreadsheet_within_an_area
+ pdf_file_path = "./test/data/puertos1.pdf"
+ top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
+
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
+ area = pdf_page.get_area([top, left, bottom, right])
+ table = area.spreadsheets.first.to_a
+ assert_equal 15, table.length
+ assert_equal ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"], table.first
+ assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
+ end
+ end
+ end
+
+ class TestIsTabularHeuristic < Minitest::Test
+
+ EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', 'GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf']
+ NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf']
+
+ File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
+
+ def test_heuristic_detects_spreadsheets
+ EXPECTED_TO_BE_SPREADSHEET.each do |f|
+ path = File.expand_path('data/' + f, File.dirname(__FILE__))
+ extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
+ page = extractor.extract.first
+ page.get_ruling_lines!
+ assert page.is_tabular?
+ end
+ end
+
+ def test_heuristic_detects_non_spreadsheets
+ NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
+ path = File.expand_path('data/' + f, File.dirname(__FILE__))
+ extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
+ page = extractor.extract.first
+ page.get_ruling_lines!
+ assert !page.is_tabular?
+ end
+ end
+
+
+

  end