tabula-extractor 0.6.6-java → 0.7.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.6.6'
2
+ VERSION = '0.7.0'
3
3
  end
@@ -5,9 +5,9 @@ module Tabula
5
5
  module Writers
6
6
 
7
7
  def Writers.CSV(lines, output=$stdout)
8
- lines.each { |l|
8
+ lines.each do |l|
9
9
  output.write CSV.generate_line(l.map(&:text), row_sep: "\r\n")
10
- }
10
+ end
11
11
  end
12
12
 
13
13
  def Writers.JSON(lines, output=$stdout)
@@ -15,12 +15,11 @@ module Tabula
15
15
  end
16
16
 
17
17
  def Writers.TSV(lines, output=$stdout)
18
- lines.each { |l|
19
- output.write(l.map(&:text).join("\t") + "\n")
20
- }
18
+ lines.each do |l|
19
+ output.write CSV.generate_line(l.map(&:text), col_sep: "\t", row_sep: "\r\n")
20
+ end
21
21
  end
22
22
 
23
-
24
23
  def Writers.HTML(lines, output=$stdout)
25
24
  raise "not implemented"
26
25
  end
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
24
24
  s.add_development_dependency 'minitest'
25
25
  s.add_development_dependency 'bundler', '>= 1.3.4'
26
26
  s.add_development_dependency 'ruby-debug'
27
+ s.add_development_dependency 'pry'
27
28
 
28
29
  s.add_runtime_dependency "trollop", ["~> 2.0"]
29
30
  # s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,88 @@
1
+ FOREST LABORATORIES, INC. DISCLOSURE REPORT "" "" "" ""
2
+ Calendar Year - 2012 "" "" "" ""
3
+ Physician Related Entity (if applicable) City / State Purpose of Payment Amount ($USD) * **
4
+ AALAEI, BEHZAD "" HIGHLAND, IN MEALS $51.24
5
+ TOTAL "" "" "" $51.24
6
+ AAMODT, DENISE, E "" ALBUQUERQUE, NM MEALS $66.12
7
+ TOTAL "" "" "" $66.12
8
+ AANONSEN, DEBORAH, A "" STATEN ISLAND, NY MEALS $85.00
9
+ TOTAL "" "" "" $85.00
10
+ AARON, CAREN, T "" RICHMOND, VA EDUCATIONAL ITEMS $78.80
11
+ AARON, CAREN, T "" RICHMOND, VA MEALS $392.45
12
+ TOTAL "" "" "" $471.25
13
+ AARON, JOHN "" CLARKSVILLE, TN MEALS $20.39
14
+ TOTAL "" "" "" $20.39
15
+ AARON, JOSHUA, N "" WEST GROVE, PA MEALS $310.33
16
+ AARON, JOSHUA, N REGIONAL PULMONARY & SLEEP MEDICINE WEST GROVE, PA SPEAKING FEES $4,700.00
17
+ TOTAL "" "" "" $5,010.33
18
+ AARON, MAUREEN, M "" MARTINSVILLE, VA MEALS $193.67
19
+ TOTAL "" "" "" $193.67
20
+ AARON, MICHAEL, L "" WEST ISLIP, NY MEALS $19.50
21
+ TOTAL "" "" "" $19.50
22
+ AARON, MICHAEL, R "" BROOKLYN, NY MEALS $65.92
23
+ TOTAL "" "" "" $65.92
24
+ AARONS, MARK, G "" PINEHURST, NC MEALS $154.19
25
+ TOTAL "" "" "" $154.19
26
+ AARONSON, GARY, A "" PHILADELPHIA, PA MEALS $205.17
27
+ TOTAL "" "" "" $205.17
28
+ AARONSON, ROBERT, M "" TUCSON, AZ MEALS $24.38
29
+ TOTAL "" "" "" $24.38
30
+ AASHEIM, RICHARD, J "" GREENEVILLE, TN EDUCATIONAL ITEMS $2.27
31
+ AASHEIM, RICHARD, J "" GREENEVILLE, TN MEALS $100.76
32
+ TOTAL "" "" "" $103.03
33
+ AASMAA, SIRIKE, T "" MONTVILLE, NJ MEALS $53.33
34
+ TOTAL "" "" "" $53.33
35
+ AAZAMI, HESSAM "" GRANADA HILLS, CA MEALS $402.90
36
+ TOTAL "" "" "" $402.90
37
+ ABAABA, ABIEDU, C "" JACKSONVILLE, FL MEALS $13.49
38
+ TOTAL "" "" "" $13.49
39
+ ABABNEH, ALAELDIN, A "" KANSAS CITY, KS MEALS $10.31
40
+ TOTAL "" "" "" $10.31
41
+ ABAD, ANTONIO, A "" CORAL SPRINGS, FL MEALS $516.29
42
+ TOTAL "" "" "" $516.29
43
+ ABADEER, PETER, S "" NORMAL, IL MEALS $200.38
44
+ TOTAL "" "" "" $200.38
45
+ ABAD, ENZO, L "" MIAMI, FL MEALS $67.61
46
+ TOTAL "" "" "" $67.61
47
+ ABADIAN SHARIFABAD, MANOOCHEHR "" GRANADA HILLS, CA MEALS $12.37
48
+ TOTAL "" "" "" $12.37
49
+ ABADI, CHRISTOPHER, A "" WARWICK, RI MEALS $157.42
50
+ TOTAL "" "" "" $157.42
51
+ ABADIE, MARCUS, G "" ATHENS, TX MEALS $361.89
52
+ TOTAL "" "" "" $361.89
53
+ ABADI, JAMSHEED, S "" BROOKLYN, NY MEALS $363.40
54
+ TOTAL "" "" "" $363.40
55
+ ABADILLA, JUNE, E "" JACKSON, KY MEALS $105.33
56
+ TOTAL "" "" "" $105.33
57
+ ABAD, JOHN, P "" NEWARK, OH MEALS $347.64
58
+ TOTAL "" "" "" $347.64
59
+ ABAD, JOSE, F "" FOLSOM, CA MEALS $30.28
60
+ TOTAL "" "" "" $30.28
61
+ ABAD, REMEDIOS, D "" WILNINGTON, DE MEALS $26.85
62
+ TOTAL "" "" "" $26.85
63
+ ABAD, SO KIM, F "" WICHITA FALLS, TX MEALS $136.52
64
+ TOTAL "" "" "" $136.52
65
+ ABAD, ZOILO, R "" MIAMI, FL MEALS $93.83
66
+ TOTAL "" "" "" $93.83
67
+ ABALIHI, CAROL, N "" EL PASO, TX MEALS $88.48
68
+ TOTAL "" "" "" $88.48
69
+ ABALOS, ANNA, T "" ROSEVILLE, CA MEALS $178.60
70
+ TOTAL "" "" "" $178.60
71
+ ABALOS, ARTURO, Z "" DELANO, CA MEALS $48.06
72
+ TOTAL "" "" "" $48.06
73
+ ABALOS, JOSEPH, M "" SENECA, PA MEALS $39.03
74
+ TOTAL "" "" "" $39.03
75
+ ABANDO, JOSE, R "" DAYTONA BEACH, FL MEALS $83.44
76
+ TOTAL "" "" "" $83.44
77
+ ABANG, ANTHONY, E "" ELIZABETHTOWN, KY MEALS $12.62
78
+ TOTAL "" "" "" $12.62
79
+ ABAN, KENRIC, T "" SAN DIEGO, CA MEALS $11.91
80
+ TOTAL "" "" "" $11.91
81
+ ABAQUETA, ALVIN, Y "" CHARLOTTE, NC MEALS $233.71
82
+ TOTAL "" "" "" $233.71
83
+ ABARCA, SERGIO, O "" TOOELE, UT MEALS $159.58
84
+ TOTAL "" "" "" $159.58
85
+ ABARIKWU, CONSTANTIA, A "" PHOENIX, AZ MEALS $153.57
86
+ TOTAL "" "" "" $153.57
87
+ ABASHIDZE, TEAH, A "" CLEVELAND, OH MEALS $153.59
88
+ TOTAL "" "" "" $153.59
Binary file
Binary file
@@ -0,0 +1,21 @@
1
+ Improved operation scenario,"","","","",""
2
+ Volume servers in:,2007,2008,2009,2010,2011
3
+ Server closets,"1,505","1,580","1,643","1,673","1,689"
4
+ Server rooms,"1,512","1,586","1,646","1,677","1,693"
5
+ Localized data centers,"1,512","1,586","1,646","1,677","1,693"
6
+ Mid-tier data centers,"1,512","1,586","1,646","1,677","1,693"
7
+ Enterprise-class data centers,"1,512","1,586","1,646","1,677","1,693"
8
+ Best practice scenario,"","","","",""
9
+ Volume servers in:,2007,2008,2009,2010,2011
10
+ Server closets,"1,456","1,439","1,386","1,296","1,326"
11
+ Server rooms,"1,465","1,472","1,427","1,334","1,371"
12
+ Localized data centers,"1,465","1,471","1,426","1,334","1,371"
13
+ Mid-tier data centers,"1,465","1,471","1,426","1,334","1,371"
14
+ Enterprise-class data centers,"1,465","1,471","1,426","1,334","1,371"
15
+ State-of-the-art scenario,"","","","",""
16
+ Volume servers in:,2007,2008,2009,2010,2011
17
+ Server closets,"1,485","1,471","1,424","1,315","1,349"
18
+ Server rooms,"1,495","1,573","1,586","1,424","1,485"
19
+ Localized data centers,"1,495","1,572","1,585","1,424","1,485"
20
+ Mid-tier data centers,"1,495","1,572","1,585","1,424","1,485"
21
+ Enterprise-class data centers,"1,495","1,572","1,585","1,424","1,485"
Binary file
Binary file
Binary file
data/test/heuristic.rb ADDED
@@ -0,0 +1,50 @@
1
+ #a list of filenames and the correct answer
2
+ # no more bs.
3
+ require_relative '../lib/tabula'
4
+
5
+
6
+ should_use_spreadsheet = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "spreadsheet/*") ).map{|a| [a, true]}
7
+ should_use_original = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "original/*") ).map{|a| [a, false]}
8
+
9
+ correct = []
10
+ misclassified_as_original = []
11
+ misclassified_as_spreadsheet = []
12
+
13
+
14
+
15
+ def heuristic(page)
16
+ page.is_tabular?
17
+ end
18
+
19
+ (should_use_spreadsheet + should_use_original) .each do |filename, expected_to_be_tabular|
20
+ extractor = Tabula::Extraction::ObjectExtractor.new(filename, [1])
21
+
22
+ page = extractor.extract.first
23
+ page.get_ruling_lines!
24
+ # puts "#{File.basename(filename)} | #{expected_to_be_tabular}"
25
+ page_is_tabular = heuristic(page)
26
+ # puts ""
27
+
28
+ if page_is_tabular && expected_to_be_tabular || !page_is_tabular && !expected_to_be_tabular
29
+ correct << filename
30
+ elsif page_is_tabular && !expected_to_be_tabular
31
+ misclassified_as_spreadsheet << filename
32
+ elsif !page_is_tabular && expected_to_be_tabular
33
+ misclassified_as_original << filename
34
+ end
35
+ end
36
+
37
+ puts "#{correct.size} PDFs were correctly classified"
38
+ puts "#{misclassified_as_original.size + misclassified_as_spreadsheet.size} PDFs were incorrectly classified"
39
+ unless misclassified_as_spreadsheet.empty?
40
+ puts "#{misclassified_as_spreadsheet.size} PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"
41
+ misclassified_as_spreadsheet.each do |filename|
42
+ puts " - #{File.basename(filename)}"
43
+ end
44
+ end
45
+ unless misclassified_as_original.empty?
46
+ puts "#{misclassified_as_original.size} PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"
47
+ misclassified_as_original.each do |filename|
48
+ puts " - #{File.basename(filename)}"
49
+ end
50
+ end
@@ -0,0 +1,7 @@
1
+ bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf --silent -o test.csv
2
+ bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf -o test.csv
3
+ bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv
4
+ bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv --format TSV
5
+ bin/tabula test/data/campaign_donors.pdf -o test.csv --columns 47,147,256,310,375,431,504 #columns should work
6
+ bin/tabula test/data/argentina_diputados_voting_record.pdf --guess -o test.csv --format TSV #should exclude guff
7
+ bin/tabula test/data/vertical_rulings_bug.pdf --area 250,0,325,1700 -o test.csv --format TSV #should be only a few lines
data/test/tests.rb CHANGED
@@ -6,10 +6,109 @@ require_relative '../lib/tabula'
6
6
 
7
7
  def lines_to_array(lines)
8
8
  lines.map { |l|
9
- l.map { |te| te.text }
9
+ l.map { |te| te.text.strip }
10
10
  }
11
11
  end
12
12
 
13
+ def lines_to_table(lines)
14
+ Tabula::Table.new_from_array(lines_to_array(lines))
15
+ end
16
+
17
+
18
+ # I don't want to pollute the "real" class with a funny inspect method. Just for testing comparisons.
19
+ module Tabula
20
+ class Table
21
+ def inspect
22
+ "[" + lines.map(&:inspect).join(",") + "]"
23
+ end
24
+ end
25
+ end
26
+
27
+ module Tabula
28
+ class Line
29
+ def inspect
30
+ @text_elements.map(&:text).inspect
31
+ end
32
+ end
33
+ end
34
+
35
+
36
+ class TestEntityComparability < Minitest::Test
37
+ def test_text_element_comparability
38
+ base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
39
+
40
+ two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
41
+ three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
42
+ four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
43
+
44
+ five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
45
+ six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
46
+ seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
47
+ assert_equal base, two
48
+ assert_equal base, three
49
+ assert_equal base, four
50
+
51
+ refute_equal base, five
52
+ refute_equal base, six
53
+ refute_equal base, seven
54
+ end
55
+
56
+ def test_line_comparability
57
+ text_base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
58
+
59
+ text_two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
60
+ text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
61
+ text_four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
62
+
63
+ text_five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
64
+ text_six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
65
+ text_seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
66
+ line_base = Tabula::Line.new
67
+ line_base.text_elements = [text_base, text_two, text_three]
68
+ line_equal = Tabula::Line.new
69
+ line_equal.text_elements = [text_base, text_two, text_three]
70
+ line_equal_but_longer = Tabula::Line.new
71
+ line_equal_but_longer.text_elements = [text_base, text_two, text_three, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
72
+ line_unequal = Tabula::Line.new
73
+ line_unequal.text_elements = [text_base, text_two, text_three, text_five]
74
+ line_unequal_and_longer = Tabula::Line.new
75
+ line_unequal_and_longer.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
76
+ line_unequal_and_longer_and_different = Tabula::Line.new
77
+ line_unequal_and_longer_and_different.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, 'whatever']
78
+
79
+ assert_equal line_base, line_equal
80
+ assert_equal line_base, line_equal_but_longer
81
+ refute_equal line_base, line_unequal
82
+ refute_equal line_base, line_unequal_and_longer
83
+ refute_equal line_base, line_unequal_and_longer_and_different
84
+ end
85
+
86
+ def test_table_comparability
87
+ rows_base = [["a", "b", "c"], ['', 'd', '']]
88
+ rows_equal = [["a", "b", "c"], ['', 'd']]
89
+ rows_equal_padded = [['', "a", "b", "c"], ['', '', 'd']]
90
+ rows_unequal_one = [["a", "b", "c"], ['d']]
91
+ rows_unequal_two = [["a", "b", "c"], ['d', '']]
92
+ rows_unequal_three = [["a", "b", "c"], ['d'], ['a','b', 'd']]
93
+ rows_unequal_four = [["a", "b", "c"]]
94
+
95
+ table_base = Tabula::Table.new_from_array(rows_base)
96
+ table_equal = Tabula::Table.new_from_array(rows_equal)
97
+ table_equal_column_padded = Tabula::Table.new_from_array(rows_equal_padded)
98
+ table_unequal_one = Tabula::Table.new_from_array(rows_unequal_one)
99
+ table_unequal_two = Tabula::Table.new_from_array(rows_unequal_two)
100
+ table_unequal_three = Tabula::Table.new_from_array(rows_unequal_three)
101
+ table_unequal_four = Tabula::Table.new_from_array(rows_unequal_four)
102
+
103
+ assert_equal table_base, table_equal
104
+ assert_equal table_base, table_equal_column_padded
105
+ refute_equal table_base, table_unequal_one
106
+ refute_equal table_base, table_unequal_two
107
+ refute_equal table_base, table_unequal_three
108
+ refute_equal table_base, table_unequal_four
109
+ end
110
+ end
111
+
13
112
  class TestPagesInfoExtractor < Minitest::Test
14
113
  def test_pages_info_extractor
15
114
  extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -24,42 +123,74 @@ class TestPagesInfoExtractor < Minitest::Test
24
123
  end
25
124
 
26
125
  class TestTableGuesser < Minitest::Test
126
+ def test_find_rects_from_lines_with_lsd
127
+ skip "Skipping until we actually use LSD"
128
+ filename = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
129
+ page_index = 0
130
+ lines = Tabula::Extraction::LineExtractor.lines_in_pdf_page(filename, page_index, :render_pdf => true)
131
+
132
+ page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
133
+ page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
134
+ expected_page_areas = [[54.087890625, 50.203125, 734.220703125, 550.44140625]]
135
+ assert_equal expected_page_areas, page_areas
136
+ end
137
+
27
138
  end
28
139
 
29
140
  class TestDumper < Minitest::Test
30
141
 
31
142
  def test_extractor
32
- extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
33
- page = extractor.extract.first
143
+ extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
144
+ page = extractor.extract.next
34
145
  assert_instance_of Tabula::Page, page
35
146
  end
36
147
 
37
148
  def test_get_by_area
38
-
39
- # http://localhost:8080/debug/418b1d5698e5c7b724551d9610c071ab3063275c/characters?x1=57.921428571428564&x2=290.7&y1=107.1&y2=394.52142857142854&page=1&use_lines=false
40
- extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
149
+ extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
41
150
  characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
42
151
  assert_equal characters.size, 206
43
152
  end
44
153
  end
45
154
 
155
+ class TestRulingIntersection < Minitest::Test
156
+ def test_ruling_intersection
157
+ horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
158
+ verticals = [Tabula::Ruling.new(1, 3, 0, 11),
159
+ Tabula::Ruling.new(1, 4, 0, 11)]
160
+ ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
161
+ assert_equal 2, ints.size
162
+ assert_equal ints[0][0].getX, 3.0
163
+ assert_equal ints[0][0].getY, 10.0
164
+ assert_equal ints[1][0].getX, 4.0
165
+ assert_equal ints[1][0].getY, 10.0
166
+
167
+ verticals = [Tabula::Ruling.new(20, 3, 0, 11)]
168
+ ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
169
+ assert_equal ints.size, 0
170
+ end
171
+ end
172
+
46
173
  class TestExtractor < Minitest::Test
47
174
 
48
175
  def test_table_extraction_1
49
- character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
50
- characters = character_extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
51
- table = lines_to_array Tabula.make_table(characters)
52
- expected = [["Prior Scale ", "New Scale ", "% Rank* "], ["800 ", "170 ", "99 "], ["790 ", "170 ", "99 "], ["780 ", "170 ", "99 "], ["770 ", "170 ", "99 "], ["760 ", "170 ", "99 "], ["750 ", "169 ", "99 "], ["740 ", "169 ", "99 "], ["730 ", "168 ", "98 "], ["720 ", "168 ", "98 "], ["710 ", "167 ", "97 "], ["700 ", "166 ", "96 "], ["690 ", "165 ", "95 "], ["680 ", "165 ", "95 "], ["670 ", "164 ", "93 "], ["660 ", "164 ", "93 "], ["650 ", "163 ", "91 "]]
176
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
177
+ 1,
178
+ [107.1, 57.9214, 394.5214, 290.7],
179
+ :detect_ruling_lines => false)
180
+
181
+ expected = [["Prior Scale","New Scale","% Rank*"], ["800","170","99"], ["790","170","99"], ["780","170","99"], ["770","170","99"], ["760","170","99"], ["750","169","99"], ["740","169","99"], ["730","168","98"], ["720","168","98"], ["710","167","97"], ["700","166","96"], ["690","165","95"], ["680","165","95"], ["670","164","93"], ["660","164","93"], ["650","163","91"]]
182
+
53
183
  assert_equal expected, table
54
184
  end
55
185
 
56
186
  def test_diputados_voting_record
57
- character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
58
- characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
187
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
188
+ 1,
189
+ [269.875, 12.75, 790.5, 561])
59
190
 
60
191
  expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
61
192
 
62
- assert_equal expected, lines_to_array(Tabula.make_table(characters))
193
+ assert_equal expected, table
63
194
  end
64
195
 
65
196
  def test_forest_disclosure_report_dont_regress
@@ -67,80 +198,362 @@ class TestExtractor < Minitest::Test
67
198
  # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
68
199
  # and a solution for half-x-height-offset lines.
69
200
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
70
- character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
71
- lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
72
- vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
73
201
 
202
+ table = lines_to_table Tabula.extract_table(pdf_file_path,
203
+ 1,
204
+ [106.01, 48.09, 227.31, 551.89],
205
+ :detect_ruling_lines => true)
74
206
 
75
- characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
76
- #top left bottom right
77
- expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
78
- ['TOTAL', '', '', '','$85.00'],
79
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
80
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
81
- ['TOTAL', '', '', '', '$471.25'],
82
- ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
83
- ['TOTAL', '', '', '','$20.39'],
84
- ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
85
- ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
86
- ['TOTAL', '', '', '', '$5,010.33'],
87
- ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
88
- ['TOTAL', '', '', '', '$193.67'],
89
- ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
90
-
91
- assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
207
+ expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
208
+
209
+
210
+ assert_equal expected, table
92
211
  end
93
212
 
94
213
  def test_missing_spaces_around_an_ampersand
95
214
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
96
- character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
97
- lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
98
- vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
215
+ character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
216
+ page_obj = character_extractor.extract.next
217
+ lines = page_obj.ruling_lines
218
+ vertical_rulings = lines.select(&:vertical?)
99
219
 
220
+ area = [170, 28, 185, 833] #top left bottom right
100
221
 
101
- characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
102
- #top left bottom right
103
- expected = [
104
- ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
105
- ]
222
+ expected = Tabula::Table.new_from_array([
223
+ ["", "REGIONAL PULMONARY & SLEEP",],
224
+ ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
225
+ ["", "MEDICINE", ],
226
+ ])
106
227
 
107
- assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
228
+ assert_equal expected, lines_to_table(page_obj.get_area(area).make_table(:vertical_rulings => vertical_rulings))
108
229
  end
109
230
 
110
231
  def test_forest_disclosure_report
111
232
  skip "Skipping until we support multiline cells"
112
233
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
113
- character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
234
+ character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
114
235
  lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
115
- vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
236
+ vertical_rulings = lines.select(&:vertical?) #.uniq{|line| (line.left / 10).round }
116
237
 
117
- characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
238
+ page_obj = character_extractor.extract.next
239
+ characters = page_obj.get_text([110, 28, 218, 833])
118
240
  #top left bottom right
119
- expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
120
- ['TOTAL', '', '', '','$85.00'],
121
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
122
- ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
123
- ['TOTAL', '', '', '', '$471.25'],
124
- ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
125
- ['TOTAL', '', '', '','$20.39'],
126
- ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
127
- ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '$4,700.00'],
128
- ['TOTAL', '', '', '', '$5,010.33'],
129
- ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
130
- ['TOTAL', '', '', '', '$193.67'],
131
- ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
132
-
133
- assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
241
+ expected = Tabula::Table.new_from_array([
242
+ ['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '', '$85.00'],
243
+ ['TOTAL', '', '', '','$85.00'],
244
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '', '$78.80'],
245
+ ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '', '$392.45'],
246
+ ['TOTAL', '', '', '', '$471.25'],
247
+ ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '', '$20.39'],
248
+ ['TOTAL', '', '', '','$20.39'],
249
+ ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '', '$310.33'],
250
+ ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '', '$4,700.00'],
251
+ ['TOTAL', '', '', '', '$5,010.33'],
252
+ ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '', '$193.67'],
253
+ ['TOTAL', '', '', '', '$193.67'],
254
+ ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '', '$19.50']
255
+ ])
256
+
257
+ assert_equal expected, lines_to_table(Tabula.make_table(characters, :vertical_rulings => vertical_rulings))
134
258
  end
135
259
 
136
260
  # TODO Spaces inserted in words - fails
137
261
  def test_bo_page24
138
- character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
139
- characters = character_extractor.extract.next.get_text([435.625, 53.125, 585.7142857142857, 810.5357142857142])
262
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
263
+ 1,
264
+ [425.625, 53.125, 575.714, 810.535],
265
+ :detect_ruling_lines => false)
140
266
 
141
267
  expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
142
- assert_equal expected, lines_to_array(Tabula.make_table(characters))
268
+
269
+ assert_equal expected, table
270
+ end
271
+
272
+
273
+ def test_vertical_rulings_splitting_words
274
+ #if a vertical ruling crosses over a word, the word should be split at that vertical ruling
275
+ # before, the entire word would end up on one side of the vertical ruling.
276
+ pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))
277
+
278
+ #both of these are semantically "correct"; the difference is in how we handle multi-line cells
279
+ expected = Tabula::Table.new_from_array([
280
+ ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
281
+ ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
282
+ ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
283
+ ])
284
+ other_expected = Tabula::Table.new_from_array([
285
+ ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
286
+ ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
287
+ ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH", "", "", "$22.93", "", "", "$22.93"],
288
+ ["", "", "", "ABRAHAMSON"]
289
+ ])
290
+
291
+ #N.B. it's "MORGANTOWN", "WV" that we're most interested in here (it used to show up as ["MORGANTOWNWV", "", ""])
292
+
293
+
294
+ extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2) #:all ) # 1..2643
295
+ extractor.extract.each_with_index do |pdf_page, page_index|
296
+
297
+ page_areas = [[250, 0, 325, 1700]]
298
+
299
+ scale_factor = pdf_page.width / 1700
300
+
301
+ vertical_rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map{|n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000)}
302
+
303
+ tables = page_areas.map do |page_area|
304
+ pdf_page.get_area(page_area).make_table(:vertical_rulings => vertical_rulings)
305
+ end
306
+ assert_equal expected, lines_to_table(tables.first)
307
+ end
308
+ end
309
+
310
+ def test_vertical_rulings_prevent_merging_of_columns
311
+ expected = [["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"], ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]]
312
+
313
+ vertical_rulings = [47,147,256,310,375,431,504].map{|n| Tabula::Ruling.new(0, n, 0, 1000)}
314
+
315
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
316
+ 1,
317
+ [255.57,40.43,398.76,557.35],
318
+ :vertical_rulings => vertical_rulings)
319
+
320
+ assert_equal expected, table
321
+ end
322
+
323
+ def test_get_spacing_and_merging_right
324
+ table = lines_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
325
+ 1,
326
+ [52.32857142857143,15.557142857142859,128.70000000000002,767.9571428571429],
327
+ :detect_ruling_lines => true)
328
+
329
+ expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia ", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]
330
+
331
+ assert_equal expected, table
332
+
333
+ end
334
+
335
+
336
+ class SpreadsheetsHasCellsTester
337
+ include Tabula::HasCells
338
+ attr_accessor :cells
339
+ def initialize(cells)
340
+ @cells = cells
341
+ end
342
+ end
343
+
344
+ #just tests the algorithm
345
+ def test_cells_to_spreadsheets
346
+
347
+ cells = [Tabula::Cell.new(40.0, 18.0, 208.0, 4.0), Tabula::Cell.new(44.0, 18.0, 52.0, 6.0),
348
+ Tabula::Cell.new(50.0, 18.0, 52.0, 4.0), Tabula::Cell.new(54.0, 18.0, 52.0, 6.0),
349
+ Tabula::Cell.new(60.0, 18.0, 52.0, 4.0), Tabula::Cell.new(64.0, 18.0, 52.0, 6.0),
350
+ Tabula::Cell.new(70.0, 18.0, 52.0, 4.0), Tabula::Cell.new(74.0, 18.0, 52.0, 6.0),
351
+ Tabula::Cell.new(90.0, 18.0, 52.0, 4.0), Tabula::Cell.new(94.0, 18.0, 52.0, 6.0),
352
+ Tabula::Cell.new(100.0, 18.0, 52.0, 28.0), Tabula::Cell.new(128.0, 18.0, 52.0, 4.0),
353
+ Tabula::Cell.new(132.0, 18.0, 52.0, 64.0), Tabula::Cell.new(196.0, 18.0, 52.0, 66.0),
354
+ Tabula::Cell.new(262.0, 18.0, 52.0, 4.0), Tabula::Cell.new(266.0, 18.0, 52.0, 84.0),
355
+ Tabula::Cell.new(350.0, 18.0, 52.0, 4.0), Tabula::Cell.new(354.0, 18.0, 52.0, 32.0),
356
+ Tabula::Cell.new(386.0, 18.0, 52.0, 38.0), Tabula::Cell.new(424.0, 18.0, 52.0, 18.0),
357
+ Tabula::Cell.new(442.0, 18.0, 52.0, 74.0), Tabula::Cell.new(516.0, 18.0, 52.0, 28.0),
358
+ Tabula::Cell.new(544.0, 18.0, 52.0, 4.0), Tabula::Cell.new(44.0, 70.0, 156.0, 6.0),
359
+ Tabula::Cell.new(50.0, 70.0, 156.0, 4.0), Tabula::Cell.new(54.0, 70.0, 156.0, 6.0),
360
+ Tabula::Cell.new(60.0, 70.0, 156.0, 4.0), Tabula::Cell.new(64.0, 70.0, 156.0, 6.0),
361
+ Tabula::Cell.new(70.0, 70.0, 156.0, 4.0), Tabula::Cell.new(74.0, 70.0, 156.0, 6.0),
362
+ Tabula::Cell.new(84.0, 70.0, 2.0, 6.0), Tabula::Cell.new(90.0, 70.0, 156.0, 4.0),
363
+ Tabula::Cell.new(94.0, 70.0, 156.0, 6.0), Tabula::Cell.new(100.0, 70.0, 156.0, 28.0),
364
+ Tabula::Cell.new(128.0, 70.0, 156.0, 4.0), Tabula::Cell.new(132.0, 70.0, 156.0, 64.0),
365
+ Tabula::Cell.new(196.0, 70.0, 156.0, 66.0), Tabula::Cell.new(262.0, 70.0, 156.0, 4.0),
366
+ Tabula::Cell.new(266.0, 70.0, 156.0, 84.0), Tabula::Cell.new(350.0, 70.0, 156.0, 4.0),
367
+ Tabula::Cell.new(354.0, 70.0, 156.0, 32.0), Tabula::Cell.new(386.0, 70.0, 156.0, 38.0),
368
+ Tabula::Cell.new(424.0, 70.0, 156.0, 18.0), Tabula::Cell.new(442.0, 70.0, 156.0, 74.0),
369
+ Tabula::Cell.new(516.0, 70.0, 156.0, 28.0), Tabula::Cell.new(544.0, 70.0, 156.0, 4.0),
370
+ Tabula::Cell.new(84.0, 72.0, 446.0, 6.0), Tabula::Cell.new(90.0, 226.0, 176.0, 4.0),
371
+ Tabula::Cell.new(94.0, 226.0, 176.0, 6.0), Tabula::Cell.new(100.0, 226.0, 176.0, 28.0),
372
+ Tabula::Cell.new(128.0, 226.0, 176.0, 4.0), Tabula::Cell.new(132.0, 226.0, 176.0, 64.0),
373
+ Tabula::Cell.new(196.0, 226.0, 176.0, 66.0), Tabula::Cell.new(262.0, 226.0, 176.0, 4.0),
374
+ Tabula::Cell.new(266.0, 226.0, 176.0, 84.0), Tabula::Cell.new(350.0, 226.0, 176.0, 4.0),
375
+ Tabula::Cell.new(354.0, 226.0, 176.0, 32.0), Tabula::Cell.new(386.0, 226.0, 176.0, 38.0),
376
+ Tabula::Cell.new(424.0, 226.0, 176.0, 18.0), Tabula::Cell.new(442.0, 226.0, 176.0, 74.0),
377
+ Tabula::Cell.new(516.0, 226.0, 176.0, 28.0), Tabula::Cell.new(544.0, 226.0, 176.0, 4.0),
378
+ Tabula::Cell.new(90.0, 402.0, 116.0, 4.0), Tabula::Cell.new(94.0, 402.0, 116.0, 6.0),
379
+ Tabula::Cell.new(100.0, 402.0, 116.0, 28.0), Tabula::Cell.new(128.0, 402.0, 116.0, 4.0),
380
+ Tabula::Cell.new(132.0, 402.0, 116.0, 64.0), Tabula::Cell.new(196.0, 402.0, 116.0, 66.0),
381
+ Tabula::Cell.new(262.0, 402.0, 116.0, 4.0), Tabula::Cell.new(266.0, 402.0, 116.0, 84.0),
382
+ Tabula::Cell.new(350.0, 402.0, 116.0, 4.0), Tabula::Cell.new(354.0, 402.0, 116.0, 32.0),
383
+ Tabula::Cell.new(386.0, 402.0, 116.0, 38.0), Tabula::Cell.new(424.0, 402.0, 116.0, 18.0),
384
+ Tabula::Cell.new(442.0, 402.0, 116.0, 74.0), Tabula::Cell.new(516.0, 402.0, 116.0, 28.0),
385
+ Tabula::Cell.new(544.0, 402.0, 116.0, 4.0), Tabula::Cell.new(84.0, 518.0, 246.0, 6.0),
386
+ Tabula::Cell.new(90.0, 518.0, 186.0, 4.0), Tabula::Cell.new(94.0, 518.0, 186.0, 6.0),
387
+ Tabula::Cell.new(100.0, 518.0, 186.0, 28.0), Tabula::Cell.new(128.0, 518.0, 186.0, 4.0),
388
+ Tabula::Cell.new(132.0, 518.0, 186.0, 64.0), Tabula::Cell.new(196.0, 518.0, 186.0, 66.0),
389
+ Tabula::Cell.new(262.0, 518.0, 186.0, 4.0), Tabula::Cell.new(266.0, 518.0, 186.0, 84.0),
390
+ Tabula::Cell.new(350.0, 518.0, 186.0, 4.0), Tabula::Cell.new(354.0, 518.0, 186.0, 32.0),
391
+ Tabula::Cell.new(386.0, 518.0, 186.0, 38.0), Tabula::Cell.new(424.0, 518.0, 186.0, 18.0),
392
+ Tabula::Cell.new(442.0, 518.0, 186.0, 74.0), Tabula::Cell.new(516.0, 518.0, 186.0, 28.0),
393
+ Tabula::Cell.new(544.0, 518.0, 186.0, 4.0), Tabula::Cell.new(90.0, 704.0, 60.0, 4.0),
394
+ Tabula::Cell.new(94.0, 704.0, 60.0, 6.0), Tabula::Cell.new(100.0, 704.0, 60.0, 28.0),
395
+ Tabula::Cell.new(128.0, 704.0, 60.0, 4.0), Tabula::Cell.new(132.0, 704.0, 60.0, 64.0),
396
+ Tabula::Cell.new(196.0, 704.0, 60.0, 66.0), Tabula::Cell.new(262.0, 704.0, 60.0, 4.0),
397
+ Tabula::Cell.new(266.0, 704.0, 60.0, 84.0), Tabula::Cell.new(350.0, 704.0, 60.0, 4.0),
398
+ Tabula::Cell.new(354.0, 704.0, 60.0, 32.0), Tabula::Cell.new(386.0, 704.0, 60.0, 38.0),
399
+ Tabula::Cell.new(424.0, 704.0, 60.0, 18.0), Tabula::Cell.new(442.0, 704.0, 60.0, 74.0),
400
+ Tabula::Cell.new(516.0, 704.0, 60.0, 28.0), Tabula::Cell.new(544.0, 704.0, 60.0, 4.0),
401
+ Tabula::Cell.new(84.0, 764.0, 216.0, 6.0), Tabula::Cell.new(90.0, 764.0, 216.0, 4.0),
402
+ Tabula::Cell.new(94.0, 764.0, 216.0, 6.0), Tabula::Cell.new(100.0, 764.0, 216.0, 28.0),
403
+ Tabula::Cell.new(128.0, 764.0, 216.0, 4.0), Tabula::Cell.new(132.0, 764.0, 216.0, 64.0),
404
+ Tabula::Cell.new(196.0, 764.0, 216.0, 66.0), Tabula::Cell.new(262.0, 764.0, 216.0, 4.0),
405
+ Tabula::Cell.new(266.0, 764.0, 216.0, 84.0), Tabula::Cell.new(350.0, 764.0, 216.0, 4.0),
406
+ Tabula::Cell.new(354.0, 764.0, 216.0, 32.0), Tabula::Cell.new(386.0, 764.0, 216.0, 38.0),
407
+ Tabula::Cell.new(424.0, 764.0, 216.0, 18.0), Tabula::Cell.new(442.0, 764.0, 216.0, 74.0),
408
+ Tabula::Cell.new(516.0, 764.0, 216.0, 28.0), Tabula::Cell.new(544.0, 764.0, 216.0, 4.0)]
409
+
410
+
411
+ expected_spreadsheets = [Tabula::Spreadsheet.new(40.0, 18.0, 208.0, 40.0, nil, nil, nil, nil),
412
+ Tabula::Spreadsheet.new(84.0, 18.0, 962.0, 464.0,nil, nil, nil, nil)]
413
+
414
+ #compares spreadsheets on area only.
415
+ assert_equal expected_spreadsheets.map{|s| [s.x, s.y, s.width, s.height] },
416
+ SpreadsheetsHasCellsTester.new(cells).find_spreadsheets_from_cells.map{|a| s = a.getBounds; [s.x, s.y, s.width, s.height] }
417
+
418
+
419
+ end
420
+
421
+ def test_add_spanning_cells
422
+ skip "until I write it"
423
+ end
424
+
425
+ def test_add_placeholder_cells_to_funny_shaped_tables
426
+ skip "until I write it, cf 01005787B_Pakistan.pdf"
427
+ end
428
+
429
+ class CellsHasCellsTester
430
+ include Tabula::HasCells
431
+ attr_accessor :vertical_ruling_lines, :horizontal_ruling_lines, :cells
432
+ def initialize(vertical_ruling_lines, horizontal_ruling_lines)
433
+ @cells = []
434
+ @vertical_ruling_lines = vertical_ruling_lines
435
+ @horizontal_ruling_lines = horizontal_ruling_lines
436
+ find_cells!
437
+ end
438
+ end
439
+
440
+ #just tests the algorithm
441
+ def test_lines_to_cells
442
+ vertical_ruling_lines = [ Tabula::Ruling.new(40.0, 18.0, 0.0, 40.0),
443
+ Tabula::Ruling.new(44.0, 70.0, 0.0, 36.0),
444
+ Tabula::Ruling.new(40.0, 226.0, 0.0, 40.0)]
445
+
446
+ horizontal_ruling_lines = [ Tabula::Ruling.new(40.0, 18.0, 208.0, 0.0),
447
+ Tabula::Ruling.new(44.0, 18.0, 208.0, 0.0),
448
+ Tabula::Ruling.new(50.0, 18.0, 208.0, 0.0),
449
+ Tabula::Ruling.new(54.0, 18.0, 208.0, 0.0),
450
+ Tabula::Ruling.new(60.0, 18.0, 208.0, 0.0),
451
+ Tabula::Ruling.new(64.0, 18.0, 208.0, 0.0),
452
+ Tabula::Ruling.new(70.0, 18.0, 208.0, 0.0),
453
+ Tabula::Ruling.new(74.0, 18.0, 208.0, 0.0),
454
+ Tabula::Ruling.new(80.0, 18.0, 208.0, 0.0)]
455
+
456
+ expected_cells = [Tabula::Cell.new(40.0, 18.0, 208.0, 4.0), Tabula::Cell.new(44.0, 18.0, 52.0, 6.0),
457
+ Tabula::Cell.new(50.0, 18.0, 52.0, 4.0), Tabula::Cell.new(54.0, 18.0, 52.0, 6.0),
458
+ Tabula::Cell.new(60.0, 18.0, 52.0, 4.0), Tabula::Cell.new(64.0, 18.0, 52.0, 6.0),
459
+ Tabula::Cell.new(70.0, 18.0, 52.0, 4.0), Tabula::Cell.new(74.0, 18.0, 52.0, 6.0),
460
+ Tabula::Cell.new(44.0, 70.0, 156.0, 6.0), Tabula::Cell.new(50.0, 70.0, 156.0, 4.0),
461
+ Tabula::Cell.new(54.0, 70.0, 156.0, 6.0), Tabula::Cell.new(60.0, 70.0, 156.0, 4.0),
462
+ Tabula::Cell.new(64.0, 70.0, 156.0, 6.0), Tabula::Cell.new(70.0, 70.0, 156.0, 4.0),
463
+ Tabula::Cell.new(74.0, 70.0, 156.0, 6.0), ]
464
+
465
+ actual_cells = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines).cells
466
+ assert_equal Set.new(expected_cells), Set.new(actual_cells) #I don't care about order
143
467
  end
144
468
 
469
+ #this is the real deal!!
470
+ def test_extract_tabular_data_using_lines_and_spreadsheets
471
+ pdf_file_path = "./test/data/frx_2012_disclosure.pdf"
472
+ expected_data_path = "./test/data/frx_2012_disclosure.tsv"
473
+ expected = open(expected_data_path, 'r').read #.split("\n").map{|line| line.split("\t")}
474
+
475
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all).extract.each do |pdf_page|
476
+ spreadsheet = pdf_page.spreadsheets.first
477
+ assert_equal expected, spreadsheet.to_tsv
478
+ end
479
+ end
480
+
481
+ def test_cope_with_a_tableless_page
482
+ pdf_file_path = "./test/data/no_tables.pdf"
483
+
484
+ spreadsheets = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all, '',
485
+ :line_color_filter => lambda{|components| components.all?{|c| c < 0.1}}
486
+ ).extract.to_a.first.spreadsheets
487
+
488
+ assert_equal 0, spreadsheets.size
489
+ end
490
+
491
+ def test_spanning_cells
492
+ pdf_file_path = "./test/data/spanning_cells.pdf"
493
+ expected_data_path = "./test/data/spanning_cells.csv"
494
+ expected = open(expected_data_path, 'r').read
495
+
496
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
497
+ spreadsheet = pdf_page.spreadsheets.first
498
+ assert_equal expected, spreadsheet.to_csv
499
+ end
500
+ end
501
+
502
+ def test_almost_vertical_lines
503
+ pdf_file_path = "./test/data/puertos1.pdf"
504
+ top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
505
+ area = Tabula::ZoneEntity.new(top, left,
506
+ right - left, bottom - top)
507
+
508
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
509
+ rulings = Tabula::Ruling.crop_rulings_to_area(pdf_page.ruling_lines, area)
510
+ # TODO assertion not entirely correct, should do the trick for now
511
+ assert_equal 15, rulings.select(&:vertical?).count
512
+ end
513
+ end
514
+
515
+ def test_extract_spreadsheet_within_an_area
516
+ pdf_file_path = "./test/data/puertos1.pdf"
517
+ top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
518
+
519
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
520
+ area = pdf_page.get_area([top, left, bottom, right])
521
+ table = area.spreadsheets.first.to_a
522
+ assert_equal 15, table.length
523
+ assert_equal ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"], table.first
524
+ assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
525
+ end
526
+ end
527
+ end
528
+
529
+ class TestIsTabularHeuristic < Minitest::Test
530
+
531
+ EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', 'GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf']
532
+ NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf']
533
+
534
+ File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
535
+
536
+ def test_heuristic_detects_spreadsheets
537
+ EXPECTED_TO_BE_SPREADSHEET.each do |f|
538
+ path = File.expand_path('data/' + f, File.dirname(__FILE__))
539
+ extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
540
+ page = extractor.extract.first
541
+ page.get_ruling_lines!
542
+ assert page.is_tabular?
543
+ end
544
+ end
545
+
546
+ def test_heuristic_detects_non_spreadsheets
547
+ NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
548
+ path = File.expand_path('data/' + f, File.dirname(__FILE__))
549
+ extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
550
+ page = extractor.extract.first
551
+ page.get_ruling_lines!
552
+ assert !page.is_tabular?
553
+ end
554
+ end
555
+
556
+
557
+
145
558
 
146
559
  end