tabula-extractor 0.6.6-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
data/lib/tabula/version.rb
CHANGED
data/lib/tabula/writers.rb
CHANGED
@@ -5,9 +5,9 @@ module Tabula
|
|
5
5
|
module Writers
|
6
6
|
|
7
7
|
def Writers.CSV(lines, output=$stdout)
|
8
|
-
lines.each
|
8
|
+
lines.each do |l|
|
9
9
|
output.write CSV.generate_line(l.map(&:text), row_sep: "\r\n")
|
10
|
-
|
10
|
+
end
|
11
11
|
end
|
12
12
|
|
13
13
|
def Writers.JSON(lines, output=$stdout)
|
@@ -15,12 +15,11 @@ module Tabula
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def Writers.TSV(lines, output=$stdout)
|
18
|
-
lines.each
|
19
|
-
output.write(l.map(&:text)
|
20
|
-
|
18
|
+
lines.each do |l|
|
19
|
+
output.write CSV.generate_line(l.map(&:text), col_sep: "\t", row_sep: "\r\n")
|
20
|
+
end
|
21
21
|
end
|
22
22
|
|
23
|
-
|
24
23
|
def Writers.HTML(lines, output=$stdout)
|
25
24
|
raise "not implemented"
|
26
25
|
end
|
data/tabula-extractor.gemspec
CHANGED
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
|
|
24
24
|
s.add_development_dependency 'minitest'
|
25
25
|
s.add_development_dependency 'bundler', '>= 1.3.4'
|
26
26
|
s.add_development_dependency 'ruby-debug'
|
27
|
+
s.add_development_dependency 'pry'
|
27
28
|
|
28
29
|
s.add_runtime_dependency "trollop", ["~> 2.0"]
|
29
30
|
# s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,88 @@
|
|
1
|
+
FOREST LABORATORIES, INC. DISCLOSURE REPORT "" "" "" ""
|
2
|
+
Calendar Year - 2012 "" "" "" ""
|
3
|
+
Physician Related Entity (if applicable) City / State Purpose of Payment Amount ($USD) * **
|
4
|
+
AALAEI, BEHZAD "" HIGHLAND, IN MEALS $51.24
|
5
|
+
TOTAL "" "" "" $51.24
|
6
|
+
AAMODT, DENISE, E "" ALBUQUERQUE, NM MEALS $66.12
|
7
|
+
TOTAL "" "" "" $66.12
|
8
|
+
AANONSEN, DEBORAH, A "" STATEN ISLAND, NY MEALS $85.00
|
9
|
+
TOTAL "" "" "" $85.00
|
10
|
+
AARON, CAREN, T "" RICHMOND, VA EDUCATIONAL ITEMS $78.80
|
11
|
+
AARON, CAREN, T "" RICHMOND, VA MEALS $392.45
|
12
|
+
TOTAL "" "" "" $471.25
|
13
|
+
AARON, JOHN "" CLARKSVILLE, TN MEALS $20.39
|
14
|
+
TOTAL "" "" "" $20.39
|
15
|
+
AARON, JOSHUA, N "" WEST GROVE, PA MEALS $310.33
|
16
|
+
AARON, JOSHUA, N REGIONAL PULMONARY & SLEEP MEDICINE WEST GROVE, PA SPEAKING FEES $4,700.00
|
17
|
+
TOTAL "" "" "" $5,010.33
|
18
|
+
AARON, MAUREEN, M "" MARTINSVILLE, VA MEALS $193.67
|
19
|
+
TOTAL "" "" "" $193.67
|
20
|
+
AARON, MICHAEL, L "" WEST ISLIP, NY MEALS $19.50
|
21
|
+
TOTAL "" "" "" $19.50
|
22
|
+
AARON, MICHAEL, R "" BROOKLYN, NY MEALS $65.92
|
23
|
+
TOTAL "" "" "" $65.92
|
24
|
+
AARONS, MARK, G "" PINEHURST, NC MEALS $154.19
|
25
|
+
TOTAL "" "" "" $154.19
|
26
|
+
AARONSON, GARY, A "" PHILADELPHIA, PA MEALS $205.17
|
27
|
+
TOTAL "" "" "" $205.17
|
28
|
+
AARONSON, ROBERT, M "" TUCSON, AZ MEALS $24.38
|
29
|
+
TOTAL "" "" "" $24.38
|
30
|
+
AASHEIM, RICHARD, J "" GREENEVILLE, TN EDUCATIONAL ITEMS $2.27
|
31
|
+
AASHEIM, RICHARD, J "" GREENEVILLE, TN MEALS $100.76
|
32
|
+
TOTAL "" "" "" $103.03
|
33
|
+
AASMAA, SIRIKE, T "" MONTVILLE, NJ MEALS $53.33
|
34
|
+
TOTAL "" "" "" $53.33
|
35
|
+
AAZAMI, HESSAM "" GRANADA HILLS, CA MEALS $402.90
|
36
|
+
TOTAL "" "" "" $402.90
|
37
|
+
ABAABA, ABIEDU, C "" JACKSONVILLE, FL MEALS $13.49
|
38
|
+
TOTAL "" "" "" $13.49
|
39
|
+
ABABNEH, ALAELDIN, A "" KANSAS CITY, KS MEALS $10.31
|
40
|
+
TOTAL "" "" "" $10.31
|
41
|
+
ABAD, ANTONIO, A "" CORAL SPRINGS, FL MEALS $516.29
|
42
|
+
TOTAL "" "" "" $516.29
|
43
|
+
ABADEER, PETER, S "" NORMAL, IL MEALS $200.38
|
44
|
+
TOTAL "" "" "" $200.38
|
45
|
+
ABAD, ENZO, L "" MIAMI, FL MEALS $67.61
|
46
|
+
TOTAL "" "" "" $67.61
|
47
|
+
ABADIAN SHARIFABAD, MANOOCHEHR "" GRANADA HILLS, CA MEALS $12.37
|
48
|
+
TOTAL "" "" "" $12.37
|
49
|
+
ABADI, CHRISTOPHER, A "" WARWICK, RI MEALS $157.42
|
50
|
+
TOTAL "" "" "" $157.42
|
51
|
+
ABADIE, MARCUS, G "" ATHENS, TX MEALS $361.89
|
52
|
+
TOTAL "" "" "" $361.89
|
53
|
+
ABADI, JAMSHEED, S "" BROOKLYN, NY MEALS $363.40
|
54
|
+
TOTAL "" "" "" $363.40
|
55
|
+
ABADILLA, JUNE, E "" JACKSON, KY MEALS $105.33
|
56
|
+
TOTAL "" "" "" $105.33
|
57
|
+
ABAD, JOHN, P "" NEWARK, OH MEALS $347.64
|
58
|
+
TOTAL "" "" "" $347.64
|
59
|
+
ABAD, JOSE, F "" FOLSOM, CA MEALS $30.28
|
60
|
+
TOTAL "" "" "" $30.28
|
61
|
+
ABAD, REMEDIOS, D "" WILNINGTON, DE MEALS $26.85
|
62
|
+
TOTAL "" "" "" $26.85
|
63
|
+
ABAD, SO KIM, F "" WICHITA FALLS, TX MEALS $136.52
|
64
|
+
TOTAL "" "" "" $136.52
|
65
|
+
ABAD, ZOILO, R "" MIAMI, FL MEALS $93.83
|
66
|
+
TOTAL "" "" "" $93.83
|
67
|
+
ABALIHI, CAROL, N "" EL PASO, TX MEALS $88.48
|
68
|
+
TOTAL "" "" "" $88.48
|
69
|
+
ABALOS, ANNA, T "" ROSEVILLE, CA MEALS $178.60
|
70
|
+
TOTAL "" "" "" $178.60
|
71
|
+
ABALOS, ARTURO, Z "" DELANO, CA MEALS $48.06
|
72
|
+
TOTAL "" "" "" $48.06
|
73
|
+
ABALOS, JOSEPH, M "" SENECA, PA MEALS $39.03
|
74
|
+
TOTAL "" "" "" $39.03
|
75
|
+
ABANDO, JOSE, R "" DAYTONA BEACH, FL MEALS $83.44
|
76
|
+
TOTAL "" "" "" $83.44
|
77
|
+
ABANG, ANTHONY, E "" ELIZABETHTOWN, KY MEALS $12.62
|
78
|
+
TOTAL "" "" "" $12.62
|
79
|
+
ABAN, KENRIC, T "" SAN DIEGO, CA MEALS $11.91
|
80
|
+
TOTAL "" "" "" $11.91
|
81
|
+
ABAQUETA, ALVIN, Y "" CHARLOTTE, NC MEALS $233.71
|
82
|
+
TOTAL "" "" "" $233.71
|
83
|
+
ABARCA, SERGIO, O "" TOOELE, UT MEALS $159.58
|
84
|
+
TOTAL "" "" "" $159.58
|
85
|
+
ABARIKWU, CONSTANTIA, A "" PHOENIX, AZ MEALS $153.57
|
86
|
+
TOTAL "" "" "" $153.57
|
87
|
+
ABASHIDZE, TEAH, A "" CLEVELAND, OH MEALS $153.59
|
88
|
+
TOTAL "" "" "" $153.59
|
Binary file
|
Binary file
|
@@ -0,0 +1,21 @@
|
|
1
|
+
Improved operation scenario,"","","","",""
|
2
|
+
Volume servers in:,2007,2008,2009,2010,2011
|
3
|
+
Server closets,"1,505","1,580","1,643","1,673","1,689"
|
4
|
+
Server rooms,"1,512","1,586","1,646","1,677","1,693"
|
5
|
+
Localized data centers,"1,512","1,586","1,646","1,677","1,693"
|
6
|
+
Mid-tier data centers,"1,512","1,586","1,646","1,677","1,693"
|
7
|
+
Enterprise-class data centers,"1,512","1,586","1,646","1,677","1,693"
|
8
|
+
Best practice scenario,"","","","",""
|
9
|
+
Volume servers in:,2007,2008,2009,2010,2011
|
10
|
+
Server closets,"1,456","1,439","1,386","1,296","1,326"
|
11
|
+
Server rooms,"1,465","1,472","1,427","1,334","1,371"
|
12
|
+
Localized data centers,"1,465","1,471","1,426","1,334","1,371"
|
13
|
+
Mid-tier data centers,"1,465","1,471","1,426","1,334","1,371"
|
14
|
+
Enterprise-class data centers,"1,465","1,471","1,426","1,334","1,371"
|
15
|
+
State-of-the-art scenario,"","","","",""
|
16
|
+
Volume servers in:,2007,2008,2009,2010,2011
|
17
|
+
Server closets,"1,485","1,471","1,424","1,315","1,349"
|
18
|
+
Server rooms,"1,495","1,573","1,586","1,424","1,485"
|
19
|
+
Localized data centers,"1,495","1,572","1,585","1,424","1,485"
|
20
|
+
Mid-tier data centers,"1,495","1,572","1,585","1,424","1,485"
|
21
|
+
Enterprise-class data centers,"1,495","1,572","1,585","1,424","1,485"
|
Binary file
|
Binary file
|
File without changes
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/test/heuristic.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#a list of filenames and the correct answer
|
2
|
+
# no more bs.
|
3
|
+
require_relative '../lib/tabula'
|
4
|
+
|
5
|
+
|
6
|
+
should_use_spreadsheet = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "spreadsheet/*") ).map{|a| [a, true]}
|
7
|
+
should_use_original = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "original/*") ).map{|a| [a, false]}
|
8
|
+
|
9
|
+
correct = []
|
10
|
+
misclassified_as_original = []
|
11
|
+
misclassified_as_spreadsheet = []
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
def heuristic(page)
|
16
|
+
page.is_tabular?
|
17
|
+
end
|
18
|
+
|
19
|
+
(should_use_spreadsheet + should_use_original) .each do |filename, expected_to_be_tabular|
|
20
|
+
extractor = Tabula::Extraction::ObjectExtractor.new(filename, [1])
|
21
|
+
|
22
|
+
page = extractor.extract.first
|
23
|
+
page.get_ruling_lines!
|
24
|
+
# puts "#{File.basename(filename)} | #{expected_to_be_tabular}"
|
25
|
+
page_is_tabular = heuristic(page)
|
26
|
+
# puts ""
|
27
|
+
|
28
|
+
if page_is_tabular && expected_to_be_tabular || !page_is_tabular && !expected_to_be_tabular
|
29
|
+
correct << filename
|
30
|
+
elsif page_is_tabular && !expected_to_be_tabular
|
31
|
+
misclassified_as_spreadsheet << filename
|
32
|
+
elsif !page_is_tabular && expected_to_be_tabular
|
33
|
+
misclassified_as_original << filename
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
puts "#{correct.size} PDFs were correctly classified"
|
38
|
+
puts "#{misclassified_as_original.size + misclassified_as_spreadsheet.size} PDFs were incorrectly classified"
|
39
|
+
unless misclassified_as_spreadsheet.empty?
|
40
|
+
puts "#{misclassified_as_spreadsheet.size} PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"
|
41
|
+
misclassified_as_spreadsheet.each do |filename|
|
42
|
+
puts " - #{File.basename(filename)}"
|
43
|
+
end
|
44
|
+
end
|
45
|
+
unless misclassified_as_original.empty?
|
46
|
+
puts "#{misclassified_as_original.size} PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"
|
47
|
+
misclassified_as_original.each do |filename|
|
48
|
+
puts " - #{File.basename(filename)}"
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf --silent -o test.csv
|
2
|
+
bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf -o test.csv
|
3
|
+
bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv
|
4
|
+
bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv --format TSV
|
5
|
+
bin/tabula test/data/campaign_donors.pdf -o test.csv --columns 47,147,256,310,375,431,504 #columns should work
|
6
|
+
bin/tabula test/data/argentina_diputados_voting_record.pdf --guess -o test.csv --format TSV #should exclude guff
|
7
|
+
bin/tabula test/data/vertical_rulings_bug.pdf --area 250,0,325,1700 -o test.csv --format TSV #should be only a few lines
|
data/test/tests.rb
CHANGED
@@ -6,10 +6,109 @@ require_relative '../lib/tabula'
|
|
6
6
|
|
7
7
|
def lines_to_array(lines)
|
8
8
|
lines.map { |l|
|
9
|
-
l.map { |te| te.text }
|
9
|
+
l.map { |te| te.text.strip }
|
10
10
|
}
|
11
11
|
end
|
12
12
|
|
13
|
+
def lines_to_table(lines)
|
14
|
+
Tabula::Table.new_from_array(lines_to_array(lines))
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
# I don't want to pollute the "real" classes with a funny inspect method. Just for testing comparisons.
|
19
|
+
module Tabula
|
20
|
+
class Table
|
21
|
+
def inspect
|
22
|
+
"[" + lines.map(&:inspect).join(",") + "]"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
module Tabula
|
28
|
+
class Line
|
29
|
+
def inspect
|
30
|
+
@text_elements.map(&:text).inspect
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
class TestEntityComparability < Minitest::Test
|
37
|
+
def test_text_element_comparability
|
38
|
+
base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
|
39
|
+
|
40
|
+
two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
|
41
|
+
three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
|
42
|
+
four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
|
43
|
+
|
44
|
+
five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
|
45
|
+
six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
|
46
|
+
seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
|
47
|
+
assert_equal base, two
|
48
|
+
assert_equal base, three
|
49
|
+
assert_equal base, four
|
50
|
+
|
51
|
+
refute_equal base, five
|
52
|
+
refute_equal base, six
|
53
|
+
refute_equal base, seven
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_line_comparability
|
57
|
+
text_base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
|
58
|
+
|
59
|
+
text_two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
|
60
|
+
text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
|
61
|
+
text_four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
|
62
|
+
|
63
|
+
text_five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
|
64
|
+
text_six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
|
65
|
+
text_seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
|
66
|
+
line_base = Tabula::Line.new
|
67
|
+
line_base.text_elements = [text_base, text_two, text_three]
|
68
|
+
line_equal = Tabula::Line.new
|
69
|
+
line_equal.text_elements = [text_base, text_two, text_three]
|
70
|
+
line_equal_but_longer = Tabula::Line.new
|
71
|
+
line_equal_but_longer.text_elements = [text_base, text_two, text_three, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
|
72
|
+
line_unequal = Tabula::Line.new
|
73
|
+
line_unequal.text_elements = [text_base, text_two, text_three, text_five]
|
74
|
+
line_unequal_and_longer = Tabula::Line.new
|
75
|
+
line_unequal_and_longer.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
|
76
|
+
line_unequal_and_longer_and_different = Tabula::Line.new
|
77
|
+
line_unequal_and_longer_and_different.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, 'whatever']
|
78
|
+
|
79
|
+
assert_equal line_base, line_equal
|
80
|
+
assert_equal line_base, line_equal_but_longer
|
81
|
+
refute_equal line_base, line_unequal
|
82
|
+
refute_equal line_base, line_unequal_and_longer
|
83
|
+
refute_equal line_base, line_unequal_and_longer_and_different
|
84
|
+
end
|
85
|
+
|
86
|
+
def test_table_comparability
|
87
|
+
rows_base = [["a", "b", "c"], ['', 'd', '']]
|
88
|
+
rows_equal = [["a", "b", "c"], ['', 'd']]
|
89
|
+
rows_equal_padded = [['', "a", "b", "c"], ['', '', 'd']]
|
90
|
+
rows_unequal_one = [["a", "b", "c"], ['d']]
|
91
|
+
rows_unequal_two = [["a", "b", "c"], ['d', '']]
|
92
|
+
rows_unequal_three = [["a", "b", "c"], ['d'], ['a','b', 'd']]
|
93
|
+
rows_unequal_four = [["a", "b", "c"]]
|
94
|
+
|
95
|
+
table_base = Tabula::Table.new_from_array(rows_base)
|
96
|
+
table_equal = Tabula::Table.new_from_array(rows_equal)
|
97
|
+
table_equal_column_padded = Tabula::Table.new_from_array(rows_equal_padded)
|
98
|
+
table_unequal_one = Tabula::Table.new_from_array(rows_unequal_one)
|
99
|
+
table_unequal_two = Tabula::Table.new_from_array(rows_unequal_two)
|
100
|
+
table_unequal_three = Tabula::Table.new_from_array(rows_unequal_three)
|
101
|
+
table_unequal_four = Tabula::Table.new_from_array(rows_unequal_four)
|
102
|
+
|
103
|
+
assert_equal table_base, table_equal
|
104
|
+
assert_equal table_base, table_equal_column_padded
|
105
|
+
refute_equal table_base, table_unequal_one
|
106
|
+
refute_equal table_base, table_unequal_two
|
107
|
+
refute_equal table_base, table_unequal_three
|
108
|
+
refute_equal table_base, table_unequal_four
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
13
112
|
class TestPagesInfoExtractor < Minitest::Test
|
14
113
|
def test_pages_info_extractor
|
15
114
|
extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
@@ -24,42 +123,74 @@ class TestPagesInfoExtractor < Minitest::Test
|
|
24
123
|
end
|
25
124
|
|
26
125
|
class TestTableGuesser < Minitest::Test
|
126
|
+
def test_find_rects_from_lines_with_lsd
|
127
|
+
skip "Skipping until we actually use LSD"
|
128
|
+
filename = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
129
|
+
page_index = 0
|
130
|
+
lines = Tabula::Extraction::LineExtractor.lines_in_pdf_page(filename, page_index, :render_pdf => true)
|
131
|
+
|
132
|
+
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
133
|
+
page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
|
134
|
+
expected_page_areas = [[54.087890625, 50.203125, 734.220703125, 550.44140625]]
|
135
|
+
assert_equal expected_page_areas, page_areas
|
136
|
+
end
|
137
|
+
|
27
138
|
end
|
28
139
|
|
29
140
|
class TestDumper < Minitest::Test
|
30
141
|
|
31
142
|
def test_extractor
|
32
|
-
extractor = Tabula::Extraction::
|
33
|
-
page = extractor.extract.
|
143
|
+
extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
144
|
+
page = extractor.extract.next
|
34
145
|
assert_instance_of Tabula::Page, page
|
35
146
|
end
|
36
147
|
|
37
148
|
def test_get_by_area
|
38
|
-
|
39
|
-
# http://localhost:8080/debug/418b1d5698e5c7b724551d9610c071ab3063275c/characters?x1=57.921428571428564&x2=290.7&y1=107.1&y2=394.52142857142854&page=1&use_lines=false
|
40
|
-
extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
149
|
+
extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
41
150
|
characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
|
42
151
|
assert_equal characters.size, 206
|
43
152
|
end
|
44
153
|
end
|
45
154
|
|
155
|
+
class TestRulingIntersection < Minitest::Test
|
156
|
+
def test_ruling_intersection
|
157
|
+
horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
|
158
|
+
verticals = [Tabula::Ruling.new(1, 3, 0, 11),
|
159
|
+
Tabula::Ruling.new(1, 4, 0, 11)]
|
160
|
+
ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
|
161
|
+
assert_equal 2, ints.size
|
162
|
+
assert_equal ints[0][0].getX, 3.0
|
163
|
+
assert_equal ints[0][0].getY, 10.0
|
164
|
+
assert_equal ints[1][0].getX, 4.0
|
165
|
+
assert_equal ints[1][0].getY, 10.0
|
166
|
+
|
167
|
+
verticals = [Tabula::Ruling.new(20, 3, 0, 11)]
|
168
|
+
ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
|
169
|
+
assert_equal ints.size, 0
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
46
173
|
class TestExtractor < Minitest::Test
|
47
174
|
|
48
175
|
def test_table_extraction_1
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
176
|
+
table = lines_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
|
177
|
+
1,
|
178
|
+
[107.1, 57.9214, 394.5214, 290.7],
|
179
|
+
:detect_ruling_lines => false)
|
180
|
+
|
181
|
+
expected = [["Prior Scale","New Scale","% Rank*"], ["800","170","99"], ["790","170","99"], ["780","170","99"], ["770","170","99"], ["760","170","99"], ["750","169","99"], ["740","169","99"], ["730","168","98"], ["720","168","98"], ["710","167","97"], ["700","166","96"], ["690","165","95"], ["680","165","95"], ["670","164","93"], ["660","164","93"], ["650","163","91"]]
|
182
|
+
|
53
183
|
assert_equal expected, table
|
54
184
|
end
|
55
185
|
|
56
186
|
def test_diputados_voting_record
|
57
|
-
|
58
|
-
|
187
|
+
table = lines_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
|
188
|
+
1,
|
189
|
+
[269.875, 12.75, 790.5, 561])
|
59
190
|
|
60
191
|
expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
|
61
192
|
|
62
|
-
assert_equal expected,
|
193
|
+
assert_equal expected, table
|
63
194
|
end
|
64
195
|
|
65
196
|
def test_forest_disclosure_report_dont_regress
|
@@ -67,80 +198,362 @@ class TestExtractor < Minitest::Test
|
|
67
198
|
# test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
|
68
199
|
# and a solution for half-x-height-offset lines.
|
69
200
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
70
|
-
character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
|
71
|
-
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
72
|
-
vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
|
73
201
|
|
202
|
+
table = lines_to_table Tabula.extract_table(pdf_file_path,
|
203
|
+
1,
|
204
|
+
[106.01, 48.09, 227.31, 551.89],
|
205
|
+
:detect_ruling_lines => true)
|
74
206
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
|
80
|
-
['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
|
81
|
-
['TOTAL', '', '', '', '$471.25'],
|
82
|
-
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
|
83
|
-
['TOTAL', '', '', '','$20.39'],
|
84
|
-
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
|
85
|
-
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
86
|
-
['TOTAL', '', '', '', '$5,010.33'],
|
87
|
-
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
|
88
|
-
['TOTAL', '', '', '', '$193.67'],
|
89
|
-
['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
|
90
|
-
|
91
|
-
assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
|
207
|
+
expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
|
208
|
+
|
209
|
+
|
210
|
+
assert_equal expected, table
|
92
211
|
end
|
93
212
|
|
94
213
|
def test_missing_spaces_around_an_ampersand
|
95
214
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
96
|
-
character_extractor = Tabula::Extraction::
|
97
|
-
|
98
|
-
|
215
|
+
character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
|
216
|
+
page_obj = character_extractor.extract.next
|
217
|
+
lines = page_obj.ruling_lines
|
218
|
+
vertical_rulings = lines.select(&:vertical?)
|
99
219
|
|
220
|
+
area = [170, 28, 185, 833] #top left bottom right
|
100
221
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
222
|
+
expected = Tabula::Table.new_from_array([
|
223
|
+
["", "REGIONAL PULMONARY & SLEEP",],
|
224
|
+
["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
|
225
|
+
["", "MEDICINE", ],
|
226
|
+
])
|
106
227
|
|
107
|
-
assert_equal expected,
|
228
|
+
assert_equal expected, lines_to_table(page_obj.get_area(area).make_table(:vertical_rulings => vertical_rulings))
|
108
229
|
end
|
109
230
|
|
110
231
|
def test_forest_disclosure_report
|
111
232
|
skip "Skipping until we support multiline cells"
|
112
233
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
113
|
-
character_extractor = Tabula::Extraction::
|
234
|
+
character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
|
114
235
|
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
115
|
-
vertical_rulings = lines.select(&:vertical?)
|
236
|
+
vertical_rulings = lines.select(&:vertical?) #.uniq{|line| (line.left / 10).round }
|
116
237
|
|
117
|
-
|
238
|
+
page_obj = character_extractor.extract.next
|
239
|
+
characters = page_obj.get_text([110, 28, 218, 833])
|
118
240
|
#top left bottom right
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
241
|
+
expected = Tabula::Table.new_from_array([
|
242
|
+
['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '', '$85.00'],
|
243
|
+
['TOTAL', '', '', '','$85.00'],
|
244
|
+
['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '', '$78.80'],
|
245
|
+
['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '', '$392.45'],
|
246
|
+
['TOTAL', '', '', '', '$471.25'],
|
247
|
+
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '', '$20.39'],
|
248
|
+
['TOTAL', '', '', '','$20.39'],
|
249
|
+
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '', '$310.33'],
|
250
|
+
['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '', '$4,700.00'],
|
251
|
+
['TOTAL', '', '', '', '$5,010.33'],
|
252
|
+
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '', '$193.67'],
|
253
|
+
['TOTAL', '', '', '', '$193.67'],
|
254
|
+
['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '', '$19.50']
|
255
|
+
])
|
256
|
+
|
257
|
+
assert_equal expected, lines_to_table(Tabula.make_table(characters, :vertical_rulings => vertical_rulings))
|
134
258
|
end
|
135
259
|
|
136
260
|
# TODO Spaces inserted in words - fails
|
137
261
|
def test_bo_page24
|
138
|
-
|
139
|
-
|
262
|
+
table = lines_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
|
263
|
+
1,
|
264
|
+
[425.625, 53.125, 575.714, 810.535],
|
265
|
+
:detect_ruling_lines => false)
|
140
266
|
|
141
267
|
expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
|
142
|
-
|
268
|
+
|
269
|
+
assert_equal expected, table
|
270
|
+
end
|
271
|
+
|
272
|
+
|
273
|
+
def test_vertical_rulings_splitting_words
  # If a vertical ruling crosses over a word, the word should be split at that
  # vertical ruling. Before, the entire word would end up on one side of it.
  pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))

  # N.B. it's "MORGANTOWN", "WV" that we're most interested in here
  # (it used to show up as ["MORGANTOWNWV", "", ""]).
  expected = Tabula::Table.new_from_array([
    ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
    ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
    ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
  ])
  # A second layout is semantically just as "correct"; it differs only in how
  # the multi-line cell is handled. It is recorded here for reference and is
  # deliberately not asserted:
  #   ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
  #   ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
  #   ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH", "", "", "$22.93", "", "", "$22.93"],
  #   ["", "", "", "ABRAHAMSON"]

  page_area = [250, 0, 325, 1700]
  # Ruling positions are given on a 1700-unit-wide reference and rescaled to
  # each page's actual width below.
  ruling_positions = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548]

  pages_checked = 0
  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2)
  extractor.extract.each do |pdf_page|
    # Divide by a Float so the factor cannot be truncated to 0 should width
    # ever be an Integer; the result is unchanged when width is a Float.
    scale_factor = pdf_page.width / 1700.0

    vertical_rulings = ruling_positions.map { |n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000) }
    table = pdf_page.get_area(page_area).make_table(:vertical_rulings => vertical_rulings)

    assert_equal expected, lines_to_table(table)
    pages_checked += 1
  end

  # Without this guard the test would pass silently if no page were yielded.
  assert pages_checked > 0, "extractor yielded no pages for #{pdf_file_path}"
end
|
309
|
+
|
310
|
+
def test_vertical_rulings_prevent_merging_of_columns
  # Extracts an area of data/campaign_donors.pdf with explicitly supplied
  # vertical rulings and checks that each expected column stays separate.
  ruling_positions = [47, 147, 256, 310, 375, 431, 504]
  vertical_rulings = ruling_positions.map do |position|
    Tabula::Ruling.new(0, position, 0, 1000)
  end

  expected_rows = [
    ["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
    ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"],
    ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]
  ]

  pdf_path = File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__))
  extracted = Tabula.extract_table(pdf_path,
                                   1,
                                   [255.57, 40.43, 398.76, 557.35],
                                   :vertical_rulings => vertical_rulings)

  assert_equal expected_rows, lines_to_array(extracted)
end
|
322
|
+
|
323
|
+
def test_get_spacing_and_merging_right
  # Extracts an area of data/strongschools.pdf with ruling-line detection
  # enabled and compares every cell against the expected rows (note that one
  # expected value intentionally keeps its trailing space).
  pdf_path = File.expand_path('data/strongschools.pdf', File.dirname(__FILE__))
  area = [52.32857142857143, 15.557142857142859, 128.70000000000002, 767.9571428571429]
  actual_rows = lines_to_array(Tabula.extract_table(pdf_path, 1, area, :detect_ruling_lines => true))

  expected_rows = [
    ["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"],
    ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"],
    ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"],
    ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"],
    ["Schumacher / Bales", "Douglas L. / Patricia ", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"],
    ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"],
    ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]
  ]

  assert_equal expected_rows, actual_rows
end
|
334
|
+
|
335
|
+
|
336
|
+
# Minimal host for the Tabula::HasCells mixin: it only stores a list of cells
# so the mixin's methods can be exercised from a test.
class SpreadsheetsHasCellsTester
  include Tabula::HasCells

  attr_accessor :cells

  # cells - the collection exposed through the +cells+ accessor.
  def initialize(cells)
    self.cells = cells
  end
end
|
343
|
+
|
344
|
+
#just tests the algorithm
|
345
|
+
def test_cells_to_spreadsheets
  # Each tuple holds the four constructor arguments of one Tabula::Cell, in
  # the order they are passed to Tabula::Cell.new.
  cell_specs = [
    [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0],
    [50.0, 18.0, 52.0, 4.0], [54.0, 18.0, 52.0, 6.0],
    [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
    [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0],
    [90.0, 18.0, 52.0, 4.0], [94.0, 18.0, 52.0, 6.0],
    [100.0, 18.0, 52.0, 28.0], [128.0, 18.0, 52.0, 4.0],
    [132.0, 18.0, 52.0, 64.0], [196.0, 18.0, 52.0, 66.0],
    [262.0, 18.0, 52.0, 4.0], [266.0, 18.0, 52.0, 84.0],
    [350.0, 18.0, 52.0, 4.0], [354.0, 18.0, 52.0, 32.0],
    [386.0, 18.0, 52.0, 38.0], [424.0, 18.0, 52.0, 18.0],
    [442.0, 18.0, 52.0, 74.0], [516.0, 18.0, 52.0, 28.0],
    [544.0, 18.0, 52.0, 4.0], [44.0, 70.0, 156.0, 6.0],
    [50.0, 70.0, 156.0, 4.0], [54.0, 70.0, 156.0, 6.0],
    [60.0, 70.0, 156.0, 4.0], [64.0, 70.0, 156.0, 6.0],
    [70.0, 70.0, 156.0, 4.0], [74.0, 70.0, 156.0, 6.0],
    [84.0, 70.0, 2.0, 6.0], [90.0, 70.0, 156.0, 4.0],
    [94.0, 70.0, 156.0, 6.0], [100.0, 70.0, 156.0, 28.0],
    [128.0, 70.0, 156.0, 4.0], [132.0, 70.0, 156.0, 64.0],
    [196.0, 70.0, 156.0, 66.0], [262.0, 70.0, 156.0, 4.0],
    [266.0, 70.0, 156.0, 84.0], [350.0, 70.0, 156.0, 4.0],
    [354.0, 70.0, 156.0, 32.0], [386.0, 70.0, 156.0, 38.0],
    [424.0, 70.0, 156.0, 18.0], [442.0, 70.0, 156.0, 74.0],
    [516.0, 70.0, 156.0, 28.0], [544.0, 70.0, 156.0, 4.0],
    [84.0, 72.0, 446.0, 6.0], [90.0, 226.0, 176.0, 4.0],
    [94.0, 226.0, 176.0, 6.0], [100.0, 226.0, 176.0, 28.0],
    [128.0, 226.0, 176.0, 4.0], [132.0, 226.0, 176.0, 64.0],
    [196.0, 226.0, 176.0, 66.0], [262.0, 226.0, 176.0, 4.0],
    [266.0, 226.0, 176.0, 84.0], [350.0, 226.0, 176.0, 4.0],
    [354.0, 226.0, 176.0, 32.0], [386.0, 226.0, 176.0, 38.0],
    [424.0, 226.0, 176.0, 18.0], [442.0, 226.0, 176.0, 74.0],
    [516.0, 226.0, 176.0, 28.0], [544.0, 226.0, 176.0, 4.0],
    [90.0, 402.0, 116.0, 4.0], [94.0, 402.0, 116.0, 6.0],
    [100.0, 402.0, 116.0, 28.0], [128.0, 402.0, 116.0, 4.0],
    [132.0, 402.0, 116.0, 64.0], [196.0, 402.0, 116.0, 66.0],
    [262.0, 402.0, 116.0, 4.0], [266.0, 402.0, 116.0, 84.0],
    [350.0, 402.0, 116.0, 4.0], [354.0, 402.0, 116.0, 32.0],
    [386.0, 402.0, 116.0, 38.0], [424.0, 402.0, 116.0, 18.0],
    [442.0, 402.0, 116.0, 74.0], [516.0, 402.0, 116.0, 28.0],
    [544.0, 402.0, 116.0, 4.0], [84.0, 518.0, 246.0, 6.0],
    [90.0, 518.0, 186.0, 4.0], [94.0, 518.0, 186.0, 6.0],
    [100.0, 518.0, 186.0, 28.0], [128.0, 518.0, 186.0, 4.0],
    [132.0, 518.0, 186.0, 64.0], [196.0, 518.0, 186.0, 66.0],
    [262.0, 518.0, 186.0, 4.0], [266.0, 518.0, 186.0, 84.0],
    [350.0, 518.0, 186.0, 4.0], [354.0, 518.0, 186.0, 32.0],
    [386.0, 518.0, 186.0, 38.0], [424.0, 518.0, 186.0, 18.0],
    [442.0, 518.0, 186.0, 74.0], [516.0, 518.0, 186.0, 28.0],
    [544.0, 518.0, 186.0, 4.0], [90.0, 704.0, 60.0, 4.0],
    [94.0, 704.0, 60.0, 6.0], [100.0, 704.0, 60.0, 28.0],
    [128.0, 704.0, 60.0, 4.0], [132.0, 704.0, 60.0, 64.0],
    [196.0, 704.0, 60.0, 66.0], [262.0, 704.0, 60.0, 4.0],
    [266.0, 704.0, 60.0, 84.0], [350.0, 704.0, 60.0, 4.0],
    [354.0, 704.0, 60.0, 32.0], [386.0, 704.0, 60.0, 38.0],
    [424.0, 704.0, 60.0, 18.0], [442.0, 704.0, 60.0, 74.0],
    [516.0, 704.0, 60.0, 28.0], [544.0, 704.0, 60.0, 4.0],
    [84.0, 764.0, 216.0, 6.0], [90.0, 764.0, 216.0, 4.0],
    [94.0, 764.0, 216.0, 6.0], [100.0, 764.0, 216.0, 28.0],
    [128.0, 764.0, 216.0, 4.0], [132.0, 764.0, 216.0, 64.0],
    [196.0, 764.0, 216.0, 66.0], [262.0, 764.0, 216.0, 4.0],
    [266.0, 764.0, 216.0, 84.0], [350.0, 764.0, 216.0, 4.0],
    [354.0, 764.0, 216.0, 32.0], [386.0, 764.0, 216.0, 38.0],
    [424.0, 764.0, 216.0, 18.0], [442.0, 764.0, 216.0, 74.0],
    [516.0, 764.0, 216.0, 28.0], [544.0, 764.0, 216.0, 4.0]
  ]
  cells = cell_specs.map { |spec| Tabula::Cell.new(*spec) }

  # Spreadsheets are compared on area only, so both sides are reduced to
  # [x, y, width, height] before the comparison.
  expected_bounds = [
    Tabula::Spreadsheet.new(40.0, 18.0, 208.0, 40.0, nil, nil, nil, nil),
    Tabula::Spreadsheet.new(84.0, 18.0, 962.0, 464.0, nil, nil, nil, nil)
  ].map { |sheet| [sheet.x, sheet.y, sheet.width, sheet.height] }

  found_areas = SpreadsheetsHasCellsTester.new(cells).find_spreadsheets_from_cells
  actual_bounds = found_areas.map do |found|
    bounds = found.getBounds
    [bounds.x, bounds.y, bounds.width, bounds.height]
  end

  assert_equal expected_bounds, actual_bounds
end
|
420
|
+
|
421
|
+
def test_add_spanning_cells
  # Placeholder: reported as skipped until the test body exists.
  skip("until I write it")
end
|
424
|
+
|
425
|
+
def test_add_placeholder_cells_to_funny_shaped_tables
  # Placeholder: reported as skipped until the test body exists.
  skip("until I write it, cf 01005787B_Pakistan.pdf")
end
|
428
|
+
|
429
|
+
# Minimal host for the Tabula::HasCells mixin: it stores the two sets of
# ruling lines, starts with an empty cell list, and runs find_cells! on
# construction so the result can be read back through +cells+.
class CellsHasCellsTester
  include Tabula::HasCells

  attr_accessor :vertical_ruling_lines, :horizontal_ruling_lines, :cells

  def initialize(vertical_ruling_lines, horizontal_ruling_lines)
    @vertical_ruling_lines = vertical_ruling_lines
    @horizontal_ruling_lines = horizontal_ruling_lines
    @cells = []
    # All three instance variables are assigned before the mixin runs.
    find_cells!
  end
end
|
439
|
+
|
440
|
+
#just tests the algorithm
|
441
|
+
def test_lines_to_cells
  # Builds rulings from raw constructor arguments, lets CellsHasCellsTester
  # derive cells from them, and compares the cells as sets.
  vertical_ruling_lines = [
    [40.0, 18.0, 0.0, 40.0],
    [44.0, 70.0, 0.0, 36.0],
    [40.0, 226.0, 0.0, 40.0]
  ].map { |spec| Tabula::Ruling.new(*spec) }

  # The horizontal rulings differ only in their first argument.
  horizontal_ruling_lines = [40.0, 44.0, 50.0, 54.0, 60.0, 64.0, 70.0, 74.0, 80.0].map do |first_arg|
    Tabula::Ruling.new(first_arg, 18.0, 208.0, 0.0)
  end

  expected_cells = [
    [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0],
    [50.0, 18.0, 52.0, 4.0], [54.0, 18.0, 52.0, 6.0],
    [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
    [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0],
    [44.0, 70.0, 156.0, 6.0], [50.0, 70.0, 156.0, 4.0],
    [54.0, 70.0, 156.0, 6.0], [60.0, 70.0, 156.0, 4.0],
    [64.0, 70.0, 156.0, 6.0], [70.0, 70.0, 156.0, 4.0],
    [74.0, 70.0, 156.0, 6.0]
  ].map { |spec| Tabula::Cell.new(*spec) }

  tester = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines)
  actual_cells = tester.cells

  # Order is irrelevant, hence the Set comparison.
  assert_equal Set.new(expected_cells), Set.new(actual_cells)
end
|
144
468
|
|
469
|
+
#this is the real deal!!
|
470
|
+
def test_extract_tabular_data_using_lines_and_spreadsheets
  # For every page of the PDF, the first detected spreadsheet rendered as TSV
  # must match the contents of the expected .tsv fixture exactly.
  pdf_file_path = "./test/data/frx_2012_disclosure.pdf"
  expected_data_path = "./test/data/frx_2012_disclosure.tsv"
  # File.read closes the file itself; open(...).read left the handle open
  # until garbage collection.
  expected = File.read(expected_data_path)

  pages_checked = 0
  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all).extract.each do |pdf_page|
    spreadsheet = pdf_page.spreadsheets.first
    assert_equal expected, spreadsheet.to_tsv
    pages_checked += 1
  end

  # Without this guard the test would pass silently if no page were yielded.
  assert pages_checked > 0, "extractor yielded no pages for #{pdf_file_path}"
end
|
480
|
+
|
481
|
+
def test_cope_with_a_tableless_page
  # The first extracted page of no_tables.pdf must report zero spreadsheets.
  pdf_file_path = "./test/data/no_tables.pdf"

  # Filter passed as :line_color_filter; true only when every colour
  # component is below 0.1.
  dark_lines_only = lambda { |components| components.all? { |c| c < 0.1 } }

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path,
                                                      :all,
                                                      '',
                                                      :line_color_filter => dark_lines_only)
  first_page = extractor.extract.to_a.first
  spreadsheets = first_page.spreadsheets

  assert_equal 0, spreadsheets.size
end
|
490
|
+
|
491
|
+
def test_spanning_cells
  # The first spreadsheet on page 1 of spanning_cells.pdf, rendered as CSV,
  # must match the expected .csv fixture exactly.
  pdf_file_path = "./test/data/spanning_cells.pdf"
  expected_data_path = "./test/data/spanning_cells.csv"
  # File.read closes the file itself; open(...).read left the handle open
  # until garbage collection.
  expected = File.read(expected_data_path)

  pages_checked = 0
  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
    spreadsheet = pdf_page.spreadsheets.first
    assert_equal expected, spreadsheet.to_csv
    pages_checked += 1
  end

  # Without this guard the test would pass silently if no page were yielded.
  assert pages_checked > 0, "extractor yielded no pages for #{pdf_file_path}"
end
|
501
|
+
|
502
|
+
def test_almost_vertical_lines
  # Crops the page's ruling lines to a fixed area of puertos1.pdf and checks
  # how many of the remaining rulings report themselves as vertical.
  pdf_file_path = "./test/data/puertos1.pdf"
  top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
  # ZoneEntity is built from top, left, width and height here.
  area = Tabula::ZoneEntity.new(top, left,
                                right - left, bottom - top)

  pages_checked = 0
  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
    rulings = Tabula::Ruling.crop_rulings_to_area(pdf_page.ruling_lines, area)
    # TODO assertion not entirely correct, should do the trick for now
    assert_equal 15, rulings.select(&:vertical?).count
    pages_checked += 1
  end

  # Without this guard the test would pass silently if no page were yielded.
  assert pages_checked > 0, "extractor yielded no pages for #{pdf_file_path}"
end
|
514
|
+
|
515
|
+
def test_extract_spreadsheet_within_an_area
  # Restricts page 1 of puertos1.pdf to a fixed area and checks the row count
  # plus the first and last rows of the first spreadsheet found there.
  pdf_file_path = "./test/data/puertos1.pdf"
  top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286

  pages_checked = 0
  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
    area = pdf_page.get_area([top, left, bottom, right])
    table = area.spreadsheets.first.to_a
    assert_equal 15, table.length
    assert_equal ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"], table.first
    assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
    pages_checked += 1
  end

  # Without this guard the test would pass silently if no page were yielded.
  assert pages_checked > 0, "extractor yielded no pages for #{pdf_file_path}"
end
|
527
|
+
end
|
528
|
+
|
529
|
+
# Checks Page#is_tabular? against fixture PDFs whose first page is, or is not,
# expected to be classified as a spreadsheet.
class TestIsTabularHeuristic < Minitest::Test

  # Fixture file names, resolved relative to this file's data/ directory.
  EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', 'GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf'].freeze
  NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf'].freeze

  def test_heuristic_detects_spreadsheets
    EXPECTED_TO_BE_SPREADSHEET.each do |f|
      # The message names the fixture so a failure points at the right file.
      assert first_page_tabular?(f), "expected #{f} to be detected as tabular"
    end
  end

  def test_heuristic_detects_non_spreadsheets
    NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
      refute first_page_tabular?(f), "expected #{f} not to be detected as tabular"
    end
  end

  private

  # Extracts page 1 of the named fixture, loads its ruling lines and returns
  # the result of Page#is_tabular?.
  def first_page_tabular?(filename)
    path = File.expand_path('data/' + filename, File.dirname(__FILE__))
    page = Tabula::Extraction::ObjectExtractor.new(path, [1]).extract.first
    page.get_ruling_lines!
    page.is_tabular?
  end
end
|