tabula-extractor 0.6.6-java → 0.7.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
data/lib/tabula/version.rb
CHANGED
data/lib/tabula/writers.rb
CHANGED
@@ -5,9 +5,9 @@ module Tabula
|
|
5
5
|
module Writers
|
6
6
|
|
7
7
|
def Writers.CSV(lines, output=$stdout)
|
8
|
-
lines.each
|
8
|
+
lines.each do |l|
|
9
9
|
output.write CSV.generate_line(l.map(&:text), row_sep: "\r\n")
|
10
|
-
|
10
|
+
end
|
11
11
|
end
|
12
12
|
|
13
13
|
def Writers.JSON(lines, output=$stdout)
|
@@ -15,12 +15,11 @@ module Tabula
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def Writers.TSV(lines, output=$stdout)
|
18
|
-
lines.each
|
19
|
-
output.write(l.map(&:text)
|
20
|
-
|
18
|
+
lines.each do |l|
|
19
|
+
output.write CSV.generate_line(l.map(&:text), col_sep: "\t", row_sep: "\r\n")
|
20
|
+
end
|
21
21
|
end
|
22
22
|
|
23
|
-
|
24
23
|
def Writers.HTML(lines, output=$stdout)
|
25
24
|
raise "not implemented"
|
26
25
|
end
|
data/tabula-extractor.gemspec
CHANGED
@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
|
|
24
24
|
s.add_development_dependency 'minitest'
|
25
25
|
s.add_development_dependency 'bundler', '>= 1.3.4'
|
26
26
|
s.add_development_dependency 'ruby-debug'
|
27
|
+
s.add_development_dependency 'pry'
|
27
28
|
|
28
29
|
s.add_runtime_dependency "trollop", ["~> 2.0"]
|
29
30
|
# s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,88 @@
|
|
1
|
+
FOREST LABORATORIES, INC. DISCLOSURE REPORT "" "" "" ""
|
2
|
+
Calendar Year - 2012 "" "" "" ""
|
3
|
+
Physician Related Entity (if applicable) City / State Purpose of Payment Amount ($USD) * **
|
4
|
+
AALAEI, BEHZAD "" HIGHLAND, IN MEALS $51.24
|
5
|
+
TOTAL "" "" "" $51.24
|
6
|
+
AAMODT, DENISE, E "" ALBUQUERQUE, NM MEALS $66.12
|
7
|
+
TOTAL "" "" "" $66.12
|
8
|
+
AANONSEN, DEBORAH, A "" STATEN ISLAND, NY MEALS $85.00
|
9
|
+
TOTAL "" "" "" $85.00
|
10
|
+
AARON, CAREN, T "" RICHMOND, VA EDUCATIONAL ITEMS $78.80
|
11
|
+
AARON, CAREN, T "" RICHMOND, VA MEALS $392.45
|
12
|
+
TOTAL "" "" "" $471.25
|
13
|
+
AARON, JOHN "" CLARKSVILLE, TN MEALS $20.39
|
14
|
+
TOTAL "" "" "" $20.39
|
15
|
+
AARON, JOSHUA, N "" WEST GROVE, PA MEALS $310.33
|
16
|
+
AARON, JOSHUA, N REGIONAL PULMONARY & SLEEP MEDICINE WEST GROVE, PA SPEAKING FEES $4,700.00
|
17
|
+
TOTAL "" "" "" $5,010.33
|
18
|
+
AARON, MAUREEN, M "" MARTINSVILLE, VA MEALS $193.67
|
19
|
+
TOTAL "" "" "" $193.67
|
20
|
+
AARON, MICHAEL, L "" WEST ISLIP, NY MEALS $19.50
|
21
|
+
TOTAL "" "" "" $19.50
|
22
|
+
AARON, MICHAEL, R "" BROOKLYN, NY MEALS $65.92
|
23
|
+
TOTAL "" "" "" $65.92
|
24
|
+
AARONS, MARK, G "" PINEHURST, NC MEALS $154.19
|
25
|
+
TOTAL "" "" "" $154.19
|
26
|
+
AARONSON, GARY, A "" PHILADELPHIA, PA MEALS $205.17
|
27
|
+
TOTAL "" "" "" $205.17
|
28
|
+
AARONSON, ROBERT, M "" TUCSON, AZ MEALS $24.38
|
29
|
+
TOTAL "" "" "" $24.38
|
30
|
+
AASHEIM, RICHARD, J "" GREENEVILLE, TN EDUCATIONAL ITEMS $2.27
|
31
|
+
AASHEIM, RICHARD, J "" GREENEVILLE, TN MEALS $100.76
|
32
|
+
TOTAL "" "" "" $103.03
|
33
|
+
AASMAA, SIRIKE, T "" MONTVILLE, NJ MEALS $53.33
|
34
|
+
TOTAL "" "" "" $53.33
|
35
|
+
AAZAMI, HESSAM "" GRANADA HILLS, CA MEALS $402.90
|
36
|
+
TOTAL "" "" "" $402.90
|
37
|
+
ABAABA, ABIEDU, C "" JACKSONVILLE, FL MEALS $13.49
|
38
|
+
TOTAL "" "" "" $13.49
|
39
|
+
ABABNEH, ALAELDIN, A "" KANSAS CITY, KS MEALS $10.31
|
40
|
+
TOTAL "" "" "" $10.31
|
41
|
+
ABAD, ANTONIO, A "" CORAL SPRINGS, FL MEALS $516.29
|
42
|
+
TOTAL "" "" "" $516.29
|
43
|
+
ABADEER, PETER, S "" NORMAL, IL MEALS $200.38
|
44
|
+
TOTAL "" "" "" $200.38
|
45
|
+
ABAD, ENZO, L "" MIAMI, FL MEALS $67.61
|
46
|
+
TOTAL "" "" "" $67.61
|
47
|
+
ABADIAN SHARIFABAD, MANOOCHEHR "" GRANADA HILLS, CA MEALS $12.37
|
48
|
+
TOTAL "" "" "" $12.37
|
49
|
+
ABADI, CHRISTOPHER, A "" WARWICK, RI MEALS $157.42
|
50
|
+
TOTAL "" "" "" $157.42
|
51
|
+
ABADIE, MARCUS, G "" ATHENS, TX MEALS $361.89
|
52
|
+
TOTAL "" "" "" $361.89
|
53
|
+
ABADI, JAMSHEED, S "" BROOKLYN, NY MEALS $363.40
|
54
|
+
TOTAL "" "" "" $363.40
|
55
|
+
ABADILLA, JUNE, E "" JACKSON, KY MEALS $105.33
|
56
|
+
TOTAL "" "" "" $105.33
|
57
|
+
ABAD, JOHN, P "" NEWARK, OH MEALS $347.64
|
58
|
+
TOTAL "" "" "" $347.64
|
59
|
+
ABAD, JOSE, F "" FOLSOM, CA MEALS $30.28
|
60
|
+
TOTAL "" "" "" $30.28
|
61
|
+
ABAD, REMEDIOS, D "" WILNINGTON, DE MEALS $26.85
|
62
|
+
TOTAL "" "" "" $26.85
|
63
|
+
ABAD, SO KIM, F "" WICHITA FALLS, TX MEALS $136.52
|
64
|
+
TOTAL "" "" "" $136.52
|
65
|
+
ABAD, ZOILO, R "" MIAMI, FL MEALS $93.83
|
66
|
+
TOTAL "" "" "" $93.83
|
67
|
+
ABALIHI, CAROL, N "" EL PASO, TX MEALS $88.48
|
68
|
+
TOTAL "" "" "" $88.48
|
69
|
+
ABALOS, ANNA, T "" ROSEVILLE, CA MEALS $178.60
|
70
|
+
TOTAL "" "" "" $178.60
|
71
|
+
ABALOS, ARTURO, Z "" DELANO, CA MEALS $48.06
|
72
|
+
TOTAL "" "" "" $48.06
|
73
|
+
ABALOS, JOSEPH, M "" SENECA, PA MEALS $39.03
|
74
|
+
TOTAL "" "" "" $39.03
|
75
|
+
ABANDO, JOSE, R "" DAYTONA BEACH, FL MEALS $83.44
|
76
|
+
TOTAL "" "" "" $83.44
|
77
|
+
ABANG, ANTHONY, E "" ELIZABETHTOWN, KY MEALS $12.62
|
78
|
+
TOTAL "" "" "" $12.62
|
79
|
+
ABAN, KENRIC, T "" SAN DIEGO, CA MEALS $11.91
|
80
|
+
TOTAL "" "" "" $11.91
|
81
|
+
ABAQUETA, ALVIN, Y "" CHARLOTTE, NC MEALS $233.71
|
82
|
+
TOTAL "" "" "" $233.71
|
83
|
+
ABARCA, SERGIO, O "" TOOELE, UT MEALS $159.58
|
84
|
+
TOTAL "" "" "" $159.58
|
85
|
+
ABARIKWU, CONSTANTIA, A "" PHOENIX, AZ MEALS $153.57
|
86
|
+
TOTAL "" "" "" $153.57
|
87
|
+
ABASHIDZE, TEAH, A "" CLEVELAND, OH MEALS $153.59
|
88
|
+
TOTAL "" "" "" $153.59
|
Binary file
|
Binary file
|
@@ -0,0 +1,21 @@
|
|
1
|
+
Improved operation scenario,"","","","",""
|
2
|
+
Volume servers in:,2007,2008,2009,2010,2011
|
3
|
+
Server closets,"1,505","1,580","1,643","1,673","1,689"
|
4
|
+
Server rooms,"1,512","1,586","1,646","1,677","1,693"
|
5
|
+
Localized data centers,"1,512","1,586","1,646","1,677","1,693"
|
6
|
+
Mid-tier data centers,"1,512","1,586","1,646","1,677","1,693"
|
7
|
+
Enterprise-class data centers,"1,512","1,586","1,646","1,677","1,693"
|
8
|
+
Best practice scenario,"","","","",""
|
9
|
+
Volume servers in:,2007,2008,2009,2010,2011
|
10
|
+
Server closets,"1,456","1,439","1,386","1,296","1,326"
|
11
|
+
Server rooms,"1,465","1,472","1,427","1,334","1,371"
|
12
|
+
Localized data centers,"1,465","1,471","1,426","1,334","1,371"
|
13
|
+
Mid-tier data centers,"1,465","1,471","1,426","1,334","1,371"
|
14
|
+
Enterprise-class data centers,"1,465","1,471","1,426","1,334","1,371"
|
15
|
+
State-of-the-art scenario,"","","","",""
|
16
|
+
Volume servers in:,2007,2008,2009,2010,2011
|
17
|
+
Server closets,"1,485","1,471","1,424","1,315","1,349"
|
18
|
+
Server rooms,"1,495","1,573","1,586","1,424","1,485"
|
19
|
+
Localized data centers,"1,495","1,572","1,585","1,424","1,485"
|
20
|
+
Mid-tier data centers,"1,495","1,572","1,585","1,424","1,485"
|
21
|
+
Enterprise-class data centers,"1,495","1,572","1,585","1,424","1,485"
|
Binary file
|
Binary file
|
File without changes
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/test/heuristic.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#a list of filenames and the correct answer
|
2
|
+
# no more bs.
|
3
|
+
require_relative '../lib/tabula'
|
4
|
+
|
5
|
+
|
6
|
+
should_use_spreadsheet = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "spreadsheet/*") ).map{|a| [a, true]}
|
7
|
+
should_use_original = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "original/*") ).map{|a| [a, false]}
|
8
|
+
|
9
|
+
correct = []
|
10
|
+
misclassified_as_original = []
|
11
|
+
misclassified_as_spreadsheet = []
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
def heuristic(page)
|
16
|
+
page.is_tabular?
|
17
|
+
end
|
18
|
+
|
19
|
+
(should_use_spreadsheet + should_use_original) .each do |filename, expected_to_be_tabular|
|
20
|
+
extractor = Tabula::Extraction::ObjectExtractor.new(filename, [1])
|
21
|
+
|
22
|
+
page = extractor.extract.first
|
23
|
+
page.get_ruling_lines!
|
24
|
+
# puts "#{File.basename(filename)} | #{expected_to_be_tabular}"
|
25
|
+
page_is_tabular = heuristic(page)
|
26
|
+
# puts ""
|
27
|
+
|
28
|
+
if page_is_tabular && expected_to_be_tabular || !page_is_tabular && !expected_to_be_tabular
|
29
|
+
correct << filename
|
30
|
+
elsif page_is_tabular && !expected_to_be_tabular
|
31
|
+
misclassified_as_spreadsheet << filename
|
32
|
+
elsif !page_is_tabular && expected_to_be_tabular
|
33
|
+
misclassified_as_original << filename
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
puts "#{correct.size} PDFs were correctly classified"
|
38
|
+
puts "#{misclassified_as_original.size + misclassified_as_spreadsheet.size} PDFs were incorrectly classified"
|
39
|
+
unless misclassified_as_spreadsheet.empty?
|
40
|
+
puts "#{misclassified_as_spreadsheet.size} PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"
|
41
|
+
misclassified_as_spreadsheet.each do |filename|
|
42
|
+
puts " - #{File.basename(filename)}"
|
43
|
+
end
|
44
|
+
end
|
45
|
+
unless misclassified_as_original.empty?
|
46
|
+
puts "#{misclassified_as_original.size} PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"
|
47
|
+
misclassified_as_original.each do |filename|
|
48
|
+
puts " - #{File.basename(filename)}"
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf --silent -o test.csv
|
2
|
+
bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf -o test.csv
|
3
|
+
bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv
|
4
|
+
bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv --format TSV
|
5
|
+
bin/tabula test/data/campaign_donors.pdf -o test.csv --columns 47,147,256,310,375,431,504 #columns should work
|
6
|
+
bin/tabula test/data/argentina_diputados_voting_record.pdf --guess -o test.csv --format TSV #should exclude guff
|
7
|
+
bin/tabula test/data/vertical_rulings_bug.pdf --area 250,0,325,1700 -o test.csv --format TSV #should be only a few lines
|
data/test/tests.rb
CHANGED
@@ -6,10 +6,109 @@ require_relative '../lib/tabula'
|
|
6
6
|
|
7
7
|
def lines_to_array(lines)
|
8
8
|
lines.map { |l|
|
9
|
-
l.map { |te| te.text }
|
9
|
+
l.map { |te| te.text.strip }
|
10
10
|
}
|
11
11
|
end
|
12
12
|
|
13
|
+
def lines_to_table(lines)
|
14
|
+
Tabula::Table.new_from_array(lines_to_array(lines))
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
# I don't want to pollute the "real" clasend a funny inspect method. Just for testing comparisons.
|
19
|
+
module Tabula
|
20
|
+
class Table
|
21
|
+
def inspect
|
22
|
+
"[" + lines.map(&:inspect).join(",") + "]"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
module Tabula
|
28
|
+
class Line
|
29
|
+
def inspect
|
30
|
+
@text_elements.map(&:text).inspect
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
class TestEntityComparability < Minitest::Test
|
37
|
+
def test_text_element_comparability
|
38
|
+
base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
|
39
|
+
|
40
|
+
two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
|
41
|
+
three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
|
42
|
+
four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
|
43
|
+
|
44
|
+
five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
|
45
|
+
six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
|
46
|
+
seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
|
47
|
+
assert_equal base, two
|
48
|
+
assert_equal base, three
|
49
|
+
assert_equal base, four
|
50
|
+
|
51
|
+
refute_equal base, five
|
52
|
+
refute_equal base, six
|
53
|
+
refute_equal base, seven
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_line_comparability
|
57
|
+
text_base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
|
58
|
+
|
59
|
+
text_two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy \n", nil)
|
60
|
+
text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
|
61
|
+
text_four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
|
62
|
+
|
63
|
+
text_five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
|
64
|
+
text_six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy kj", 55)
|
65
|
+
text_seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy kj", nil)
|
66
|
+
line_base = Tabula::Line.new
|
67
|
+
line_base.text_elements = [text_base, text_two, text_three]
|
68
|
+
line_equal = Tabula::Line.new
|
69
|
+
line_equal.text_elements = [text_base, text_two, text_three]
|
70
|
+
line_equal_but_longer = Tabula::Line.new
|
71
|
+
line_equal_but_longer.text_elements = [text_base, text_two, text_three, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
|
72
|
+
line_unequal = Tabula::Line.new
|
73
|
+
line_unequal.text_elements = [text_base, text_two, text_three, text_five]
|
74
|
+
line_unequal_and_longer = Tabula::Line.new
|
75
|
+
line_unequal_and_longer.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
|
76
|
+
line_unequal_and_longer_and_different = Tabula::Line.new
|
77
|
+
line_unequal_and_longer_and_different.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, 'whatever']
|
78
|
+
|
79
|
+
assert_equal line_base, line_equal
|
80
|
+
assert_equal line_base, line_equal_but_longer
|
81
|
+
refute_equal line_base, line_unequal
|
82
|
+
refute_equal line_base, line_unequal_and_longer
|
83
|
+
refute_equal line_base, line_unequal_and_longer_and_different
|
84
|
+
end
|
85
|
+
|
86
|
+
def test_table_comparability
|
87
|
+
rows_base = [["a", "b", "c"], ['', 'd', '']]
|
88
|
+
rows_equal = [["a", "b", "c"], ['', 'd']]
|
89
|
+
rows_equal_padded = [['', "a", "b", "c"], ['', '', 'd']]
|
90
|
+
rows_unequal_one = [["a", "b", "c"], ['d']]
|
91
|
+
rows_unequal_two = [["a", "b", "c"], ['d', '']]
|
92
|
+
rows_unequal_three = [["a", "b", "c"], ['d'], ['a','b', 'd']]
|
93
|
+
rows_unequal_four = [["a", "b", "c"]]
|
94
|
+
|
95
|
+
table_base = Tabula::Table.new_from_array(rows_base)
|
96
|
+
table_equal = Tabula::Table.new_from_array(rows_equal)
|
97
|
+
table_equal_column_padded = Tabula::Table.new_from_array(rows_equal_padded)
|
98
|
+
table_unequal_one = Tabula::Table.new_from_array(rows_unequal_one)
|
99
|
+
table_unequal_two = Tabula::Table.new_from_array(rows_unequal_two)
|
100
|
+
table_unequal_three = Tabula::Table.new_from_array(rows_unequal_three)
|
101
|
+
table_unequal_four = Tabula::Table.new_from_array(rows_unequal_four)
|
102
|
+
|
103
|
+
assert_equal table_base, table_equal
|
104
|
+
assert_equal table_base, table_equal_column_padded
|
105
|
+
refute_equal table_base, table_unequal_one
|
106
|
+
refute_equal table_base, table_unequal_two
|
107
|
+
refute_equal table_base, table_unequal_three
|
108
|
+
refute_equal table_base, table_unequal_four
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
13
112
|
class TestPagesInfoExtractor < Minitest::Test
|
14
113
|
def test_pages_info_extractor
|
15
114
|
extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
@@ -24,42 +123,74 @@ class TestPagesInfoExtractor < Minitest::Test
|
|
24
123
|
end
|
25
124
|
|
26
125
|
class TestTableGuesser < Minitest::Test
|
126
|
+
def test_find_rects_from_lines_with_lsd
|
127
|
+
skip "Skipping until we actually use LSD"
|
128
|
+
filename = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
129
|
+
page_index = 0
|
130
|
+
lines = Tabula::Extraction::LineExtractor.lines_in_pdf_page(filename, page_index, :render_pdf => true)
|
131
|
+
|
132
|
+
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
133
|
+
page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
|
134
|
+
expected_page_areas = [[54.087890625, 50.203125, 734.220703125, 550.44140625]]
|
135
|
+
assert_equal expected_page_areas, page_areas
|
136
|
+
end
|
137
|
+
|
27
138
|
end
|
28
139
|
|
29
140
|
class TestDumper < Minitest::Test
|
30
141
|
|
31
142
|
def test_extractor
|
32
|
-
extractor = Tabula::Extraction::
|
33
|
-
page = extractor.extract.
|
143
|
+
extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
144
|
+
page = extractor.extract.next
|
34
145
|
assert_instance_of Tabula::Page, page
|
35
146
|
end
|
36
147
|
|
37
148
|
def test_get_by_area
|
38
|
-
|
39
|
-
# http://localhost:8080/debug/418b1d5698e5c7b724551d9610c071ab3063275c/characters?x1=57.921428571428564&x2=290.7&y1=107.1&y2=394.52142857142854&page=1&use_lines=false
|
40
|
-
extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
149
|
+
extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
|
41
150
|
characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
|
42
151
|
assert_equal characters.size, 206
|
43
152
|
end
|
44
153
|
end
|
45
154
|
|
155
|
+
class TestRulingIntersection < Minitest::Test
|
156
|
+
def test_ruling_intersection
|
157
|
+
horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
|
158
|
+
verticals = [Tabula::Ruling.new(1, 3, 0, 11),
|
159
|
+
Tabula::Ruling.new(1, 4, 0, 11)]
|
160
|
+
ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
|
161
|
+
assert_equal 2, ints.size
|
162
|
+
assert_equal ints[0][0].getX, 3.0
|
163
|
+
assert_equal ints[0][0].getY, 10.0
|
164
|
+
assert_equal ints[1][0].getX, 4.0
|
165
|
+
assert_equal ints[1][0].getY, 10.0
|
166
|
+
|
167
|
+
verticals = [Tabula::Ruling.new(20, 3, 0, 11)]
|
168
|
+
ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
|
169
|
+
assert_equal ints.size, 0
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
46
173
|
class TestExtractor < Minitest::Test
|
47
174
|
|
48
175
|
def test_table_extraction_1
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
176
|
+
table = lines_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
|
177
|
+
1,
|
178
|
+
[107.1, 57.9214, 394.5214, 290.7],
|
179
|
+
:detect_ruling_lines => false)
|
180
|
+
|
181
|
+
expected = [["Prior Scale","New Scale","% Rank*"], ["800","170","99"], ["790","170","99"], ["780","170","99"], ["770","170","99"], ["760","170","99"], ["750","169","99"], ["740","169","99"], ["730","168","98"], ["720","168","98"], ["710","167","97"], ["700","166","96"], ["690","165","95"], ["680","165","95"], ["670","164","93"], ["660","164","93"], ["650","163","91"]]
|
182
|
+
|
53
183
|
assert_equal expected, table
|
54
184
|
end
|
55
185
|
|
56
186
|
def test_diputados_voting_record
|
57
|
-
|
58
|
-
|
187
|
+
table = lines_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
|
188
|
+
1,
|
189
|
+
[269.875, 12.75, 790.5, 561])
|
59
190
|
|
60
191
|
expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
|
61
192
|
|
62
|
-
assert_equal expected,
|
193
|
+
assert_equal expected, table
|
63
194
|
end
|
64
195
|
|
65
196
|
def test_forest_disclosure_report_dont_regress
|
@@ -67,80 +198,362 @@ class TestExtractor < Minitest::Test
|
|
67
198
|
# test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
|
68
199
|
# and a solution for half-x-height-offset lines.
|
69
200
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
70
|
-
character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
|
71
|
-
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
72
|
-
vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
|
73
201
|
|
202
|
+
table = lines_to_table Tabula.extract_table(pdf_file_path,
|
203
|
+
1,
|
204
|
+
[106.01, 48.09, 227.31, 551.89],
|
205
|
+
:detect_ruling_lines => true)
|
74
206
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
|
80
|
-
['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
|
81
|
-
['TOTAL', '', '', '', '$471.25'],
|
82
|
-
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
|
83
|
-
['TOTAL', '', '', '','$20.39'],
|
84
|
-
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
|
85
|
-
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
86
|
-
['TOTAL', '', '', '', '$5,010.33'],
|
87
|
-
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
|
88
|
-
['TOTAL', '', '', '', '$193.67'],
|
89
|
-
['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
|
90
|
-
|
91
|
-
assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
|
207
|
+
expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
|
208
|
+
|
209
|
+
|
210
|
+
assert_equal expected, table
|
92
211
|
end
|
93
212
|
|
94
213
|
def test_missing_spaces_around_an_ampersand
|
95
214
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
96
|
-
character_extractor = Tabula::Extraction::
|
97
|
-
|
98
|
-
|
215
|
+
character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
|
216
|
+
page_obj = character_extractor.extract.next
|
217
|
+
lines = page_obj.ruling_lines
|
218
|
+
vertical_rulings = lines.select(&:vertical?)
|
99
219
|
|
220
|
+
area = [170, 28, 185, 833] #top left bottom right
|
100
221
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
222
|
+
expected = Tabula::Table.new_from_array([
|
223
|
+
["", "REGIONAL PULMONARY & SLEEP",],
|
224
|
+
["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
|
225
|
+
["", "MEDICINE", ],
|
226
|
+
])
|
106
227
|
|
107
|
-
assert_equal expected,
|
228
|
+
assert_equal expected, lines_to_table(page_obj.get_area(area).make_table(:vertical_rulings => vertical_rulings))
|
108
229
|
end
|
109
230
|
|
110
231
|
def test_forest_disclosure_report
|
111
232
|
skip "Skipping until we support multiline cells"
|
112
233
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
113
|
-
character_extractor = Tabula::Extraction::
|
234
|
+
character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
|
114
235
|
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
115
|
-
vertical_rulings = lines.select(&:vertical?)
|
236
|
+
vertical_rulings = lines.select(&:vertical?) #.uniq{|line| (line.left / 10).round }
|
116
237
|
|
117
|
-
|
238
|
+
page_obj = character_extractor.extract.next
|
239
|
+
characters = page_obj.get_text([110, 28, 218, 833])
|
118
240
|
#top left bottom right
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
241
|
+
expected = Tabula::Table.new_from_array([
|
242
|
+
['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '', '$85.00'],
|
243
|
+
['TOTAL', '', '', '','$85.00'],
|
244
|
+
['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '', '$78.80'],
|
245
|
+
['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '', '$392.45'],
|
246
|
+
['TOTAL', '', '', '', '$471.25'],
|
247
|
+
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '', '$20.39'],
|
248
|
+
['TOTAL', '', '', '','$20.39'],
|
249
|
+
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '', '$310.33'],
|
250
|
+
['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '', '$4,700.00'],
|
251
|
+
['TOTAL', '', '', '', '$5,010.33'],
|
252
|
+
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '', '$193.67'],
|
253
|
+
['TOTAL', '', '', '', '$193.67'],
|
254
|
+
['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '', '$19.50']
|
255
|
+
])
|
256
|
+
|
257
|
+
assert_equal expected, lines_to_table(Tabula.make_table(characters, :vertical_rulings => vertical_rulings))
|
134
258
|
end
|
135
259
|
|
136
260
|
# TODO Spaces inserted in words - fails
|
137
261
|
def test_bo_page24
|
138
|
-
|
139
|
-
|
262
|
+
table = lines_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
|
263
|
+
1,
|
264
|
+
[425.625, 53.125, 575.714, 810.535],
|
265
|
+
:detect_ruling_lines => false)
|
140
266
|
|
141
267
|
expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
|
142
|
-
|
268
|
+
|
269
|
+
assert_equal expected, table
|
270
|
+
end
|
271
|
+
|
272
|
+
|
273
|
+
# If a vertical ruling crosses over a word, the word should be split at that
# vertical ruling. Before the fix, the entire word would end up on one side
# of the ruling.
def test_vertical_rulings_splitting_words
  pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))

  # Both of these are semantically "correct"; the difference is in how we
  # handle multi-line cells.
  expected = Tabula::Table.new_from_array([
    ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
    ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
    ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
  ])
  other_expected = Tabula::Table.new_from_array([
    ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
    ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
    ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH", "", "", "$22.93", "", "", "$22.93"],
    ["", "", "", "ABRAHAMSON"]
  ])

  # N.B. it's "MORGANTOWN", "WV" that we're most interested in here
  # (it used to show up as ["MORGANTOWNWV", "", ""]).

  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2)
  extractor.extract.each_with_index do |page, _page_index|
    page_areas = [[250, 0, 325, 1700]]

    # Ruling x-positions below are expressed against a 1700-unit-wide layout.
    scale = page.width / 1700
    rulings = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548].map do |x|
      Tabula::Ruling.new(0, x * scale, 0, 1000)
    end

    tables = page_areas.map do |area|
      page.get_area(area).make_table(:vertical_rulings => rulings)
    end
    assert_equal expected, lines_to_table(tables.first)
  end
end
|
309
|
+
|
310
|
+
# Explicit vertical rulings must keep adjacent columns from being merged.
def test_vertical_rulings_prevent_merging_of_columns
  expected = [["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"], ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"], ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]]

  # One ruling per column boundary.
  vertical_rulings = [47, 147, 256, 310, 375, 431, 504].map { |x| Tabula::Ruling.new(0, x, 0, 1000) }

  actual = lines_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
                                               1,
                                               [255.57, 40.43, 398.76, 557.35],
                                               :vertical_rulings => vertical_rulings)

  assert_equal expected, actual
end
|
322
|
+
|
323
|
+
# Regression test for word spacing / cell merging with ruling-line detection on.
def test_get_spacing_and_merging_right
  actual = lines_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
                                               1,
                                               [52.32857142857143, 15.557142857142859, 128.70000000000002, 767.9571428571429],
                                               :detect_ruling_lines => true)

  expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia ", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]

  assert_equal expected, actual
end
|
334
|
+
|
335
|
+
|
336
|
+
# Minimal harness that mixes in Tabula::HasCells over a pre-built cell list,
# so the spreadsheet-detection algorithm can be exercised in isolation.
class SpreadsheetsHasCellsTester
  include Tabula::HasCells

  attr_accessor :cells

  # cell_list: array of Tabula::Cell to run detection over.
  def initialize(cell_list)
    @cells = cell_list
  end
end
|
343
|
+
|
344
|
+
#just tests the algorithm
def test_cells_to_spreadsheets
  # [top, left, width, height] for every detected cell; mapped into
  # Tabula::Cell below instead of repeating the constructor 120 times.
  cell_dims = [
    [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0], [50.0, 18.0, 52.0, 4.0],
    [54.0, 18.0, 52.0, 6.0], [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
    [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0], [90.0, 18.0, 52.0, 4.0],
    [94.0, 18.0, 52.0, 6.0], [100.0, 18.0, 52.0, 28.0], [128.0, 18.0, 52.0, 4.0],
    [132.0, 18.0, 52.0, 64.0], [196.0, 18.0, 52.0, 66.0], [262.0, 18.0, 52.0, 4.0],
    [266.0, 18.0, 52.0, 84.0], [350.0, 18.0, 52.0, 4.0], [354.0, 18.0, 52.0, 32.0],
    [386.0, 18.0, 52.0, 38.0], [424.0, 18.0, 52.0, 18.0], [442.0, 18.0, 52.0, 74.0],
    [516.0, 18.0, 52.0, 28.0], [544.0, 18.0, 52.0, 4.0], [44.0, 70.0, 156.0, 6.0],
    [50.0, 70.0, 156.0, 4.0], [54.0, 70.0, 156.0, 6.0], [60.0, 70.0, 156.0, 4.0],
    [64.0, 70.0, 156.0, 6.0], [70.0, 70.0, 156.0, 4.0], [74.0, 70.0, 156.0, 6.0],
    [84.0, 70.0, 2.0, 6.0], [90.0, 70.0, 156.0, 4.0], [94.0, 70.0, 156.0, 6.0],
    [100.0, 70.0, 156.0, 28.0], [128.0, 70.0, 156.0, 4.0], [132.0, 70.0, 156.0, 64.0],
    [196.0, 70.0, 156.0, 66.0], [262.0, 70.0, 156.0, 4.0], [266.0, 70.0, 156.0, 84.0],
    [350.0, 70.0, 156.0, 4.0], [354.0, 70.0, 156.0, 32.0], [386.0, 70.0, 156.0, 38.0],
    [424.0, 70.0, 156.0, 18.0], [442.0, 70.0, 156.0, 74.0], [516.0, 70.0, 156.0, 28.0],
    [544.0, 70.0, 156.0, 4.0], [84.0, 72.0, 446.0, 6.0], [90.0, 226.0, 176.0, 4.0],
    [94.0, 226.0, 176.0, 6.0], [100.0, 226.0, 176.0, 28.0], [128.0, 226.0, 176.0, 4.0],
    [132.0, 226.0, 176.0, 64.0], [196.0, 226.0, 176.0, 66.0], [262.0, 226.0, 176.0, 4.0],
    [266.0, 226.0, 176.0, 84.0], [350.0, 226.0, 176.0, 4.0], [354.0, 226.0, 176.0, 32.0],
    [386.0, 226.0, 176.0, 38.0], [424.0, 226.0, 176.0, 18.0], [442.0, 226.0, 176.0, 74.0],
    [516.0, 226.0, 176.0, 28.0], [544.0, 226.0, 176.0, 4.0], [90.0, 402.0, 116.0, 4.0],
    [94.0, 402.0, 116.0, 6.0], [100.0, 402.0, 116.0, 28.0], [128.0, 402.0, 116.0, 4.0],
    [132.0, 402.0, 116.0, 64.0], [196.0, 402.0, 116.0, 66.0], [262.0, 402.0, 116.0, 4.0],
    [266.0, 402.0, 116.0, 84.0], [350.0, 402.0, 116.0, 4.0], [354.0, 402.0, 116.0, 32.0],
    [386.0, 402.0, 116.0, 38.0], [424.0, 402.0, 116.0, 18.0], [442.0, 402.0, 116.0, 74.0],
    [516.0, 402.0, 116.0, 28.0], [544.0, 402.0, 116.0, 4.0], [84.0, 518.0, 246.0, 6.0],
    [90.0, 518.0, 186.0, 4.0], [94.0, 518.0, 186.0, 6.0], [100.0, 518.0, 186.0, 28.0],
    [128.0, 518.0, 186.0, 4.0], [132.0, 518.0, 186.0, 64.0], [196.0, 518.0, 186.0, 66.0],
    [262.0, 518.0, 186.0, 4.0], [266.0, 518.0, 186.0, 84.0], [350.0, 518.0, 186.0, 4.0],
    [354.0, 518.0, 186.0, 32.0], [386.0, 518.0, 186.0, 38.0], [424.0, 518.0, 186.0, 18.0],
    [442.0, 518.0, 186.0, 74.0], [516.0, 518.0, 186.0, 28.0], [544.0, 518.0, 186.0, 4.0],
    [90.0, 704.0, 60.0, 4.0], [94.0, 704.0, 60.0, 6.0], [100.0, 704.0, 60.0, 28.0],
    [128.0, 704.0, 60.0, 4.0], [132.0, 704.0, 60.0, 64.0], [196.0, 704.0, 60.0, 66.0],
    [262.0, 704.0, 60.0, 4.0], [266.0, 704.0, 60.0, 84.0], [350.0, 704.0, 60.0, 4.0],
    [354.0, 704.0, 60.0, 32.0], [386.0, 704.0, 60.0, 38.0], [424.0, 704.0, 60.0, 18.0],
    [442.0, 704.0, 60.0, 74.0], [516.0, 704.0, 60.0, 28.0], [544.0, 704.0, 60.0, 4.0],
    [84.0, 764.0, 216.0, 6.0], [90.0, 764.0, 216.0, 4.0], [94.0, 764.0, 216.0, 6.0],
    [100.0, 764.0, 216.0, 28.0], [128.0, 764.0, 216.0, 4.0], [132.0, 764.0, 216.0, 64.0],
    [196.0, 764.0, 216.0, 66.0], [262.0, 764.0, 216.0, 4.0], [266.0, 764.0, 216.0, 84.0],
    [350.0, 764.0, 216.0, 4.0], [354.0, 764.0, 216.0, 32.0], [386.0, 764.0, 216.0, 38.0],
    [424.0, 764.0, 216.0, 18.0], [442.0, 764.0, 216.0, 74.0], [516.0, 764.0, 216.0, 28.0],
    [544.0, 764.0, 216.0, 4.0]
  ]
  cells = cell_dims.map { |top, left, w, h| Tabula::Cell.new(top, left, w, h) }

  expected_spreadsheets = [
    Tabula::Spreadsheet.new(40.0, 18.0, 208.0, 40.0, nil, nil, nil, nil),
    Tabula::Spreadsheet.new(84.0, 18.0, 962.0, 464.0, nil, nil, nil, nil)
  ]

  # compares spreadsheets on area only.
  actual_bounds = SpreadsheetsHasCellsTester.new(cells).find_spreadsheets_from_cells.map do |found|
    r = found.getBounds
    [r.x, r.y, r.width, r.height]
  end
  assert_equal expected_spreadsheets.map { |s| [s.x, s.y, s.width, s.height] }, actual_bounds
end
|
420
|
+
|
421
|
+
# Placeholder: spanning-cell insertion is not yet covered.
def test_add_spanning_cells
  skip("until I write it")
end
|
424
|
+
|
425
|
+
# Placeholder: irregularly-shaped tables are not yet covered.
def test_add_placeholder_cells_to_funny_shaped_tables
  skip("until I write it, cf 01005787B_Pakistan.pdf")
end
|
428
|
+
|
429
|
+
# Minimal harness that mixes in Tabula::HasCells over fixed ruling lines and
# runs cell detection eagerly, so callers can read #cells straight away.
class CellsHasCellsTester
  include Tabula::HasCells

  attr_accessor :vertical_ruling_lines, :horizontal_ruling_lines, :cells

  # verticals / horizontals: arrays of Tabula::Ruling used by find_cells!.
  def initialize(verticals, horizontals)
    @cells = []
    @vertical_ruling_lines = verticals
    @horizontal_ruling_lines = horizontals
    find_cells!
  end
end
|
439
|
+
|
440
|
+
#just tests the algorithm
def test_lines_to_cells
  vertical_ruling_lines = [
    Tabula::Ruling.new(40.0, 18.0, 0.0, 40.0),
    Tabula::Ruling.new(44.0, 70.0, 0.0, 36.0),
    Tabula::Ruling.new(40.0, 226.0, 0.0, 40.0)
  ]

  # Nine horizontal rulings, all spanning x = 18..226, at varying tops.
  horizontal_ruling_lines = [40.0, 44.0, 50.0, 54.0, 60.0, 64.0, 70.0, 74.0, 80.0].map do |top|
    Tabula::Ruling.new(top, 18.0, 208.0, 0.0)
  end

  # [top, left, width, height] of every cell the rulings should produce.
  expected_cells = [
    [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0], [50.0, 18.0, 52.0, 4.0],
    [54.0, 18.0, 52.0, 6.0], [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
    [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0], [44.0, 70.0, 156.0, 6.0],
    [50.0, 70.0, 156.0, 4.0], [54.0, 70.0, 156.0, 6.0], [60.0, 70.0, 156.0, 4.0],
    [64.0, 70.0, 156.0, 6.0], [70.0, 70.0, 156.0, 4.0], [74.0, 70.0, 156.0, 6.0]
  ].map { |top, left, w, h| Tabula::Cell.new(top, left, w, h) }

  actual_cells = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines).cells
  # Sets: cell ordering is irrelevant to the assertion.
  assert_equal Set.new(expected_cells), Set.new(actual_cells)
end
|
144
468
|
|
469
|
+
#this is the real deal!!
# End-to-end: every page's first detected spreadsheet must serialize to the
# exact TSV fixture.
def test_extract_tabular_data_using_lines_and_spreadsheets
  pdf_file_path = "./test/data/frx_2012_disclosure.pdf"
  expected_data_path = "./test/data/frx_2012_disclosure.tsv"
  # File.read opens and closes the file; the previous `open(path, 'r').read`
  # left the handle to be reclaimed by GC.
  expected = File.read(expected_data_path)

  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all).extract.each do |pdf_page|
    spreadsheet = pdf_page.spreadsheets.first
    assert_equal expected, spreadsheet.to_tsv
  end
end
|
480
|
+
|
481
|
+
# A page with no ruled tables must yield zero spreadsheets (dark lines only,
# per the line_color_filter).
def test_cope_with_a_tableless_page
  pdf_file_path = "./test/data/no_tables.pdf"

  # Keep only near-black lines: every color component below 0.1.
  dark_only = lambda { |components| components.all? { |c| c < 0.1 } }
  extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all, '',
                                                      :line_color_filter => dark_only)
  sheets = extractor.extract.to_a.first.spreadsheets

  assert_equal 0, sheets.size
end
|
490
|
+
|
491
|
+
# Spanning cells must round-trip to the exact CSV fixture.
def test_spanning_cells
  pdf_file_path = "./test/data/spanning_cells.pdf"
  expected_data_path = "./test/data/spanning_cells.csv"
  # File.read opens and closes the file; the previous `open(path, 'r').read`
  # left the handle to be reclaimed by GC.
  expected = File.read(expected_data_path)

  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
    spreadsheet = pdf_page.spreadsheets.first
    assert_equal expected, spreadsheet.to_csv
  end
end
|
501
|
+
|
502
|
+
# Lines that are only approximately vertical should still be classified as
# vertical rulings after cropping to the area of interest.
def test_almost_vertical_lines
  pdf_file_path = "./test/data/puertos1.pdf"
  top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
  area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)

  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |page|
    cropped = Tabula::Ruling.crop_rulings_to_area(page.ruling_lines, area)
    # TODO assertion not entirely correct, should do the trick for now
    assert_equal 15, cropped.count(&:vertical?)
  end
end
|
514
|
+
|
515
|
+
# Cropping a page to an area and then asking for spreadsheets should yield
# the table restricted to that area.
def test_extract_spreadsheet_within_an_area
  pdf_file_path = "./test/data/puertos1.pdf"
  top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286

  Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |page|
    cropped = page.get_area([top, left, bottom, right])
    rows = cropped.spreadsheets.first.to_a

    assert_equal 15, rows.length
    # Spot-check the header row and the totals row.
    assert_equal ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"], rows.first
    assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], rows.last
  end
end
|
527
|
+
end
|
528
|
+
|
529
|
+
# Exercises Tabula's "is this page tabular?" heuristic against fixture PDFs
# with known classifications.
class TestIsTabularHeuristic < Minitest::Test

  # Fixtures that SHOULD be classified as ruled spreadsheets.
  EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', 'GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf'].freeze
  # Fixtures that should NOT be classified as ruled spreadsheets.
  NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf'].freeze

  # (Removed: a bare `File.expand_path('data/frx_2012_disclosure.pdf', ...)`
  # expression whose result was discarded — dead code.)

  def test_heuristic_detects_spreadsheets
    EXPECTED_TO_BE_SPREADSHEET.each do |f|
      assert first_page_of(f).is_tabular?, "expected #{f} to be detected as tabular"
    end
  end

  def test_heuristic_detects_non_spreadsheets
    NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
      refute first_page_of(f).is_tabular?, "expected #{f} not to be detected as tabular"
    end
  end

  private

  # Extracts page 1 of test/data/+basename+ with its ruling lines populated,
  # ready for the is_tabular? heuristic. (Was duplicated in both tests.)
  def first_page_of(basename)
    path = File.expand_path('data/' + basename, File.dirname(__FILE__))
    page = Tabula::Extraction::ObjectExtractor.new(path, [1]).extract.first
    page.get_ruling_lines!
    page
  end
end
|