RubyGems - tabula-extractor - Versions diffs - 0.6.6-java → 0.7.0-java - Mend

tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

checksums.yaml +7 -0
data/AUTHORS.md +1 -0
data/README.md +27 -11
data/bin/tabula +61 -19
data/ext/liblsd-linux32.so +0 -0
data/ext/liblsd-linux64.so +0 -0
data/ext/liblsd.dll +0 -0
data/ext/liblsd.dylib +0 -0
data/ext/liblsd64.dll +0 -0
data/ext/lsd.c +137 -137
data/ext/lsd.h +9 -9
data/lib/tabula.rb +20 -3
data/lib/tabula/core_ext.rb +261 -0
data/lib/tabula/entities.rb +11 -456
data/lib/tabula/entities/cell.rb +42 -0
data/lib/tabula/entities/has_cells.rb +244 -0
data/lib/tabula/entities/line.rb +39 -0
data/lib/tabula/entities/page.rb +269 -0
data/lib/tabula/entities/page_area.rb +7 -0
data/lib/tabula/entities/ruling.rb +300 -0
data/lib/tabula/entities/spreadsheet.rb +92 -0
data/lib/tabula/entities/table.rb +81 -0
data/lib/tabula/entities/text_chunk.rb +114 -0
data/lib/tabula/entities/text_element.rb +112 -0
data/lib/tabula/entities/zone_entity.rb +57 -0
data/lib/tabula/extraction.rb +327 -0
data/lib/tabula/line_segment_detector.rb +9 -7
data/lib/tabula/pdf_line_extractor.rb +319 -0
data/lib/tabula/pdf_render.rb +1 -5
data/lib/tabula/spreadsheet_extractor.rb +52 -0
data/lib/tabula/table_extractor.rb +50 -348
data/lib/tabula/table_guesser.rb +21 -23
data/lib/tabula/version.rb +1 -1
data/lib/tabula/writers.rb +5 -6
data/tabula-extractor.gemspec +1 -0
data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
data/test/data/47008204D_USA.page4.pdf +0 -0
data/test/data/560015757GV_China.page1.pdf +0 -0
data/test/data/GSK_2012_Q4.page437.pdf +0 -0
data/test/data/S2MNCEbirdisland.pdf +0 -0
data/test/data/campaign_donors.pdf +0 -0
data/test/data/frx_2012_disclosure.tsv +88 -0
data/test/data/no_tables.pdf +0 -0
data/test/data/puertos1.pdf +0 -0
data/test/data/spanning_cells.csv +21 -0
data/test/data/spanning_cells.pdf +0 -0
data/test/data/strongschools.pdf +0 -0
data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
data/test/data/vietnam3.pdf +0 -0
data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
data/test/heuristic.rb +50 -0
data/test/test_bin_tabula.sh +7 -0
data/test/tests.rb +476 -63
metadata +79 -28
data/lib/geom/point.rb +0 -21
data/lib/geom/rectangle.rb +0 -101
data/lib/geom/segment.rb +0 -82
data/lib/tabula/pdf_dump.rb +0 -132
data/lib/tabula/whitespace.rb +0 -50
data/vertical_rulings_bug.rb +0 -29

data/lib/tabula/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Tabula
-  VERSION = '0.6.6'
+  VERSION = '0.7.0'
 end

data/lib/tabula/writers.rb CHANGED Viewed

@@ -5,9 +5,9 @@ module Tabula
   module Writers
     def Writers.CSV(lines, output=$stdout)
-      lines.each { |l|
+      lines.each do |l|
         output.write CSV.generate_line(l.map(&:text), row_sep: "\r\n")
-      }
+      end
     end
     def Writers.JSON(lines, output=$stdout)
@@ -15,12 +15,11 @@ module Tabula
     end
     def Writers.TSV(lines, output=$stdout)
-      lines.each { |l|
-        output.write(l.map(&:text).join("\t") + "\n")
-      }
+      lines.each do |l|
+        output.write CSV.generate_line(l.map(&:text), col_sep: "\t", row_sep: "\r\n")
+      end
     end
     def Writers.HTML(lines, output=$stdout)
       raise "not implemented"
     end

data/tabula-extractor.gemspec CHANGED Viewed

@@ -24,6 +24,7 @@ Gem::Specification.new do |s|
   s.add_development_dependency 'minitest'
   s.add_development_dependency 'bundler', '>= 1.3.4'
   s.add_development_dependency 'ruby-debug'
+  s.add_development_dependency 'pry'
   s.add_runtime_dependency "trollop", ["~> 2.0"]
 #  s.add_runtime_dependency "algorithms", ["~> 0.6.1"]

data/target/pdfbox-app-2.0.0-SNAPSHOT.jar CHANGED Viewed

Binary file

data/test/data/47008204D_USA.page4.pdf ADDED Viewed

Binary file

data/test/data/560015757GV_China.page1.pdf ADDED Viewed

Binary file

data/test/data/GSK_2012_Q4.page437.pdf ADDED Viewed

Binary file

data/test/data/S2MNCEbirdisland.pdf ADDED Viewed

Binary file

data/test/data/campaign_donors.pdf ADDED Viewed

Binary file

data/test/data/frx_2012_disclosure.tsv ADDED Viewed

@@ -0,0 +1,88 @@
+FOREST LABORATORIES, INC. DISCLOSURE REPORT	""	""	""	""
+Calendar Year - 2012	""	""	""	""
+Physician	Related Entity (if applicable)	City / State	Purpose of Payment	Amount ($USD) * **
+AALAEI, BEHZAD	""	HIGHLAND, IN	MEALS	$51.24
+TOTAL	""	""	""	$51.24
+AAMODT, DENISE, E	""	ALBUQUERQUE, NM	MEALS	$66.12
+TOTAL	""	""	""	$66.12
+AANONSEN, DEBORAH, A	""	STATEN ISLAND, NY	MEALS	$85.00
+TOTAL	""	""	""	$85.00
+AARON, CAREN, T	""	RICHMOND, VA	EDUCATIONAL ITEMS	$78.80
+AARON, CAREN, T	""	RICHMOND, VA	MEALS	$392.45
+TOTAL	""	""	""	$471.25
+AARON, JOHN	""	CLARKSVILLE, TN	MEALS	$20.39
+TOTAL	""	""	""	$20.39
+AARON, JOSHUA, N	""	WEST GROVE, PA	MEALS	$310.33
+AARON, JOSHUA, N	REGIONAL PULMONARY & SLEEP MEDICINE	WEST GROVE, PA	SPEAKING FEES	$4,700.00
+TOTAL	""	""	""	$5,010.33
+AARON, MAUREEN, M	""	MARTINSVILLE, VA	MEALS	$193.67
+TOTAL	""	""	""	$193.67
+AARON, MICHAEL, L	""	WEST ISLIP, NY	MEALS	$19.50
+TOTAL	""	""	""	$19.50
+AARON, MICHAEL, R	""	BROOKLYN, NY	MEALS	$65.92
+TOTAL	""	""	""	$65.92
+AARONS, MARK, G	""	PINEHURST, NC	MEALS	$154.19
+TOTAL	""	""	""	$154.19
+AARONSON, GARY, A	""	PHILADELPHIA, PA	MEALS	$205.17
+TOTAL	""	""	""	$205.17
+AARONSON, ROBERT, M	""	TUCSON, AZ	MEALS	$24.38
+TOTAL	""	""	""	$24.38
+AASHEIM, RICHARD, J	""	GREENEVILLE, TN	EDUCATIONAL ITEMS	$2.27
+AASHEIM, RICHARD, J	""	GREENEVILLE, TN	MEALS	$100.76
+TOTAL	""	""	""	$103.03
+AASMAA, SIRIKE, T	""	MONTVILLE, NJ	MEALS	$53.33
+TOTAL	""	""	""	$53.33
+AAZAMI, HESSAM	""	GRANADA HILLS, CA	MEALS	$402.90
+TOTAL	""	""	""	$402.90
+ABAABA, ABIEDU, C	""	JACKSONVILLE, FL	MEALS	$13.49
+TOTAL	""	""	""	$13.49
+ABABNEH, ALAELDIN, A	""	KANSAS CITY, KS	MEALS	$10.31
+TOTAL	""	""	""	$10.31
+ABAD, ANTONIO, A	""	CORAL SPRINGS, FL	MEALS	$516.29
+TOTAL	""	""	""	$516.29
+ABADEER, PETER, S	""	NORMAL, IL	MEALS	$200.38
+TOTAL	""	""	""	$200.38
+ABAD, ENZO, L	""	MIAMI, FL	MEALS	$67.61
+TOTAL	""	""	""	$67.61
+ABADIAN SHARIFABAD, MANOOCHEHR	""	GRANADA HILLS, CA	MEALS	$12.37
+TOTAL	""	""	""	$12.37
+ABADI, CHRISTOPHER, A	""	WARWICK, RI	MEALS	$157.42
+TOTAL	""	""	""	$157.42
+ABADIE, MARCUS, G	""	ATHENS, TX	MEALS	$361.89
+TOTAL	""	""	""	$361.89
+ABADI, JAMSHEED, S	""	BROOKLYN, NY	MEALS	$363.40
+TOTAL	""	""	""	$363.40
+ABADILLA, JUNE, E	""	JACKSON, KY	MEALS	$105.33
+TOTAL	""	""	""	$105.33
+ABAD, JOHN, P	""	NEWARK, OH	MEALS	$347.64
+TOTAL	""	""	""	$347.64
+ABAD, JOSE, F	""	FOLSOM, CA	MEALS	$30.28
+TOTAL	""	""	""	$30.28
+ABAD, REMEDIOS, D	""	WILNINGTON, DE	MEALS	$26.85
+TOTAL	""	""	""	$26.85
+ABAD, SO KIM, F	""	WICHITA FALLS, TX	MEALS	$136.52
+TOTAL	""	""	""	$136.52
+ABAD, ZOILO, R	""	MIAMI, FL	MEALS	$93.83
+TOTAL	""	""	""	$93.83
+ABALIHI, CAROL, N	""	EL PASO, TX	MEALS	$88.48
+TOTAL	""	""	""	$88.48
+ABALOS, ANNA, T	""	ROSEVILLE, CA	MEALS	$178.60
+TOTAL	""	""	""	$178.60
+ABALOS, ARTURO, Z	""	DELANO, CA	MEALS	$48.06
+TOTAL	""	""	""	$48.06
+ABALOS, JOSEPH, M	""	SENECA, PA	MEALS	$39.03
+TOTAL	""	""	""	$39.03
+ABANDO, JOSE, R	""	DAYTONA BEACH, FL	MEALS	$83.44
+TOTAL	""	""	""	$83.44
+ABANG, ANTHONY, E	""	ELIZABETHTOWN, KY	MEALS	$12.62
+TOTAL	""	""	""	$12.62
+ABAN, KENRIC, T	""	SAN DIEGO, CA	MEALS	$11.91
+TOTAL	""	""	""	$11.91
+ABAQUETA, ALVIN, Y	""	CHARLOTTE, NC	MEALS	$233.71
+TOTAL	""	""	""	$233.71
+ABARCA, SERGIO, O	""	TOOELE, UT	MEALS	$159.58
+TOTAL	""	""	""	$159.58
+ABARIKWU, CONSTANTIA, A	""	PHOENIX, AZ	MEALS	$153.57
+TOTAL	""	""	""	$153.57
+ABASHIDZE, TEAH, A	""	CLEVELAND, OH	MEALS	$153.59
+TOTAL	""	""	""	$153.59

data/test/data/no_tables.pdf ADDED Viewed

Binary file

data/test/data/puertos1.pdf ADDED Viewed

Binary file

data/test/data/spanning_cells.csv ADDED Viewed

@@ -0,0 +1,21 @@
+Improved operation scenario,"","","","",""
+Volume servers in:,2007,2008,2009,2010,2011
+Server closets,"1,505","1,580","1,643","1,673","1,689"
+Server rooms,"1,512","1,586","1,646","1,677","1,693"
+Localized data centers,"1,512","1,586","1,646","1,677","1,693"
+Mid-tier data centers,"1,512","1,586","1,646","1,677","1,693"
+Enterprise-class data centers,"1,512","1,586","1,646","1,677","1,693"
+Best practice scenario,"","","","",""
+Volume servers in:,2007,2008,2009,2010,2011
+Server closets,"1,456","1,439","1,386","1,296","1,326"
+Server rooms,"1,465","1,472","1,427","1,334","1,371"
+Localized data centers,"1,465","1,471","1,426","1,334","1,371"
+Mid-tier data centers,"1,465","1,471","1,426","1,334","1,371"
+Enterprise-class data centers,"1,465","1,471","1,426","1,334","1,371"
+State-of-the-art scenario,"","","","",""
+Volume servers in:,2007,2008,2009,2010,2011
+Server closets,"1,485","1,471","1,424","1,315","1,349"
+Server rooms,"1,495","1,573","1,586","1,424","1,485"
+Localized data centers,"1,495","1,572","1,585","1,424","1,485"
+Mid-tier data centers,"1,495","1,572","1,585","1,424","1,485"
+Enterprise-class data centers,"1,495","1,572","1,585","1,424","1,485"

data/test/data/spanning_cells.pdf ADDED Viewed

Binary file

data/test/data/strongschools.pdf ADDED Viewed

Binary file

data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} RENAMED Viewed

File without changes

data/test/data/vietnam3.pdf ADDED Viewed

Binary file

data/test/heuristic-test-set/original/560015757GV_China.page1.pdf ADDED Viewed

Binary file

data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf ADDED Viewed

Binary file

data/test/heuristic-test-set/original/bo_page24.pdf ADDED Viewed

Binary file

data/test/heuristic-test-set/original/campaign_donors.pdf ADDED Viewed

Binary file

data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf ADDED Viewed

Binary file

data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf ADDED Viewed

Binary file

data/test/heuristic-test-set/spreadsheet/strongschools.pdf ADDED Viewed

Binary file

data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf ADDED Viewed

Binary file

data/test/heuristic.rb ADDED Viewed

@@ -0,0 +1,50 @@
+#a list of filenames and the correct answer
+# no more bs.
+require_relative '../lib/tabula'
+should_use_spreadsheet = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "spreadsheet/*") ).map{|a| [a, true]}
+should_use_original  = Dir.glob( File.join(File.dirname(File.absolute_path(__FILE__)), "heuristic-test-set", "original/*") ).map{|a| [a, false]}
+correct = []
+misclassified_as_original = []
+misclassified_as_spreadsheet = []
+def heuristic(page)
+  page.is_tabular?
+end
+(should_use_spreadsheet + should_use_original) .each do |filename, expected_to_be_tabular|
+  extractor = Tabula::Extraction::ObjectExtractor.new(filename, [1])
+  page = extractor.extract.first
+  page.get_ruling_lines!
+  # puts "#{File.basename(filename)} | #{expected_to_be_tabular}"
+  page_is_tabular = heuristic(page)
+  # puts ""
+  if page_is_tabular && expected_to_be_tabular  || !page_is_tabular && !expected_to_be_tabular
+    correct << filename
+  elsif page_is_tabular && !expected_to_be_tabular
+    misclassified_as_spreadsheet << filename
+  elsif !page_is_tabular && expected_to_be_tabular
+    misclassified_as_original << filename
+  end
+end
+puts "#{correct.size} PDFs were correctly classified"
+puts "#{misclassified_as_original.size + misclassified_as_spreadsheet.size} PDFs were incorrectly classified"
+unless misclassified_as_spreadsheet.empty?
+  puts "#{misclassified_as_spreadsheet.size} PDFs should use the original extraction algorithm\n\t but was classified as needing the spreadsheet algorithm"
+  misclassified_as_spreadsheet.each do |filename|
+    puts " - #{File.basename(filename)}"
+  end
+end
+unless misclassified_as_original.empty?
+  puts "#{misclassified_as_original.size} PDFs should use the spreadsheet extraction algorithm\n\t but was classified as needing the original algorithm"
+  misclassified_as_original.each do |filename|
+    puts " - #{File.basename(filename)}"
+  end
+end

data/test/test_bin_tabula.sh ADDED Viewed

@@ -0,0 +1,7 @@
+bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf --silent -o test.csv
+bin/tabula test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf -o test.csv
+bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv
+bin/tabula test/heuristic-test-set/original/bo_page24.pdf -o test.csv --format TSV
+bin/tabula test/data/campaign_donors.pdf -o test.csv --columns 47,147,256,310,375,431,504 #columns should work
+bin/tabula test/data/argentina_diputados_voting_record.pdf --guess -o test.csv --format TSV #should exclude guff
+bin/tabula test/data/vertical_rulings_bug.pdf --area 250,0,325,1700 -o test.csv --format TSV #should be only a few lines

data/test/tests.rb CHANGED Viewed

@@ -6,10 +6,109 @@ require_relative '../lib/tabula'
 def lines_to_array(lines)
   lines.map { |l|
-    l.map { |te| te.text }
+    l.map { |te| te.text.strip }
   }
 end
+def lines_to_table(lines)
+  Tabula::Table.new_from_array(lines_to_array(lines))
+end
+# I don't want to pollute the "real" class with a funny inspect method. Just for testing comparisons.
+module Tabula
+  class Table
+    def inspect
+      "[" + lines.map(&:inspect).join(",") + "]"
+    end
+  end
+end
+module Tabula
+  class Line
+    def inspect
+      @text_elements.map(&:text).inspect
+    end
+  end
+end
+class TestEntityComparability < Minitest::Test
+  def test_text_element_comparability
+    base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
+    two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy  \n", nil)
+    three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
+    four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
+    five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
+    six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy    kj", 55)
+    seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy    kj", nil)
+    assert_equal base, two
+    assert_equal base, three
+    assert_equal base, four
+    refute_equal base, five
+    refute_equal base, six
+    refute_equal base, seven
+  end
+  def test_line_comparability
+    text_base = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "Jeremy", nil)
+    text_two = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, " Jeremy  \n", nil)
+    text_three = Tabula::TextElement.new(7, 6, 8, 6, nil, 12, "Jeremy", 88)
+    text_four = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "Jeremy", 55)
+    text_five = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy b", 55)
+    text_six = Tabula::TextElement.new(5, 7, 1212, 121, 66, 15, "jeremy    kj", 55)
+    text_seven = Tabula::TextElement.new(nil, nil, nil, nil, nil, nil, "jeremy    kj", nil)
+    line_base = Tabula::Line.new
+    line_base.text_elements = [text_base, text_two, text_three]
+    line_equal = Tabula::Line.new
+    line_equal.text_elements = [text_base, text_two, text_three]
+    line_equal_but_longer = Tabula::Line.new
+    line_equal_but_longer.text_elements = [text_base, text_two, text_three, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
+    line_unequal = Tabula::Line.new
+    line_unequal.text_elements = [text_base, text_two, text_three, text_five]
+    line_unequal_and_longer = Tabula::Line.new
+    line_unequal_and_longer.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, Tabula::TextElement::EMPTY]
+    line_unequal_and_longer_and_different = Tabula::Line.new
+    line_unequal_and_longer_and_different.text_elements = [text_base, text_two, text_three, text_five, Tabula::TextElement::EMPTY, 'whatever']
+    assert_equal line_base, line_equal
+    assert_equal line_base, line_equal_but_longer
+    refute_equal line_base, line_unequal
+    refute_equal line_base, line_unequal_and_longer
+    refute_equal line_base, line_unequal_and_longer_and_different
+  end
+  def test_table_comparability
+    rows_base = [["a", "b", "c"], ['', 'd', '']]
+    rows_equal = [["a", "b", "c"], ['', 'd']]
+    rows_equal_padded = [['', "a", "b", "c"], ['', '', 'd']]
+    rows_unequal_one = [["a", "b", "c"], ['d']]
+    rows_unequal_two = [["a", "b", "c"], ['d', '']]
+    rows_unequal_three = [["a", "b", "c"], ['d'], ['a','b', 'd']]
+    rows_unequal_four = [["a", "b", "c"]]
+    table_base = Tabula::Table.new_from_array(rows_base)
+    table_equal = Tabula::Table.new_from_array(rows_equal)
+    table_equal_column_padded = Tabula::Table.new_from_array(rows_equal_padded)
+    table_unequal_one = Tabula::Table.new_from_array(rows_unequal_one)
+    table_unequal_two = Tabula::Table.new_from_array(rows_unequal_two)
+    table_unequal_three = Tabula::Table.new_from_array(rows_unequal_three)
+    table_unequal_four = Tabula::Table.new_from_array(rows_unequal_four)
+    assert_equal table_base, table_equal
+    assert_equal table_base, table_equal_column_padded
+    refute_equal table_base, table_unequal_one
+    refute_equal table_base, table_unequal_two
+    refute_equal table_base, table_unequal_three
+    refute_equal table_base, table_unequal_four
+  end
+end
 class TestPagesInfoExtractor < Minitest::Test
   def test_pages_info_extractor
     extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
@@ -24,42 +123,74 @@ class TestPagesInfoExtractor < Minitest::Test
 end
 class TestTableGuesser < Minitest::Test
+  def test_find_rects_from_lines_with_lsd
+    skip "Skipping until we actually use LSD"
+    filename = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
+    page_index = 0
+    lines = Tabula::Extraction::LineExtractor.lines_in_pdf_page(filename, page_index, :render_pdf => true)
+    page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
+    page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
+    expected_page_areas = [[54.087890625, 50.203125, 734.220703125, 550.44140625]]
+    assert_equal expected_page_areas, page_areas
+  end
 end
 class TestDumper < Minitest::Test
   def test_extractor
-    extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
-    page = extractor.extract.first
+    extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
+    page = extractor.extract.next
     assert_instance_of Tabula::Page, page
   end
   def test_get_by_area
-#    http://localhost:8080/debug/418b1d5698e5c7b724551d9610c071ab3063275c/characters?x1=57.921428571428564&x2=290.7&y1=107.1&y2=394.52142857142854&page=1&use_lines=false
-    extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
+    extractor = Tabula::Extraction::ObjectExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
     characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
     assert_equal characters.size, 206
   end
 end
+class TestRulingIntersection < Minitest::Test
+  def test_ruling_intersection
+    horizontals = [Tabula::Ruling.new(10, 1, 10, 0)]
+    verticals   = [Tabula::Ruling.new(1, 3, 0, 11),
+                   Tabula::Ruling.new(1, 4, 0, 11)]
+    ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
+    assert_equal 2, ints.size
+    assert_equal ints[0][0].getX, 3.0
+    assert_equal ints[0][0].getY, 10.0
+    assert_equal ints[1][0].getX, 4.0
+    assert_equal ints[1][0].getY, 10.0
+    verticals =   [Tabula::Ruling.new(20, 3, 0, 11)]
+    ints = Tabula::Ruling.find_intersections(horizontals, verticals).to_a
+    assert_equal ints.size, 0
+  end
+end
 class TestExtractor < Minitest::Test
   def test_table_extraction_1
-    character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
-    characters = character_extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
-    table = lines_to_array Tabula.make_table(characters)
-    expected = [["Prior Scale ", "New Scale ", "% Rank* "], ["800 ", "170 ", "99 "], ["790 ", "170 ", "99 "], ["780 ", "170 ", "99 "], ["770 ", "170 ", "99 "], ["760 ", "170 ", "99 "], ["750 ", "169 ", "99 "], ["740 ", "169 ", "99 "], ["730 ", "168 ", "98 "], ["720 ", "168 ", "98 "], ["710 ", "167 ", "97 "], ["700 ", "166 ", "96 "], ["690 ", "165 ", "95 "], ["680 ", "165 ", "95 "], ["670 ", "164 ", "93 "], ["660 ", "164 ", "93 "], ["650 ", "163 ", "91 "]]
+    table = lines_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
+                                                1,
+                                                [107.1, 57.9214, 394.5214, 290.7],
+                                                :detect_ruling_lines => false)
+    expected = [["Prior Scale","New Scale","% Rank*"], ["800","170","99"], ["790","170","99"], ["780","170","99"], ["770","170","99"], ["760","170","99"], ["750","169","99"], ["740","169","99"], ["730","168","98"], ["720","168","98"], ["710","167","97"], ["700","166","96"], ["690","165","95"], ["680","165","95"], ["670","164","93"], ["660","164","93"], ["650","163","91"]]
     assert_equal expected, table
   end
   def test_diputados_voting_record
-    character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
-    characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
+    table = lines_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
+                                                1,
+                                                [269.875, 12.75, 790.5, 561])
     expected = [["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
-    assert_equal expected, lines_to_array(Tabula.make_table(characters))
+    assert_equal expected, table
   end
   def test_forest_disclosure_report_dont_regress
@@ -67,80 +198,362 @@ class TestExtractor < Minitest::Test
     # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
     # and a solution for half-x-height-offset lines.
     pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
-    character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
-    lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
-    vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
+    table = lines_to_table Tabula.extract_table(pdf_file_path,
+                                                1,
+                                                [106.01, 48.09, 227.31, 551.89],
+                                                :detect_ruling_lines => true)
-    characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
-                                                           #top left bottom right
-    expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
-                ['TOTAL', '', '', '','$85.00'],
-                ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
-                ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
-                ['TOTAL', '', '', '', '$471.25'],
-                ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
-                ['TOTAL', '', '', '','$20.39'],
-                ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
-                ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
-                ['TOTAL', '', '', '', '$5,010.33'],
-                ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
-                ['TOTAL', '', '', '', '$193.67'],
-                ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
-    assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
+    expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
+    assert_equal expected, table
   end
   def test_missing_spaces_around_an_ampersand
     pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
-    character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
-    lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
-    vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
+    character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
+    page_obj = character_extractor.extract.next
+    lines = page_obj.ruling_lines
+    vertical_rulings = lines.select(&:vertical?)
+    area = [170, 28, 185, 833] #top left bottom right
-    characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
-                                                           #top left bottom right
-    expected = [
-                 ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
-                ]
+    expected = Tabula::Table.new_from_array([
+       ["", "REGIONAL PULMONARY & SLEEP",],
+       ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"],
+       ["", "MEDICINE", ],
+      ])
-    assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
+    assert_equal expected, lines_to_table(page_obj.get_area(area).make_table(:vertical_rulings => vertical_rulings))
   end
   def test_forest_disclosure_report
     skip "Skipping until we support multiline cells"
     pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
-    character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
+    character_extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path)
     lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
-    vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
+    vertical_rulings = lines.select(&:vertical?) #.uniq{|line| (line.left / 10).round }
-    characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
+    page_obj = character_extractor.extract.next
+    characters = page_obj.get_text([110, 28, 218, 833])
                                                            #top left bottom right
-    expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
-                ['TOTAL', '', '', '','$85.00'],
-                ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
-                ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
-                ['TOTAL', '', '', '', '$471.25'],
-                ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
-                ['TOTAL', '', '', '','$20.39'],
-                ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
-                ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '$4,700.00'],
-                ['TOTAL', '', '', '', '$5,010.33'],
-                ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
-                ['TOTAL', '', '', '', '$193.67'],
-                ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
-    assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
+        expected = Tabula::Table.new_from_array([
+          ['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '', '$85.00'],
+          ['TOTAL', '', '', '','$85.00'],
+          ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '', '$78.80'],
+          ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '', '$392.45'],
+          ['TOTAL', '', '', '', '$471.25'],
+          ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '', '$20.39'],
+          ['TOTAL', '', '', '','$20.39'],
+          ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '', '$310.33'],
+          ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '', '$4,700.00'],
+          ['TOTAL', '', '', '', '$5,010.33'],
+          ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '', '$193.67'],
+          ['TOTAL', '', '', '', '$193.67'],
+          ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '', '$19.50']
+        ])
+    assert_equal expected, lines_to_table(Tabula.make_table(characters, :vertical_rulings => vertical_rulings))
   end
   # TODO Spaces inserted in words - fails
   def test_bo_page24
-    character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
-    characters = character_extractor.extract.next.get_text([435.625, 53.125, 585.7142857142857, 810.5357142857142])
+    table = lines_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
+                                                1,
+                                                [425.625, 53.125, 575.714, 810.535],
+                                                :detect_ruling_lines => false)
     expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B. MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
-    assert_equal expected, lines_to_array(Tabula.make_table(characters))
+    assert_equal expected, table
+  end
+  def test_vertical_rulings_splitting_words
+    # When a vertical ruling crosses over a word, the word must be split at that
+    # ruling; before, the entire word would end up on one side of the ruling.
+    pdf_file_path = File.expand_path('data/vertical_rulings_bug.pdf', File.dirname(__FILE__))
+    # N.B. it's "MORGANTOWN", "WV" that we're most interested in here
+    # (it used to show up as ["MORGANTOWNWV", "", ""]).
+    # An equally "correct" result, depending on how multi-line cells are handled,
+    # would carry "TIMOTHY GARTH" in the last record's fourth column and add a
+    # further row ["", "", "", "ABRAHAMSON"]. Only the form below is asserted.
+    expected = Tabula::Table.new_from_array([
+      ["ABRAHAMS, HARRISON M", "ARLINGTON", "TX", "HARRISON M ABRAHAMS", "", "", "$3.08", "", "", "$3.08"],
+      ["ABRAHAMS, ROGER A", "MORGANTOWN", "WV", "ROGER A ABRAHAMS", "", "$1500.00", "$76.28", "$49.95", "", "$1626.23"],
+      ["ABRAHAMSON, TIMOTHY GARTH", "URBANDALE", "IA", "TIMOTHY GARTH ABRAHAMSON", "", "", "$22.93", "", "", "$22.93"]
+     ])
+    area = [250, 0, 325, 1700] # top, left, bottom, right
+    column_boundaries = [0, 360, 506, 617, 906, 1034, 1160, 1290, 1418, 1548]
+    extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, 1...2)
+    extractor.extract.each do |pdf_page|
+      # The boundaries are rescaled by page width / 1700 (presumably they were
+      # measured on a 1700-unit-wide rendering -- confirm against the fixture).
+      scale_factor = pdf_page.width / 1700
+      vertical_rulings = column_boundaries.map { |n| Tabula::Ruling.new(0, n * scale_factor, 0, 1000) }
+      table = pdf_page.get_area(area).make_table(:vertical_rulings => vertical_rulings)
+      assert_equal expected, lines_to_table(table)
+    end
+  end
+  def test_vertical_rulings_prevent_merging_of_columns
+    # Extracts the donors table with caller-supplied vertical rulings and checks
+    # that every row comes back with its seven columns kept separate.
+    donors_pdf = File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__))
+    vertical_rulings = [47,147,256,310,375,431,504].map do |n|
+      Tabula::Ruling.new(0, n, 0, 1000)
+    end
+    expected = [
+      ["SZARANGOWICZ", "GUSTAVO ALEJANDRO", "25.096.244", "20-25096244-5", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
+      ["TAILHADE", "LUIS RODOLFO", "21.386.299", "20-21386299-6", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
+      ["TEDESCHI", "ADRIÁN ALBERTO", "24.171.507", "20-24171507-9", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
+      ["URRIZA", "MARÍA TERESA", "18.135.604", "27-18135604-4", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
+      ["USTARROZ", "GERÓNIMO JAVIER", "24.912.947", "20-24912947-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
+      ["VALSANGIACOMO BLANC", "OFERNANDO JORGE", "26.800.203", "20-26800203-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
+      ["VICENTE", "PABLO ARIEL", "21.897.586", "20-21897586-1", "09/10/2013", "EFECTIVO", "$ 10.000,00"],
+      ["AMBURI", "HUGO ALBERTO", "14.096.560", "20-14096560-0", "09/10/2013", "EFECTIVO", "$ 20.000,00"],
+      ["BERRA", "CLAUDIA SUSANA", "14.433.112", "27-14433112-0", "09/10/2013", "EFECTIVO", "$ 10.000,00"]
+    ]
+    extracted = Tabula.extract_table(donors_pdf,
+                                     1,
+                                     [255.57,40.43,398.76,557.35],
+                                     :vertical_rulings => vertical_rulings)
+    assert_equal expected, lines_to_array(extracted)
+  end
+  def test_get_spacing_and_merging_right
+    # Extracts the header and first six data rows of the strongschools fixture
+    # with ruling-line detection enabled and compares them cell by cell.
+    schools_pdf = File.expand_path('data/strongschools.pdf', File.dirname(__FILE__))
+    area = [52.32857142857143,15.557142857142859,128.70000000000002,767.9571428571429]
+    extracted = Tabula.extract_table(schools_pdf, 1, area, :detect_ruling_lines => true)
+    expected = [
+      ["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"],
+      ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"],
+      ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"],
+      ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"],
+      ["Schumacher / Bales", "Douglas L. / Patricia ", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"],
+      ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"],
+      ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]
+    ]
+    assert_equal expected, lines_to_array(extracted)
+  end
+  # Minimal host for the Tabula::HasCells mixin: it is handed a ready-made list
+  # of cells, stores it unchanged and exposes it through #cells.
+  class SpreadsheetsHasCellsTester
+    include Tabula::HasCells
+    attr_accessor :cells
+    # cells - the list kept as this object's cells.
+    def initialize(cells)
+      self.cells = cells
+    end
+  end
+  # Exercises only the cells -> spreadsheets grouping algorithm (no PDF involved).
+  def test_cells_to_spreadsheets
+    # One tuple per cell, in the original order; each tuple is splatted straight
+    # into Tabula::Cell.new (presumably top, left, width, height -- confirm).
+    cell_args = [
+      [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0],
+      [50.0, 18.0, 52.0, 4.0], [54.0, 18.0, 52.0, 6.0],
+      [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
+      [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0],
+      [90.0, 18.0, 52.0, 4.0], [94.0, 18.0, 52.0, 6.0],
+      [100.0, 18.0, 52.0, 28.0], [128.0, 18.0, 52.0, 4.0],
+      [132.0, 18.0, 52.0, 64.0], [196.0, 18.0, 52.0, 66.0],
+      [262.0, 18.0, 52.0, 4.0], [266.0, 18.0, 52.0, 84.0],
+      [350.0, 18.0, 52.0, 4.0], [354.0, 18.0, 52.0, 32.0],
+      [386.0, 18.0, 52.0, 38.0], [424.0, 18.0, 52.0, 18.0],
+      [442.0, 18.0, 52.0, 74.0], [516.0, 18.0, 52.0, 28.0],
+      [544.0, 18.0, 52.0, 4.0], [44.0, 70.0, 156.0, 6.0],
+      [50.0, 70.0, 156.0, 4.0], [54.0, 70.0, 156.0, 6.0],
+      [60.0, 70.0, 156.0, 4.0], [64.0, 70.0, 156.0, 6.0],
+      [70.0, 70.0, 156.0, 4.0], [74.0, 70.0, 156.0, 6.0],
+      [84.0, 70.0, 2.0, 6.0], [90.0, 70.0, 156.0, 4.0],
+      [94.0, 70.0, 156.0, 6.0], [100.0, 70.0, 156.0, 28.0],
+      [128.0, 70.0, 156.0, 4.0], [132.0, 70.0, 156.0, 64.0],
+      [196.0, 70.0, 156.0, 66.0], [262.0, 70.0, 156.0, 4.0],
+      [266.0, 70.0, 156.0, 84.0], [350.0, 70.0, 156.0, 4.0],
+      [354.0, 70.0, 156.0, 32.0], [386.0, 70.0, 156.0, 38.0],
+      [424.0, 70.0, 156.0, 18.0], [442.0, 70.0, 156.0, 74.0],
+      [516.0, 70.0, 156.0, 28.0], [544.0, 70.0, 156.0, 4.0],
+      [84.0, 72.0, 446.0, 6.0], [90.0, 226.0, 176.0, 4.0],
+      [94.0, 226.0, 176.0, 6.0], [100.0, 226.0, 176.0, 28.0],
+      [128.0, 226.0, 176.0, 4.0], [132.0, 226.0, 176.0, 64.0],
+      [196.0, 226.0, 176.0, 66.0], [262.0, 226.0, 176.0, 4.0],
+      [266.0, 226.0, 176.0, 84.0], [350.0, 226.0, 176.0, 4.0],
+      [354.0, 226.0, 176.0, 32.0], [386.0, 226.0, 176.0, 38.0],
+      [424.0, 226.0, 176.0, 18.0], [442.0, 226.0, 176.0, 74.0],
+      [516.0, 226.0, 176.0, 28.0], [544.0, 226.0, 176.0, 4.0],
+      [90.0, 402.0, 116.0, 4.0], [94.0, 402.0, 116.0, 6.0],
+      [100.0, 402.0, 116.0, 28.0], [128.0, 402.0, 116.0, 4.0],
+      [132.0, 402.0, 116.0, 64.0], [196.0, 402.0, 116.0, 66.0],
+      [262.0, 402.0, 116.0, 4.0], [266.0, 402.0, 116.0, 84.0],
+      [350.0, 402.0, 116.0, 4.0], [354.0, 402.0, 116.0, 32.0],
+      [386.0, 402.0, 116.0, 38.0], [424.0, 402.0, 116.0, 18.0],
+      [442.0, 402.0, 116.0, 74.0], [516.0, 402.0, 116.0, 28.0],
+      [544.0, 402.0, 116.0, 4.0], [84.0, 518.0, 246.0, 6.0],
+      [90.0, 518.0, 186.0, 4.0], [94.0, 518.0, 186.0, 6.0],
+      [100.0, 518.0, 186.0, 28.0], [128.0, 518.0, 186.0, 4.0],
+      [132.0, 518.0, 186.0, 64.0], [196.0, 518.0, 186.0, 66.0],
+      [262.0, 518.0, 186.0, 4.0], [266.0, 518.0, 186.0, 84.0],
+      [350.0, 518.0, 186.0, 4.0], [354.0, 518.0, 186.0, 32.0],
+      [386.0, 518.0, 186.0, 38.0], [424.0, 518.0, 186.0, 18.0],
+      [442.0, 518.0, 186.0, 74.0], [516.0, 518.0, 186.0, 28.0],
+      [544.0, 518.0, 186.0, 4.0], [90.0, 704.0, 60.0, 4.0],
+      [94.0, 704.0, 60.0, 6.0], [100.0, 704.0, 60.0, 28.0],
+      [128.0, 704.0, 60.0, 4.0], [132.0, 704.0, 60.0, 64.0],
+      [196.0, 704.0, 60.0, 66.0], [262.0, 704.0, 60.0, 4.0],
+      [266.0, 704.0, 60.0, 84.0], [350.0, 704.0, 60.0, 4.0],
+      [354.0, 704.0, 60.0, 32.0], [386.0, 704.0, 60.0, 38.0],
+      [424.0, 704.0, 60.0, 18.0], [442.0, 704.0, 60.0, 74.0],
+      [516.0, 704.0, 60.0, 28.0], [544.0, 704.0, 60.0, 4.0],
+      [84.0, 764.0, 216.0, 6.0], [90.0, 764.0, 216.0, 4.0],
+      [94.0, 764.0, 216.0, 6.0], [100.0, 764.0, 216.0, 28.0],
+      [128.0, 764.0, 216.0, 4.0], [132.0, 764.0, 216.0, 64.0],
+      [196.0, 764.0, 216.0, 66.0], [262.0, 764.0, 216.0, 4.0],
+      [266.0, 764.0, 216.0, 84.0], [350.0, 764.0, 216.0, 4.0],
+      [354.0, 764.0, 216.0, 32.0], [386.0, 764.0, 216.0, 38.0],
+      [424.0, 764.0, 216.0, 18.0], [442.0, 764.0, 216.0, 74.0],
+      [516.0, 764.0, 216.0, 28.0], [544.0, 764.0, 216.0, 4.0]
+    ]
+    cells = cell_args.map { |args| Tabula::Cell.new(*args) }
+    expected_spreadsheets = [Tabula::Spreadsheet.new(40.0, 18.0, 208.0, 40.0, nil, nil, nil, nil),
+                             Tabula::Spreadsheet.new(84.0, 18.0, 962.0, 464.0,nil, nil, nil, nil)]
+    # Spreadsheets are compared on their area only.
+    bounds_of = lambda { |rect| [rect.x, rect.y, rect.width, rect.height] }
+    found = SpreadsheetsHasCellsTester.new(cells).find_spreadsheets_from_cells
+    assert_equal expected_spreadsheets.map(&bounds_of),
+      found.map { |a| bounds_of.call(a.getBounds) }
+  end
+  # Placeholder: reported as skipped until the test body exists.
+  def test_add_spanning_cells
+    skip("until I write it")
+  end
+  # Placeholder: reported as skipped until the test body exists.
+  def test_add_placeholder_cells_to_funny_shaped_tables
+    skip("until I write it, cf 01005787B_Pakistan.pdf")
+  end
+  # Minimal host for the Tabula::HasCells mixin: starts with an empty cell list,
+  # stores the given ruling lines and immediately runs find_cells!.
+  class CellsHasCellsTester
+    include Tabula::HasCells
+    attr_accessor :vertical_ruling_lines, :horizontal_ruling_lines, :cells
+    def initialize(vertical_ruling_lines, horizontal_ruling_lines)
+      self.cells = []
+      self.vertical_ruling_lines = vertical_ruling_lines
+      self.horizontal_ruling_lines = horizontal_ruling_lines
+      # Runs last, once the cell list and both ruling lists are in place.
+      find_cells!
+    end
+  end
+  # Exercises only the ruling lines -> cells algorithm (no PDF involved).
+  def test_lines_to_cells
+    vertical_ruling_lines = [
+      [40.0, 18.0, 0.0, 40.0],
+      [44.0, 70.0, 0.0, 36.0],
+      [40.0, 226.0, 0.0, 40.0]
+    ].map { |args| Tabula::Ruling.new(*args) }
+    # The horizontal rulings differ only in their first argument.
+    horizontal_ruling_lines = [40.0, 44.0, 50.0, 54.0, 60.0, 64.0, 70.0, 74.0, 80.0].map do |first_arg|
+      Tabula::Ruling.new(first_arg, 18.0, 208.0, 0.0)
+    end
+    expected_cells = [
+      [40.0, 18.0, 208.0, 4.0], [44.0, 18.0, 52.0, 6.0],
+      [50.0, 18.0, 52.0, 4.0], [54.0, 18.0, 52.0, 6.0],
+      [60.0, 18.0, 52.0, 4.0], [64.0, 18.0, 52.0, 6.0],
+      [70.0, 18.0, 52.0, 4.0], [74.0, 18.0, 52.0, 6.0],
+      [44.0, 70.0, 156.0, 6.0], [50.0, 70.0, 156.0, 4.0],
+      [54.0, 70.0, 156.0, 6.0], [60.0, 70.0, 156.0, 4.0],
+      [64.0, 70.0, 156.0, 6.0], [70.0, 70.0, 156.0, 4.0],
+      [74.0, 70.0, 156.0, 6.0]
+    ].map { |args| Tabula::Cell.new(*args) }
+    actual_cells = CellsHasCellsTester.new(vertical_ruling_lines, horizontal_ruling_lines).cells
+    # Compared as sets because the order of the cells does not matter.
+    assert_equal expected_cells.to_set, actual_cells.to_set
   end
+  # This is the real deal: the first spreadsheet found on each extracted page
+  # must serialize to exactly the TSV stored alongside the PDF fixture.
+  def test_extract_tabular_data_using_lines_and_spreadsheets
+    # Fixtures are resolved relative to this file, as the other tests do, so the
+    # result no longer depends on the directory the suite is started from.
+    pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
+    expected_data_path = File.expand_path('data/frx_2012_disclosure.tsv', File.dirname(__FILE__))
+    # File.read closes the file itself; open(...).read left the handle open.
+    expected = File.read(expected_data_path)
+    Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all).extract.each do |pdf_page|
+      spreadsheet = pdf_page.spreadsheets.first
+      assert_equal expected, spreadsheet.to_tsv
+    end
+  end
+  # A page without tables must yield an empty spreadsheet list.
+  def test_cope_with_a_tableless_page
+    # Resolved relative to this file, as the other tests do, so the test does not
+    # depend on the current working directory.
+    pdf_file_path = File.expand_path('data/no_tables.pdf', File.dirname(__FILE__))
+    # Accepts a line only when every one of its colour components is below 0.1.
+    line_color_filter = lambda { |components| components.all? { |c| c < 0.1 } }
+    extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all, '',
+                                                        :line_color_filter => line_color_filter)
+    spreadsheets = extractor.extract.to_a.first.spreadsheets
+    assert_equal 0, spreadsheets.size
+  end
+  # The first spreadsheet on page [1] of the spanning-cells fixture must
+  # serialize to exactly the CSV stored alongside it.
+  def test_spanning_cells
+    # Fixtures are resolved relative to this file, as the other tests do, so the
+    # result no longer depends on the directory the suite is started from.
+    pdf_file_path = File.expand_path('data/spanning_cells.pdf', File.dirname(__FILE__))
+    expected_data_path = File.expand_path('data/spanning_cells.csv', File.dirname(__FILE__))
+    # File.read closes the file itself; open(...).read left the handle open.
+    expected = File.read(expected_data_path)
+    Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
+      spreadsheet = pdf_page.spreadsheets.first
+      assert_equal expected, spreadsheet.to_csv
+    end
+  end
+  # Crops the page's ruling lines to a fixed area and counts how many of the
+  # remaining ones report themselves as vertical.
+  def test_almost_vertical_lines
+    # Resolved relative to this file, as the other tests do, so the test does not
+    # depend on the current working directory.
+    pdf_file_path = File.expand_path('data/puertos1.pdf', File.dirname(__FILE__))
+    top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
+    # ZoneEntity is built from top, left, width, height.
+    area = Tabula::ZoneEntity.new(top, left,
+                                  right - left, bottom - top)
+    Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
+      rulings = Tabula::Ruling.crop_rulings_to_area(pdf_page.ruling_lines, area)
+      # TODO assertion not entirely correct, should do the trick for now
+      assert_equal 15, rulings.count(&:vertical?)
+    end
+  end
+  # Restricts the page to a fixed area, takes the first spreadsheet found there
+  # and checks its row count plus the contents of its first and last rows.
+  def test_extract_spreadsheet_within_an_area
+    # Resolved relative to this file, as the other tests do, so the test does not
+    # depend on the current working directory.
+    pdf_file_path = File.expand_path('data/puertos1.pdf', File.dirname(__FILE__))
+    top, left, bottom, right = 273.9035714285714, 30.32142857142857, 554.8821428571429, 546.7964285714286
+    Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
+      area = pdf_page.get_area([top, left, bottom, right])
+      table = area.spreadsheets.first.to_a
+      assert_equal 15, table.length
+      assert_equal ["", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM", "M.U$S", "TM"], table.first
+      assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
+    end
+  end
+end
+# Runs the is_tabular? heuristic on page [1] of fixture PDFs that are, or are
+# not, expected to be recognised as spreadsheets.
+class TestIsTabularHeuristic < Minitest::Test
+  EXPECTED_TO_BE_SPREADSHEET = ['47008204D_USA.page4.pdf', 'GSK_2012_Q4.page437.pdf', 'strongschools.pdf', 'tabla_subsidios.pdf'].freeze
+  NOT_EXPECTED_TO_BE_SPREADSHEET = ['560015757GV_China.page1.pdf', 'S2MNCEbirdisland.pdf', 'bo_page24.pdf', 'campaign_donors.pdf'].freeze
+  def test_heuristic_detects_spreadsheets
+    EXPECTED_TO_BE_SPREADSHEET.each do |f|
+      # The message names the fixture, so a failure inside the loop is traceable.
+      assert tabular_fixture?(f), "expected #{f} to be detected as tabular"
+    end
+  end
+  def test_heuristic_detects_non_spreadsheets
+    NOT_EXPECTED_TO_BE_SPREADSHEET.each do |f|
+      refute tabular_fixture?(f), "expected #{f} not to be detected as tabular"
+    end
+  end
+  private
+  # Extracts page [1] of the named fixture (looked up in the data directory next
+  # to this file), collects its ruling lines and returns what is_tabular? reports.
+  def tabular_fixture?(file_name)
+    path = File.expand_path('data/' + file_name, File.dirname(__FILE__))
+    extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
+    page = extractor.extract.first
+    page.get_ruling_lines!
+    page.is_tabular?
+  end
 end