RubyGems - tabula-extractor - Versions diffs - 0.7.0-java → 0.7.1-java - Mend

tabula-extractor 0.7.0-java → 0.7.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml +4 -4
data/bin/tabula +5 -1
data/lib/tabula/entities/cell.rb +11 -0
data/lib/tabula/entities/has_cells.rb +0 -1
data/lib/tabula/entities/page.rb +7 -8
data/lib/tabula/entities/spreadsheet.rb +18 -0
data/lib/tabula/entities/table.rb +20 -5
data/lib/tabula/extraction.rb +7 -1
data/lib/tabula/table_extractor.rb +39 -19
data/lib/tabula/version.rb +1 -1
data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
data/test/tests.rb +17 -17
metadata +5 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: fa69052647e565cd996f92f1c73e6d00deceea54
-  data.tar.gz: 7d76ccc9b445e9138f65920cd0a401fbee3ababf
+  metadata.gz: fea5751c9b78705fbfda50be1e29011d40249b04
+  data.tar.gz: 2d7bb156a073636467b34e44b3f8b6a364920354
 SHA512:
-  metadata.gz: d17ad7e967407711d60d9b30e8d231ab9adef313b8ede84e1960e35f24c5374f472a500e7f7148cc485408f796f1d7a67e96393d0fc5331e0f3f88971dad76c4
-  data.tar.gz: 84121390715280e86fb3ff1dd82533863dfabbb0b27582fb0d3bf9ad088f8d05e4c8f7dfbcfd2f621c232accd6145968e263a4e0e081431cf941939974999b20
+  metadata.gz: d8d1f1932397b599b3d724bb4e8ca5fd73f516e198a391625031ab83f8e5f0a2743d51dcc02953ee5a53c0664330c8e8e1f5309f72493e9f2e232068b06085a1
+  data.tar.gz: e292d6320db6a42799418b6651f63a88446f85c6787659597e77b9f7c38e698b69a10c78edbdeea49baedc0d92f77602b847aef90ab1f8f2ab10356493dbb397

data/bin/tabula CHANGED Viewed

@@ -6,6 +6,10 @@ require_relative '../lib/tabula'
 FORMATS = ['CSV', 'TSV', 'HTML', 'JSON']
 def parse_pages_arg(pages_arg)
+  if(pages_arg == 'all')
+    return :all
+  end
   ranges = pages_arg.split(',').map(&:strip)
   pages = []
   ranges.each do |range|
@@ -32,7 +36,7 @@ Usage:
 where [options] are:
 EOS
-    opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
+    opt :pages, 'Comma separated list of ranges, or all. Examples: --pages 1-3,5-7, --pages 3 or --pages all. Default is --pages 1', :default => '1', :type => String
     opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
     opt :columns, 'X coordinates of column boundaries. Example --columns 10.1,20.2,30.3', :default => nil, :type => String
     opt :password, 'Password to decrypt document. Default is empty', :default => ''

data/lib/tabula/entities/cell.rb CHANGED Viewed

@@ -38,5 +38,16 @@ module Tabula
       end
       output.strip
     end
+    def to_json(*a)
+      {
+        'json_class'   => self.class.name,
+        'text' => text,
+        'top' => top,
+        'left' => left,
+        'width' => width,
+        'height' => height
+      }.to_json(*a)
+    end
   end
 end

data/lib/tabula/entities/has_cells.rb CHANGED Viewed

@@ -6,7 +6,6 @@ module Tabula
   # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors; ruling_lines reader
   module HasCells
-    IS_TABULAR_HEURISTIC_RATIO = 0.8
     ANOTHER_MAGIC_NUMBER = 0.75
     def is_tabular?

data/lib/tabula/entities/page.rb CHANGED Viewed

@@ -83,11 +83,7 @@ module Tabula
     #for API backwards-compatibility reasons, this returns an array of arrays.
     def make_table(options={})
-      get_table(options).lines.map do |l|
-        l.text_elements.map! do |te|
-          te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
-        end
-      end.sort_by { |l| l.map { |te| te.top or 0 }.max }
+      get_table(options).rows
     end
     # returns the Spreadsheets; creating them if they're not memoized
@@ -241,13 +237,17 @@ module Tabula
       end
       lines_to_points.each do |l, p1_p2|
-        l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0],
- p1_p2[1]
+        l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
       end
     end
     def collapse_oriented_rulings(lines)
       # lines must all be of one orientation (i.e. horizontal, vertical)
+      if lines.empty?
+        return []
+      end
       lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
       lines = lines.inject([lines.shift]) do |memo, next_line|
@@ -262,7 +262,6 @@ module Tabula
           memo << next_line
         end
       end
-      lines
     end
   end

data/lib/tabula/entities/spreadsheet.rb CHANGED Viewed

@@ -1,9 +1,13 @@
 module Tabula
   # a counterpart of Table, to be sure.
   # not sure yet what their relationship ought to be.
+  # the both should implement `cells`, `rows`, `cols`, `extraction_method`
   class Spreadsheet < ZoneEntity
     include Tabula::HasCells
     attr_accessor :cells, :vertical_ruling_lines, :horizontal_ruling_lines, :cells_resolved
+    attr_reader :extraction_method, :page
     def initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) #, lines)
       super(top, left, width, height)
@@ -11,6 +15,7 @@ module Tabula
       @page = page
       @vertical_ruling_lines = vertical_ruling_lines
       @horizontal_ruling_lines = horizontal_ruling_lines
+      @extraction_method = "spreadsheet"
     end
     def ruling_lines
@@ -88,5 +93,18 @@ module Tabula
       Tabula::Writers.TSV(rows, out)
       out.string
     end
+    def to_json(*a)
+      {
+        'json_class'   => self.class.name,
+        'extraction_method' => @extraction_method,
+        'data' => rows,
+      }.to_json(*a)
+    end
+    def +(other)
+      raise ArgumentError unless other.page == @page
+      Spreadsheet.new(nil, nil, nil, nil, @page, @cells + other.cells, nil, nil )
+    end
   end
 end

data/lib/tabula/entities/table.rb CHANGED Viewed

@@ -1,9 +1,11 @@
 module Tabula
   class Table
-    attr_reader :lines
+    attr_reader :extraction_method
+    attr_accessor :lines
     def initialize(line_count, separators)
       @separators = separators
       @lines = (0...line_count).inject([]) { |m| m << Line.new }
+      @extraction_method = "original"
     end
     def add_text_element(text_element, i, j)
@@ -28,22 +30,27 @@ module Tabula
     end
     def cols
-      self.rpad!
-      lines.map(&:text_elements).transpose
+      rows.transpose
     end
     def rows
       self.rpad!
-      lines.map(&:text_elements)
+      lines.map do |l|
+        l.text_elements.map! do |te|
+          te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
+        end
+      end.sort_by { |l| l.map { |te| te.top || 0 }.max }
     end
     # create a new Table object from an array of arrays, representing a list of rows in a spreadsheet
     # probably only used for testing
     def self.new_from_array(array_of_rows)
       t = Table.new(array_of_rows.size, [])
+      @extraction_method = "testing"
       array_of_rows.each_with_index do |row, index|
-        t.lines[index].text_elements = row.map{|cell| TextElement.new(nil, nil, nil, nil, nil, nil, cell, nil)}
+        t.lines[index].text_elements = row.each_with_index.map{|cell, inner_index| TextElement.new(index, inner_index, 1, 1, nil, nil, cell, nil)}
       end
+      t.rpad!
       t
     end
@@ -77,5 +84,13 @@ module Tabula
       self.lines.zip(other.lines).all? { |my, yours| my == yours }
     end
+    def to_json(*a)
+      {
+        'json_class'   => self.class.name,
+        'extraction_method' => @extraction_method,
+        'data' => rows,
+      }.to_json(*a)
+    end
   end
 end

data/lib/tabula/extraction.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 java_import org.apache.pdfbox.pdfparser.PDFParser
 java_import org.apache.pdfbox.util.TextPosition
 java_import org.apache.pdfbox.pdmodel.PDDocument
@@ -198,7 +199,12 @@ module Tabula
       def processTextPosition(text)
         c = text.getCharacter
-        h = c == ' ' ? text.getWidthDirAdj.round(2) : text.getHeightDir.round(2)
+        h = text.getHeightDir.round(2)
+        if c == ' ' || c == ' ' # replace non-breaking space for space
+          c = ' '
+          h = text.getWidthDirAdj.round(2)
+        end
         te = Tabula::TextElement.new(text.getYDirAdj.round(2) - h,
                                      text.getXDirAdj.round(2),

data/lib/tabula/table_extractor.rb CHANGED Viewed

@@ -28,7 +28,8 @@ module Tabula
     options = {
       :password => '',
       :detect_ruling_lines => true,
-      :vertical_rulings => []
+      :vertical_rulings => [],
+      :extraction_method => "guess",
     }.merge(options)
     if area.instance_of?(Array)
@@ -41,32 +42,51 @@ module Tabula
       page = [page]
     end
-    page_obj = Extraction::ObjectExtractor.new(pdf_path,
+    pdf_page = Extraction::ObjectExtractor.new(pdf_path,
                                                page,
                                                options[:password]) \
       .extract.next
-    use_detected_lines = false
-    if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
-      detected_vertical_rulings = Ruling.crop_rulings_to_area(page_obj.vertical_ruling_lines,
-                                                              area)
+    if ["spreadsheet", "original"].include? options[:extraction_method]
+      use_spreadsheet_extraction_method = options[:extraction_method] == "spreadsheet"
+    else
+      use_spreadsheet_extraction_method = pdf_page.is_tabular?
+    end
-      # only use lines if at least 80% of them cover at least 90%
-      # of the height of area of interest
+    if use_spreadsheet_extraction_method
+      table = pdf_page.get_area(area).spreadsheets.inject(&:+)
+    else
+      use_detected_lines = false
+      if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
+        detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
+                                                                area)
-      # TODO this heuristic SUCKS
-      # what if only a couple columns is delimited with vertical rulings?
-      # ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
-      # idea: detect columns without considering rulings, detect vertical rulings
-      # calculate ratio and try to come up with a threshold
-      use_detected_lines = detected_vertical_rulings.size > 2 \
-      && (detected_vertical_rulings.count { |vl|
-            vl.height / area.height > 0.9
-          } / detected_vertical_rulings.size.to_f) >= 0.8
+        # only use lines if at least 80% of them cover at least 90%
+        # of the height of area of interest
-    end
+        # TODO this heuristic SUCKS
+        # what if only a couple columns is delimited with vertical rulings?
+        # ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
+        # idea: detect columns without considering rulings, detect vertical rulings
+        # calculate ratio and try to come up with a threshold
+        use_detected_lines = detected_vertical_rulings.size > 2 \
+        && (detected_vertical_rulings.count { |vl|
+              vl.height / area.height > 0.9
+            } / detected_vertical_rulings.size.to_f) >= 0.8
-    page_obj.get_area(area).make_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
+      end
+      table = pdf_page.get_area(area).get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
+      # fixes up the table a little bit, replacing nils with empty TextElements
+      # and sorting the lines.
+      table.lines.each do |l|
+        l.text_elements = l.text_elements.map do |te|
+          te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
+        end
+      end
+      table.lines.sort_by! { |l| l.text_elements.map { |te| te.top or 0 }.max }
+      table
+    end
   end
 end

data/lib/tabula/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Tabula
-  VERSION = '0.7.0'
+  VERSION = '0.7.1'
 end

data/test/heuristic-test-set/original/cs076pct.pdf ADDED Viewed

Binary file

data/test/tests.rb CHANGED Viewed

@@ -4,10 +4,14 @@ require 'minitest/autorun'
 require_relative '../lib/tabula'
+def table_to_array(table)
+  lines_to_array(table.rows)
+end
 def lines_to_array(lines)
-  lines.map { |l|
+  lines.map do |l|
     l.map { |te| te.text.strip }
-  }
+  end
 end
 def lines_to_table(lines)
@@ -15,7 +19,7 @@ def lines_to_table(lines)
 end
-# I don't want to pollute the "real" clasend a funny inspect method. Just for testing comparisons.
+# I don't want to pollute the "real" class with a funny inspect method. Just for testing comparisons.
 module Tabula
   class Table
     def inspect
@@ -27,7 +31,7 @@ end
 module Tabula
   class Line
     def inspect
-      @text_elements.map(&:text).inspect
+      @text_elements.map{|te| te.nil? ? '' : te.text}.inspect
     end
   end
 end
@@ -173,7 +177,7 @@ end
 class TestExtractor < Minitest::Test
   def test_table_extraction_1
-    table = lines_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
+    table = table_to_array Tabula.extract_table(File.expand_path('data/gre.pdf', File.dirname(__FILE__)),
                                                 1,
                                                 [107.1, 57.9214, 394.5214, 290.7],
                                                 :detect_ruling_lines => false)
@@ -184,7 +188,7 @@ class TestExtractor < Minitest::Test
   end
   def test_diputados_voting_record
-    table = lines_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
+    table = table_to_array Tabula.extract_table(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)),
                                                 1,
                                                 [269.875, 12.75, 790.5, 561])
@@ -199,14 +203,13 @@ class TestExtractor < Minitest::Test
     # and a solution for half-x-height-offset lines.
     pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
-    table = lines_to_table Tabula.extract_table(pdf_file_path,
+    table = Tabula.extract_table(pdf_file_path,
                                                 1,
                                                 [106.01, 48.09, 227.31, 551.89],
                                                 :detect_ruling_lines => true)
     expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
     assert_equal expected, table
   end
@@ -259,7 +262,7 @@ class TestExtractor < Minitest::Test
   # TODO Spaces inserted in words - fails
   def test_bo_page24
-    table = lines_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
+    table = table_to_array Tabula.extract_table(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)),
                                                 1,
                                                 [425.625, 53.125, 575.714, 810.535],
                                                 :detect_ruling_lines => false)
@@ -312,7 +315,7 @@ class TestExtractor < Minitest::Test
     vertical_rulings = [47,147,256,310,375,431,504].map{|n| Tabula::Ruling.new(0, n, 0, 1000)}
-    table = lines_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
+    table = table_to_array Tabula.extract_table(File.expand_path('data/campaign_donors.pdf', File.dirname(__FILE__)),
                                                 1,
                                                 [255.57,40.43,398.76,557.35],
                                                 :vertical_rulings => vertical_rulings)
@@ -321,12 +324,12 @@ class TestExtractor < Minitest::Test
   end
   def test_get_spacing_and_merging_right
-    table = lines_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
+    table = table_to_array Tabula.extract_table(File.expand_path('data/strongschools.pdf', File.dirname(__FILE__)),
                                                 1,
                                                 [52.32857142857143,15.557142857142859,128.70000000000002,767.9571428571429],
                                                 :detect_ruling_lines => true)
-    expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia ", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]
+    expected = [["Last Name", "First Name", "Address", "City", "State", "Zip", "Occupation", "Employer", "Date", "Amount"], ["Lidstad", "Dick & Peg", "62 Mississippi River Blvd N", "Saint Paul", "MN", "55104", "retired", "", "10/12/2012", "60.00"], ["Strom", "Pam", "1229 Hague Ave", "St. Paul", "MN", "55104", "", "", "9/12/2012", "60.00"], ["Seeba", "Louise & Paul", "1399 Sheldon St", "Saint Paul", "MN", "55108", "BOE", "City of Saint Paul", "10/12/2012", "60.00"], ["Schumacher / Bales", "Douglas L. / Patricia", "948 County Rd. D W", "Saint Paul", "MN", "55126", "", "", "10/13/2012", "60.00"], ["Abrams", "Marjorie", "238 8th St east", "St Paul", "MN", "55101", "Retired", "Retired", "8/8/2012", "75.00"], ["Crouse / Schroeder", "Abigail / Jonathan", "1545 Branston St.", "Saint Paul", "MN", "55108", "", "", "10/6/2012", "75.00"]]
     assert_equal expected, table
@@ -539,7 +542,7 @@ class TestIsTabularHeuristic < Minitest::Test
       extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
       page = extractor.extract.first
       page.get_ruling_lines!
-      assert page.is_tabular?
+      assert page.is_tabular?, "failed on file #{f}"
     end
   end
@@ -549,11 +552,8 @@ class TestIsTabularHeuristic < Minitest::Test
       extractor = Tabula::Extraction::ObjectExtractor.new(path, [1])
       page = extractor.extract.first
       page.get_ruling_lines!
-      assert !page.is_tabular?
+      assert !page.is_tabular?, "failed on file #{f}"
     end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tabula-extractor
 version: !ruby/object:Gem::Version
-  version: 0.7.0
+  version: 0.7.1
 platform: java
 authors:
 - Manuel Aristarán
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-01-07 00:00:00.000000000 Z
+date: 2014-01-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
@@ -162,6 +162,7 @@ files:
 - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
 - test/heuristic-test-set/original/bo_page24.pdf
 - test/heuristic-test-set/original/campaign_donors.pdf
+- test/heuristic-test-set/original/cs076pct.pdf
 - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
 - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
 - test/heuristic-test-set/spreadsheet/strongschools.pdf
@@ -189,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.1.9
+rubygems_version: 2.2.1
 signing_key:
 specification_version: 4
 summary: extract tables from PDF files
@@ -217,6 +218,7 @@ test_files:
 - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
 - test/heuristic-test-set/original/bo_page24.pdf
 - test/heuristic-test-set/original/campaign_donors.pdf
+- test/heuristic-test-set/original/cs076pct.pdf
 - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
 - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
 - test/heuristic-test-set/spreadsheet/strongschools.pdf