RubyGems - tabula-extractor - Versions diffs - 0.7.1-java → 0.7.2-java - Mend

tabula-extractor 0.7.1-java → 0.7.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

checksums.yaml +4 -4
data/.gitignore +2 -0
data/README.md +22 -0
data/Rakefile +1 -1
data/bin/tabula +8 -1
data/lib/tabula.rb +4 -0
data/lib/tabula/core_ext.rb +1 -1
data/lib/tabula/entities.rb +1 -0
data/lib/tabula/entities/page.rb +20 -8
data/lib/tabula/entities/spreadsheet.rb +1 -1
data/lib/tabula/entities/table.rb +12 -0
data/lib/tabula/entities/text_chunk.rb +1 -3
data/lib/tabula/entities/text_element.rb +52 -34
data/lib/tabula/entities/text_element_index.rb +55 -0
data/lib/tabula/extraction.rb +6 -5
data/lib/tabula/version.rb +1 -1
data/target/jsi-1.1.0-SNAPSHOT.jar +0 -0
data/target/slf4j-api-1.6.3.jar +0 -0
data/target/trove4j-3.0.3.jar +0 -0
data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
data/test/data/sydney_disclosure_contract.pdf +0 -0
data/test/data/wc2012.pdf +0 -0
data/test/tests.rb +45 -1
metadata +13 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: fea5751c9b78705fbfda50be1e29011d40249b04
-  data.tar.gz: 2d7bb156a073636467b34e44b3f8b6a364920354
+  metadata.gz: 089f1213abcf17bb66c982d40b2145a0f452297c
+  data.tar.gz: 80151791aae887fe11108e3f39c03e06eb29cdea
 SHA512:
-  metadata.gz: d8d1f1932397b599b3d724bb4e8ca5fd73f516e198a391625031ab83f8e5f0a2743d51dcc02953ee5a53c0664330c8e8e1f5309f72493e9f2e232068b06085a1
-  data.tar.gz: e292d6320db6a42799418b6651f63a88446f85c6787659597e77b9f7c38e698b69a10c78edbdeea49baedc0d92f77602b847aef90ab1f8f2ab10356493dbb397
+  metadata.gz: 28a058d979fc405094a416fbcd05c65f3cbd25e8269ce8e22f023a27fe0e5f3d4ef39f8b1e60c00ffc964bd858eb379ae71826f9eb4e70447b4181be96d97efe
+  data.tar.gz: c601699c5639a72a0ce4149b50f523cebe914a6767f3dd449979dde6c9a6c7ae269a270f7da0bfd33d92a52199d3a74866aef69121e6033f593c48341f3dc9f4

data/.gitignore CHANGED

@@ -13,6 +13,8 @@ spec/reports
 test/tmp
 test/version_tmp
 tmp
+/*.pdf
+/*.csv
 # YARD artifacts
 .yardoc

data/README.md CHANGED

@@ -44,6 +44,7 @@ Tabula helps you extract tables from PDFs
                           extraction (if there are ruling lines separating each
                           cell, as in a PDF of an Excel spreadsheet)
           --silent, -i:   Suppress all stderr output.
+--use-line-returns, -u:   Use embedded line returns in cells.
          --version, -v:   Print version and exit
             --help, -h:   Show this message
 ```
@@ -52,6 +53,27 @@ Tabula helps you extract tables from PDFs
 `tabula-extractor` is a RubyGem that you can use to programmatically extract tabular data, using the Tabula engine, in your scripts or applications. We don't have docs yet, but [the tests](test/tests.rb) are a good source of information.
+Here's a very basic example:
+````ruby
+require 'tabula'
+pdf_file_path = "whatever.pdf"
+outfilename = "whatever.csv"
+out = open(outfilename, 'w')
+extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
+extractor.extract.each do |pdf_page|
+  pdf_page.spreadsheets.each do |spreadsheet|
+    out << spreadsheet.to_csv
+    out << "\n\n"
+  end
+end
+out.close
+````
 ## Notes
 `tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.

data/Rakefile CHANGED

@@ -6,7 +6,7 @@ require 'rake'
 Bundler::GemHelper.install_tasks
 task :test do
-  ruby %{--debug -X-C -J-Xmx512m test/tests.rb}
+  ruby %{-X+C -J-Xmx512m test/tests.rb}
 end
 task :default => [:test]

data/bin/tabula CHANGED

@@ -47,6 +47,7 @@ EOS
     opt :spreadsheet, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
     opt :no_spreadsheet, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
     opt :silent, 'Suppress all stderr output.'
+    opt :use_line_returns, 'Use embedded line returns in cells. (Only in spreadsheet mode.)'
   end
   if !opts[:columns].nil?
@@ -94,6 +95,12 @@ def main
                                 else
                                   nil
                                 end
+  use_line_returns = if opts[:use_line_returns]
+				  true
+				else
+				  false
+				end
   extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
   extractor.extract.each_with_index do |pdf_page, page_index|
@@ -111,7 +118,7 @@ def main
           STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{spreadsheet.dims(:top, :left, :bottom, :right)}"
         end
       end
-      tables = pdf_page.spreadsheets.map(&:rows)
+      tables = pdf_page.spreadsheets(:use_line_returns=> use_line_returns).map(&:rows)
     else
       STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{page_area.to_s}" if opts[:debug]
       if opts[:guess]

data/lib/tabula.rb CHANGED

@@ -4,6 +4,10 @@ module Tabula
 end
 require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
+require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
+require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
+require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
 import 'java.util.logging.LogManager'
 import 'java.util.logging.Level'

data/lib/tabula/core_ext.rb CHANGED

@@ -121,7 +121,7 @@ class Line2D::Float
 end
-class Rectangle2D::Float
+class Rectangle2D
   SIMILARITY_DIVISOR = 20
   alias_method :top, :minY

data/lib/tabula/entities.rb CHANGED

@@ -2,6 +2,7 @@ require_relative './entities/zone_entity'
 require_relative './entities/cell'
 require_relative './entities/has_cells'
 require_relative './entities/line'
+require_relative './entities/text_element_index'
 require_relative './entities/page'
 require_relative './entities/page_area'
 require_relative './entities/ruling'

data/lib/tabula/entities/page.rb CHANGED

@@ -15,11 +15,14 @@ module Tabula
       @ruling_lines = ruling_lines
       @file_path = file_path
       @number_one_indexed = number
-      self.texts = texts
       @cells = []
       @spreadsheets = nil
       @min_char_width = min_char_width
       @min_char_height = min_char_height
+      @spatial_index = TextElementIndex.new
+      self.texts = texts
+      self.texts.each { |te| @spatial_index << te }
     end
     def min_char_width
@@ -54,10 +57,10 @@ module Tabula
     def get_table(options={})
       options = {:vertical_rulings => []}.merge(options)
       if texts.empty?
-        return []
+        return Tabula::Table.new(0, [])
       end
-      text_chunks = TextElement.merge_words(self.texts, options).sort
+      text_chunks = TextElement.merge_words(self.texts.sort, options).sort
       lines = TextChunk.group_by_lines(text_chunks)
@@ -65,7 +68,8 @@ module Tabula
         columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
         separators = columns.sort.reverse
       else
-        columns = TextChunk.column_positions(text_chunks)
+        columns = TextChunk.column_positions(lines.first.text_elements.min_by(&:top).top,
+                                             text_chunks)
         separators = columns[1..-1].sort.reverse
       end
@@ -123,8 +127,8 @@ module Tabula
       spreadsheets(options).each do |spreadsheet|
         spreadsheet.cells.each do |cell|
           cell.text_elements = page.get_cell_text(cell)
-          spreadsheet.cells_resolved = true
         end
+        spreadsheet.cells_resolved = true
       end
     end
@@ -176,9 +180,17 @@ module Tabula
       if area.nil?
         texts
       else
-        texts.select do |t|
-          area.contains(t)
-        end
+        @spatial_index.contains(area)
+      end
+    end
+    # For each text element on the page, appends it to the first area whose
+    # `contains` accepts it (elements accepted by no area are skipped), then
+    # replaces every area's text_elements with the result of
+    # TextElement.merge_words on them.
+    def fill_in_cell_texts!(areas)
+      texts.each do |text|
+        owner = areas.find { |candidate| candidate.contains(text) }
+        next if owner.nil?
+        owner.text_elements << text
+      end
+      areas.each do |area|
+        area.text_elements = TextElement.merge_words(area.text_elements)
       end
     end

data/lib/tabula/entities/spreadsheet.rb CHANGED

@@ -53,7 +53,7 @@ module Tabula
       if array_of_rows.size > 2
         if array_of_rows[0].map(&:left).uniq.size < array_of_rows[1].map(&:left).uniq.size
           missing_spots = array_of_rows[1].map(&:left) - array_of_rows[0].map(&:left)
-          # puts missing_spots.inspect
           missing_spots.each do |missing_spot|
             missing_spot_placeholder = Cell.new(array_of_rows[0][0].top, missing_spot, 0, 0)
             missing_spot_placeholder.placeholder = true

data/lib/tabula/entities/table.rb CHANGED

@@ -92,5 +92,17 @@ module Tabula
         'data' => rows,
       }.to_json(*a)
     end
+    # Returns this table's rows as a CSV string: Tabula::Writers.CSV writes
+    # `rows` into an in-memory buffer and the buffer's contents are returned.
+    def to_csv
+      StringIO.new.tap { |buffer| Tabula::Writers.CSV(rows, buffer) }.string
+    end
+    # Returns this table's rows as a TSV string: Tabula::Writers.TSV writes
+    # `rows` into an in-memory buffer and the buffer's contents are returned.
+    def to_tsv
+      StringIO.new.tap { |buffer| Tabula::Writers.TSV(rows, buffer) }.string
+    end
   end
 end

data/lib/tabula/entities/text_chunk.rb CHANGED

@@ -31,11 +31,9 @@ module Tabula
     ##
     # calculate estimated columns from an iterable of TextChunk
-    def self.column_positions(text_chunks)
+    def self.column_positions(top, text_chunks)
       right = 0
       columns = []
-      lines = TextChunk.group_by_lines(text_chunks)
-      top = lines.first.text_elements.map(&:top).min
       text_chunks.each do |te|
         next if te.text =~ ONLY_SPACES_RE

data/lib/tabula/entities/text_element.rb CHANGED

@@ -2,16 +2,17 @@ module Tabula
   ##
   # a Glyph
   class TextElement < ZoneEntity
-    attr_accessor :font, :font_size, :text, :width_of_space
+    attr_accessor :font, :font_size, :text, :width_of_space, :direction
     TOLERANCE_FACTOR = 0.25
-    def initialize(top, left, width, height, font, font_size, text, width_of_space)
+    def initialize(top, left, width, height, font, font_size, text, width_of_space, direction=0)
       super(top, left, width, height)
       self.font = font
       self.font_size = font_size
       self.text = text
       self.width_of_space = width_of_space
+      self.direction = direction
     end
     EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
@@ -31,40 +32,45 @@ module Tabula
         current_chunk = chunks.last
         prev_char = current_chunk.text_elements.last
-        # any vertical ruling goes across prev_char and char?
-        across_vertical_ruling = vertical_ruling_locations.any? { |loc|
-          prev_char.left < loc && char.left > loc
-        }
-        # should we add a space?
-        if (prev_char.text != " ") && (char.text != " ") \
-          && !across_vertical_ruling \
-          && prev_char.should_add_space?(char)
-          sp = self.new(prev_char.top,
-                        prev_char.right,
-                        prev_char.width_of_space,
-                        prev_char.width_of_space, # width == height for spaces
-                        prev_char.font,
-                        prev_char.font_size,
-                        ' ',
-                        prev_char.width_of_space)
-          chunks.last << sp
-          prev_char = sp
-        end
-        # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
-        # that they ought to be merged by that account.
-        # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
-        # Why are both of those `.left`?, you might ask. The intuition is that a letter
-        # that starts on the left of a vertical ruling ought to remain on the left of it.
-        if !across_vertical_ruling && prev_char.should_merge?(char)
-          chunks.last << char
+        # if same char AND overlapped, skip
+        if prev_char.text == char.text && prev_char.overlaps_with_ratio?(char, 0.85)
+          chunks
         else
-          # create a new chunk
-          chunks << TextChunk.create_from_text_element(char)
+          # any vertical ruling goes across prev_char and char?
+          across_vertical_ruling = vertical_ruling_locations.any? { |loc|
+            prev_char.left < loc && char.left > loc
+          }
+          # should we add a space?
+          if (prev_char.text != " ") && (char.text != " ") \
+            && !across_vertical_ruling \
+            && prev_char.should_add_space?(char)
+            sp = self.new(prev_char.top,
+                          prev_char.right,
+                          prev_char.width_of_space,
+                          prev_char.width_of_space, # width == height for spaces
+                          prev_char.font,
+                          prev_char.font_size,
+                          ' ',
+                          prev_char.width_of_space)
+            chunks.last << sp
+            prev_char = sp
+          end
+          # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
+          # that they ought to be merged by that account.
+          # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
+          # Why are both of those `.left`?, you might ask. The intuition is that a letter
+          # that starts on the left of a vertical ruling ought to remain on the left of it.
+          if !across_vertical_ruling && prev_char.should_merge?(char)
+            chunks.last << char
+          else
+            # create a new chunk
+            chunks << TextChunk.create_from_text_element(char)
+          end
+          chunks
         end
-        chunks
       end
     end
@@ -108,5 +114,17 @@ module Tabula
     def ==(other)
       self.text.strip == other.text.strip
     end
+    # Reading-order comparison. Elements that vertically overlap are compared
+    # by their left edge; otherwise the one with the smaller `top` sorts first.
+    # For elements that do not vertically overlap the result is -1 or 1, never 0.
+    def <=>(other)
+      return left <=> other.left if vertically_overlaps?(other)
+      top < other.top ? -1 : 1
+    end
   end
 end

data/lib/tabula/entities/text_element_index.rb ADDED

@@ -0,0 +1,55 @@
+module Tabula
+  # Spatial index of text elements built on the JSI RTree this class
+  # subclasses. Elements are registered with `<<` and looked up with `contains`.
+  class TextElementIndex < Java::ComInfomatiqJsiRtree::RTree
+    # Hash mapping each registered element's object_id to the element itself.
+    attr_reader :te_dict
+    # TIntProcedure implementation that maps the integer ids handed to
+    # `execute` back to text elements and accumulates them in `list`.
+    class SaveToListProcedure
+      include Java::GnuTroveProcedure::TIntProcedure
+      attr_reader :list
+      # parent: the index whose te_dict is used to resolve ids.
+      def initialize(parent)
+        @parent = parent
+        @list = []
+      end
+      # Appends the element registered under `id`. Always returns true;
+      # presumably this asks the caller to keep reporting matches
+      # (NOTE(review): confirm against the TIntProcedure contract).
+      def execute(id)
+        @list << @parent.te_dict[id]
+        true
+      end
+      # Starts a fresh result list; a previously returned list is not mutated.
+      def reset!
+        @list = []
+      end
+    end
+    def initialize
+      super
+      init(nil)
+      @te_dict = {}
+      @save_to_list = SaveToListProcedure.new(self)
+    end
+    # Registers text_element under its object_id and adds its bounds to the tree.
+    def <<(text_element)
+      bounds = jsi_rectangle_for(text_element)
+      element_id = text_element.object_id
+      @te_dict[element_id] = text_element
+      add(bounds, element_id)
+    end
+    # Returns the registered elements the tree reports for zone_entity's
+    # bounds, sorted with the elements' own <=> (reading order).
+    def contains(zone_entity)
+      query = jsi_rectangle_for(zone_entity)
+      @save_to_list.reset!
+      super(query, @save_to_list)
+      @save_to_list.list.sort
+    end
+    private
+    # Builds a JSI Rectangle from an entity's left, top, right and bottom.
+    def jsi_rectangle_for(entity)
+      Java::ComInfomatiqJsi::Rectangle.new(entity.left,
+                                           entity.top,
+                                           entity.right,
+                                           entity.bottom)
+    end
+  end
+end

data/lib/tabula/extraction.rb CHANGED

@@ -203,12 +203,12 @@ module Tabula
         if c == ' ' || c == ' ' # replace non-breaking space for space
           c = ' '
-          h = text.getWidthDirAdj.round(2)
+          h = text.getWidth.round(2)
         end
-        te = Tabula::TextElement.new(text.getYDirAdj.round(2) - h,
-                                     text.getXDirAdj.round(2),
-                                     text.getWidthDirAdj.round(2),
+        te = Tabula::TextElement.new(text.getY.round(2) - h,
+                                     text.getX.round(2),
+                                     text.getWidth.round(2),
                                      # ugly hack follows: we need spaces to have a height, so we can
                                      # test for vertical overlap. height == width seems a safe bet.
                                      h,
@@ -216,7 +216,8 @@ module Tabula
                                      text.getFontSize.round(2),
                                      c,
                                      # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
-                                     text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace)
+                                     text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
+                                     text.getDir)
         ccp_bounds = self.currentClippingPath

data/lib/tabula/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Tabula
-  VERSION = '0.7.1'
+  VERSION = '0.7.2'
 end

data/target/jsi-1.1.0-SNAPSHOT.jar ADDED

Binary file

data/target/slf4j-api-1.6.3.jar ADDED

Binary file

data/target/trove4j-3.0.3.jar ADDED

Binary file

data/test/data/nyc_2013fiscalreporttables.pdf ADDED

Binary file

data/test/data/sydney_disclosure_contract.pdf ADDED

Binary file

data/test/data/wc2012.pdf ADDED

Binary file

data/test/tests.rb CHANGED

@@ -206,7 +206,8 @@ class TestExtractor < Minitest::Test
     table = Tabula.extract_table(pdf_file_path,
                                                 1,
                                                 [106.01, 48.09, 227.31, 551.89],
-                                                :detect_ruling_lines => true)
+                                                :detect_ruling_lines => true,
+                                                :extraction_method => "original")
     expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
@@ -527,6 +528,49 @@ class TestExtractor < Minitest::Test
       assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
     end
   end
+  # Extracts a fixed area of page 1 of nyc_2013fiscalreporttables.pdf with the
+  # 'original' extraction method and checks two cells of the second row.
+  def test_remove_repeated_text
+    top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715
+    table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
+                                 1,
+                                 [top,left,bottom,right],
+                                 :detect_ruling_lines => false,
+                                 :extraction_method => 'original')
+    ary = table_to_array(table)
+    # assert_equal takes (expected, actual); this order keeps failure messages accurate
+    assert_equal "$ 18,969,610", ary[1][1]
+    assert_equal "$ 18,157,722", ary[1][2]
+  end
+  # Extracts a fixed area of page 1 of wc2012.pdf with the 'original'
+  # extraction method and checks the text of the first cell.
+  def test_remove_overlapping_text
+    # one of those PDFs that put characters on top of another to make text "bold"
+    top,left,bottom,right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
+    table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
+                                 1,
+                                 [top,left,bottom,right],
+                                 :detect_ruling_lines => false,
+                                 :extraction_method => 'original')
+    ary = table_to_array(table)
+    # assert_equal takes (expected, actual); this order keeps failure messages accurate
+    assert_equal "Community development", ary.first.first
+  end
+  # Resolves the text of every spreadsheet cell on page 1 of
+  # sydney_disclosure_contract.pdf with :use_line_returns enabled and checks
+  # that the collected cell texts keep their embedded "\n" line returns.
+  def test_cells_including_line_returns
+    data = []
+    # resolve the fixture relative to this file so the test does not depend on the working directory
+    pdf_file_path = File.expand_path('data/sydney_disclosure_contract.pdf', File.dirname(__FILE__))
+    Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
+      pdf_page.spreadsheets.each do |spreadsheet|
+        spreadsheet.cells.each do |cell|
+          cell.text_elements = pdf_page.get_cell_text(cell)
+          cell.options = {:use_line_returns => true, :cell_debug => 0}
+          data << cell.text
+        end
+      end
+    end
+    assert_equal ["1295", "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065", "N/A", "Effective Date: 13 May 2013 \nDuration: 15 Weeks", "Supply, Installation and Maintenance of Parking Ticket Machines", "$3,148,800.00exgst", "N/A", "N/A", "Open Tender  \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"], data
+  end
 end
 class TestIsTabularHeuristic < Minitest::Test

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tabula-extractor
 version: !ruby/object:Gem::Version
-  version: 0.7.1
+  version: 0.7.2
 platform: java
 authors:
 - Manuel Aristarán
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-01-18 00:00:00.000000000 Z
+date: 2014-01-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
@@ -127,6 +127,7 @@ files:
 - lib/tabula/entities/table.rb
 - lib/tabula/entities/text_chunk.rb
 - lib/tabula/entities/text_element.rb
+- lib/tabula/entities/text_element_index.rb
 - lib/tabula/entities/zone_entity.rb
 - lib/tabula/extraction.rb
 - lib/tabula/line_segment_detector.rb
@@ -138,7 +139,10 @@ files:
 - lib/tabula/version.rb
 - lib/tabula/writers.rb
 - tabula-extractor.gemspec
+- target/jsi-1.1.0-SNAPSHOT.jar
 - target/pdfbox-app-2.0.0-SNAPSHOT.jar
+- target/slf4j-api-1.6.3.jar
+- target/trove4j-3.0.3.jar
 - test/data/47008204D_USA.page4.pdf
 - test/data/560015757GV_China.page1.pdf
 - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
@@ -151,13 +155,16 @@ files:
 - test/data/frx_2012_disclosure.tsv
 - test/data/gre.pdf
 - test/data/no_tables.pdf
+- test/data/nyc_2013fiscalreporttables.pdf
 - test/data/puertos1.pdf
 - test/data/spanning_cells.csv
 - test/data/spanning_cells.pdf
 - test/data/strongschools.pdf
+- test/data/sydney_disclosure_contract.pdf
 - test/data/tabla_subsidios.pdf
 - test/data/vertical_rulings_bug.pdf
 - test/data/vietnam3.pdf
+- test/data/wc2012.pdf
 - test/heuristic-test-set/original/560015757GV_China.page1.pdf
 - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
 - test/heuristic-test-set/original/bo_page24.pdf
@@ -190,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.1
+rubygems_version: 2.1.9
 signing_key:
 specification_version: 4
 summary: extract tables from PDF files
@@ -207,13 +214,16 @@ test_files:
 - test/data/frx_2012_disclosure.tsv
 - test/data/gre.pdf
 - test/data/no_tables.pdf
+- test/data/nyc_2013fiscalreporttables.pdf
 - test/data/puertos1.pdf
 - test/data/spanning_cells.csv
 - test/data/spanning_cells.pdf
 - test/data/strongschools.pdf
+- test/data/sydney_disclosure_contract.pdf
 - test/data/tabla_subsidios.pdf
 - test/data/vertical_rulings_bug.pdf
 - test/data/vietnam3.pdf
+- test/data/wc2012.pdf
 - test/heuristic-test-set/original/560015757GV_China.page1.pdf
 - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
 - test/heuristic-test-set/original/bo_page24.pdf