RubyGems - tabula-extractor - Versions diffs - 0.7.2-java → 0.7.4-java - Mend

tabula-extractor 0.7.2-java → 0.7.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)

checksums.yaml +4 -4
data/.gitignore +1 -0
data/README.md +4 -8
data/bin/tabula +3 -3
data/lib/tabula.rb +9 -5
data/lib/tabula/entities.rb +1 -0
data/lib/tabula/entities/cell.rb +6 -4
data/lib/tabula/entities/has_cells.rb +22 -78
data/lib/tabula/entities/line.rb +52 -6
data/lib/tabula/entities/page.rb +43 -50
data/lib/tabula/entities/ruling.rb +83 -105
data/lib/tabula/entities/spreadsheet.rb +74 -11
data/lib/tabula/entities/table.rb +55 -37
data/lib/tabula/entities/tabular.rb +42 -0
data/lib/tabula/entities/text_chunk.rb +55 -52
data/lib/tabula/entities/text_element.rb +129 -62
data/lib/tabula/entities/zone_entity.rb +15 -6
data/lib/tabula/extraction.rb +114 -49
data/lib/tabula/line_segment_detector.rb +0 -5
data/lib/tabula/table_extractor.rb +32 -37
data/lib/tabula/version.rb +1 -1
data/tabula-extractor.gemspec +2 -5
metadata +13 -95
data/ext/COPYING +0 -661
data/ext/Makefile.OSX +0 -18
data/ext/Makefile.defaults +0 -9
data/ext/Makefile.linux32 +0 -11
data/ext/Makefile.linux64 +0 -12
data/ext/Makefile.mingw +0 -10
data/ext/Makefile.mingw64 +0 -10
data/ext/liblsd-linux32.so +0 -0
data/ext/liblsd-linux64.so +0 -0
data/ext/liblsd.def +0 -3
data/ext/liblsd.dll +0 -0
data/ext/liblsd.dylib +0 -0
data/ext/liblsd64.dll +0 -0
data/ext/lsd.c +0 -2270
data/ext/lsd.h +0 -283
data/test/data/47008204D_USA.page4.pdf +0 -0
data/test/data/560015757GV_China.page1.pdf +0 -0
data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
data/test/data/GSK_2012_Q4.page437.pdf +0 -0
data/test/data/S2MNCEbirdisland.pdf +0 -0
data/test/data/argentina_diputados_voting_record.pdf +0 -0
data/test/data/bo_page24.pdf +0 -0
data/test/data/campaign_donors.pdf +0 -0
data/test/data/frx_2012_disclosure.pdf +0 -0
data/test/data/frx_2012_disclosure.tsv +0 -88
data/test/data/gre.pdf +0 -0
data/test/data/no_tables.pdf +0 -0
data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
data/test/data/puertos1.pdf +0 -0
data/test/data/spanning_cells.csv +0 -21
data/test/data/spanning_cells.pdf +0 -0
data/test/data/strongschools.pdf +0 -0
data/test/data/sydney_disclosure_contract.pdf +0 -0
data/test/data/tabla_subsidios.pdf +0 -0
data/test/data/vertical_rulings_bug.pdf +0 -0
data/test/data/vietnam3.pdf +0 -0
data/test/data/wc2012.pdf +0 -0
data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
data/test/heuristic.rb +0 -50
data/test/test_bin_tabula.sh +0 -7
data/test/tests.rb +0 -603

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 089f1213abcf17bb66c982d40b2145a0f452297c
-  data.tar.gz: 80151791aae887fe11108e3f39c03e06eb29cdea
+  metadata.gz: 935de0f0dc43fa388a86cc091dc540b74b6ce31f
+  data.tar.gz: 67fa5fda6450c3b1659af3c61c8027843be5c082
 SHA512:
-  metadata.gz: 28a058d979fc405094a416fbcd05c65f3cbd25e8269ce8e22f023a27fe0e5f3d4ef39f8b1e60c00ffc964bd858eb379ae71826f9eb4e70447b4181be96d97efe
-  data.tar.gz: c601699c5639a72a0ce4149b50f523cebe914a6767f3dd449979dde6c9a6c7ae269a270f7da0bfd33d92a52199d3a74866aef69121e6033f593c48341f3dc9f4
+  metadata.gz: 191054f79148535bf359c81c72d35b717f71f97ee3c3bedd4c2af66e4332afb98f3071afe4c9ed9e894586e3a20722769742f17fc02b9a5d5d954a4fae50803d
+  data.tar.gz: 711f993194c402d1bca016f0fe13ccaeb8e4eafc6b67c2de0fa8b3cef1e7e3ae5b4cdefc2b251b64467747e7af26f80bb54bf57d4424ea50bb2dd26db7e27570

data/.gitignore CHANGED Viewed

@@ -12,6 +12,7 @@ rdoc
 spec/reports
 test/tmp
 test/version_tmp
+test/data/icdar-groundtruth
 tmp
 /*.pdf
 /*.csv

data/README.md CHANGED Viewed

@@ -7,7 +7,7 @@ Extract tables from PDF files. `tabula-extractor` is the table extraction engine
 ## Installation
-At the moment, `tabula-extractor` only works with JRuby. [Install JRuby](http://jruby.org/getting-started) and run
+`tabula-extractor` only works with JRuby 1.7 or newer. [Install JRuby](http://jruby.org/getting-started) and run
 ``
 jruby -S gem install tabula-extractor
@@ -57,12 +57,12 @@ Here's a very basic example:
 ````ruby
 require 'tabula'
 pdf_file_path = "whatever.pdf"
 outfilename = "whatever.csv"
 out = open(outfilename, 'w')
 extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
 extractor.extract.each do |pdf_page|
   pdf_page.spreadsheets.each do |spreadsheet|
@@ -73,7 +73,3 @@ end
 out.close
 ````
-## Notes
-`tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.

data/bin/tabula CHANGED Viewed

@@ -1,4 +1,4 @@
-#!/usr/bin/env jruby
+#!/usr/bin/env jruby -J-Djava.awt.headless=true
 # encoding: utf-8
 require 'trollop'
 require_relative '../lib/tabula'
@@ -9,7 +9,7 @@ def parse_pages_arg(pages_arg)
   if(pages_arg == 'all')
     return :all
   end
   ranges = pages_arg.split(',').map(&:strip)
   pages = []
   ranges.each do |range|
@@ -100,7 +100,7 @@ def main
 				else
 				  false
 				end
   extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
   extractor.extract.each_with_index do |pdf_page, page_index|

data/lib/tabula.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 module Tabula
   PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
   ONLY_SPACES_RE = Regexp.new('^\s+$')
+  SAME_CHAR_RE = Regexp.new('^(.)\1+$')
 end
 require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
@@ -8,7 +9,6 @@ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
 require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
 require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
 import 'java.util.logging.LogManager'
 import 'java.util.logging.Level'
@@ -22,13 +22,17 @@ lm.logger_names.each do |name|
     end
   end
 end
 require_relative './tabula/version'
 require_relative './tabula/core_ext'
 require_relative './tabula/entities'
 require_relative './tabula/extraction'
 require_relative './tabula/table_extractor'
 require_relative './tabula/writers'
-require_relative './tabula/line_segment_detector'
-require_relative './tabula/pdf_render'
+module Tabula
+  autoload :LSD               , File.expand_path('tabula/line_segment_detector.rb', File.dirname(__FILE__))
+  autoload :Render            , File.expand_path('tabula/pdf_render.rb', File.dirname(__FILE__))
+end
+require_relative './tabula/table_extractor'

data/lib/tabula/entities.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+require_relative './entities/tabular'
 require_relative './entities/zone_entity'
 require_relative './entities/cell'
 require_relative './entities/has_cells'

data/lib/tabula/entities/cell.rb CHANGED Viewed

@@ -15,7 +15,7 @@ module Tabula
       @placeholder = false
       @spanning = false
       @text_elements = []
-      @options = ({:use_line_returns => false, :cell_debug => NORMAL}).merge options
+      @options = ({:use_line_returns => true, :cell_debug => NORMAL}).merge options
     end
     def self.new_from_points(topleft, bottomright, options={})
@@ -29,11 +29,13 @@ module Tabula
       output = ""
       text_elements.sort #use the default sort for ZoneEntity
       text_elements.group_by(&:top).values.each do |row|
-        output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\n" : '')
-      end
+        output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\r" : '')
+        # per @bchartoff, https://github.com/jazzido/tabula-extractor/pull/65#issuecomment-32899336
+        # line returns as \r behave better in Excel.
+      end
       if (output.empty? && @options[:cell_debug] >= DEBUG) || @options[:cell_debug] >= SUPERDEBUG
         text_output = output.dup
-        output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
+        output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
         output += " \n #{text_output}"
       end
       output.strip

data/lib/tabula/entities/has_cells.rb CHANGED Viewed

@@ -6,27 +6,30 @@ module Tabula
   # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors; ruling_lines reader
   module HasCells
-    ANOTHER_MAGIC_NUMBER = 0.75
+    ARBITRARY_MAGIC_HEURISTIC_NUMBER = 0.65
     def is_tabular?
+      ratio = heuristic_ratio
+      return ratio > ARBITRARY_MAGIC_HEURISTIC_NUMBER && ratio < (1 / ARBITRARY_MAGIC_HEURISTIC_NUMBER)
+    end
+    def heuristic_ratio
       #spreadsheet extraction
       spreadsheet = spreadsheets.first
-      return false if spreadsheet.nil?
+      return Float::NAN if spreadsheet.nil?
       rows_defined_by_lines = spreadsheet.rows.size #rows filled in automatically
       columns_defined_by_lines = spreadsheet.cols.size
       table = self.get_table
       columns_defined_without_lines = table.cols.size
       rows_defined_without_lines = table.rows.size
-      ratio = ((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2
-      return ratio > ANOTHER_MAGIC_NUMBER && ratio < (1 / ANOTHER_MAGIC_NUMBER)
+      ((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2
     end
     # finds cells from the ruling lines on the page.
     # implements Nurminen thesis algorithm cf. https://github.com/jazzido/tabula-extractor/issues/16
     # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
-    def find_cells!(options={})
+    def find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options={})
       # All lines need to been sorted from up to down,
       # and left to right in ascending order
@@ -39,9 +42,10 @@ module Tabula
       # depending on the Point2D default sort here.
       intersection_points_array = intersection_points.keys.sort
-      intersection_points.each_with_index do |(topLeft, ((horizontal, vertical))), i|
+      intersection_points_array.each_with_index do |topLeft, i|
         # Fetch all points on the same vertical and horizontal
         # line with current crossing point
+        horizontal, vertical = intersection_points[topLeft]
         # this lets us go to the next intersection_point in intersection_points_array
         # it is bad and I feel bad.
@@ -64,19 +68,19 @@ module Tabula
               #                                                    point;
               next unless horizontal.colinear?(y_point)
               #Hypothetical bottom right point of rectangle
-              btmRight = Point2D::Float.new( y_point.x, x_point.y )
+              btmRight = Point2D::Float.new(y_point.x, x_point.y)
               if intersection_points.include?(btmRight)
-                intersection_points[btmRight].each do |btmRightHorizontal, btmRightVertical|
-                  if btmRightHorizontal.colinear?( x_point ) &&
+                btmRightHorizontal, btmRightVertical = intersection_points[btmRight]
+                if btmRightHorizontal.colinear?( x_point ) &&
                     btmRightVertical.colinear?( y_point )
-                    # Rectangle is confirmed to have 4 sides
-                    cellsFound << Cell.new_from_points( topLeft, btmRight, options)
-                    # Each crossing point can be the top left corner
-                    # of only a single rectangle
-                    #next crossing-point; we need to "next" out of the outer loop here
-                    # to avoid creating non-minimal cells, I htink.
-                    throw :cellCreated
-                  end
+                  # Rectangle is confirmed to have 4 sides
+                  cellsFound << Cell.new_from_points( topLeft, btmRight, options)
+                  # Each crossing point can be the top left corner
+                  # of only a single rectangle
+                  #next crossing-point; we need to "next" out of the outer loop here
+                  # to avoid creating non-minimal cells, I htink.
+                  throw :cellCreated
                 end
               end
             end
@@ -87,66 +91,6 @@ module Tabula
       cellsFound
     end
-    #############################
-    # Chapter 2, Spanning Cells #
-    #############################
-    #if c is a "spanning cell", that is
-    #              if there are N>0 vertical lines strictly between this cell's left and right
-    #insert N placeholder cells after it with zero size (but same top)
-    # subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
-    def add_spanning_cells!
-      #rounding: because Cell.new_from_points, using in #find_cells above, has
-      # a float precision error where, for instance, a cell whose x2 coord is
-      # supposed to be 160.137451171875 comes out as 160.13745498657227 because
-      # of minus. :(
-      vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq    #already sorted
-      horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted
-      cells.each do |c|
-        vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
-        horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }
-        unless vertical_rulings_spanned_over.empty?
-          c.spanning = true
-          vertical_rulings_spanned_over.each do |spanned_over_line_loc|
-            placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
-            placeholder.placeholder = true
-            cells << placeholder
-          end
-        end
-        unless horizontal_rulings_spanned_over.empty?
-          c.spanning = true
-          horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
-            placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
-            placeholder.placeholder = true
-            cells << placeholder
-          end
-        end
-        #if there's a spanning cell that's spans over both rows and columns, then it has "double placeholder" cells
-        # e.g. -------------------
-        #      | C |  C |  C | C |         (this is some pretty sweet ASCII art, eh?)
-        #      |-----------------|
-        #      | C |  C |  C | C |
-        #      |-----------------|
-        #      | C | SC    P | C |   where MC is the "spanning cell" that holds all the text within its bounds
-        #      |----    +    ----|         P is a "placeholder" cell with either zero width or zero height
-        #      | C | P    DP | C |         DP is a "double placeholder" cell with zero width and zero height
-        #      |----    +    ----|         C is an ordinary cell.
-        #      | C | P    DP | C |
-        #      |-----------------|
-        unless (double_placeholders = vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over)).empty?
-          double_placeholders.each do |vert_spanned_over, horiz_spanned_over|
-            placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
-            placeholder.placeholder = true
-            cells << placeholder
-          end
-        end
-      end
-    end
     #TODO:
     #returns array of Spreadsheet objects constructed (or spreadsheet_areas => cells)
     #maybe placeholders should be added after cells is split into spreadsheets

data/lib/tabula/entities/line.rb CHANGED Viewed

@@ -3,6 +3,8 @@ module Tabula
     attr_accessor :text_elements
     attr_reader :index
+    SPACE_RUN_MAX_LENGTH = 3
     def initialize(index=nil)
       @text_elements = []
       @index = index
@@ -16,15 +18,59 @@ module Tabula
         self.width = t.width
         self.height = t.height
       else
-        if in_same_column = @text_elements.find { |te| te.horizontally_overlaps?(t) }
-          in_same_column.merge!(t)
-        else
-          self.text_elements << t
-          self.merge!(t)
-        end
+        self.text_elements << t
+        self.merge!(t)
       end
     end
+    ##
+    # remove runs of the space char longer than SPACE_RUN_MAX_LENGTH
+    # should not change dimensions of the container +Line+
+    def remove_sequential_spaces!(seq_spaces_count=SPACE_RUN_MAX_LENGTH)
+      self.text_elements = self.text_elements.reduce([]) do |memo, text_chunk|
+        long_space_runs = text_chunk
+          .text_elements
+          .chunk { |te| te.text == ' '}  # detect runs of spaces...
+          .select { |is_space, text_elements| # ...longer than SPACE_RUN_MAX_LENGTH
+          is_space && !text_elements.nil? && text_elements.size >= SPACE_RUN_MAX_LENGTH
+        }
+          .map { |_, text_elements| text_elements }
+        # no long runs of spaces
+        # keep as it was and end iteration
+        if long_space_runs.empty?
+          memo << text_chunk
+          next memo
+        end
+        ranges = long_space_runs.map { |lsr|
+          idx = text_chunk
+            .text_elements
+            .index { |te| te.equal?(lsr.first) } # we need pointer comparison here
+          (idx)..(idx+lsr.size-1)
+        }
+        in_run = false
+        new_chunk = true
+        text_chunk
+          .text_elements
+          .each_with_index do |te, i|
+          if ranges.any? { |r| r.include?(i) } # te belongs to a run of spaces, skip
+            in_run = true
+          else
+            if in_run || new_chunk
+              memo << TextChunk.create_from_text_element(te)
+            else
+              memo.last << te
+            end
+            in_run = new_chunk = false
+          end
+        end
+        memo
+      end # reduce
+      self
+    end
     #used for testing, ignores text element stuff besides stripped text.
     def ==(other)
       return false if other.nil?

data/lib/tabula/entities/page.rb CHANGED Viewed

@@ -6,7 +6,7 @@ module Tabula
     attr_writer :min_char_width, :min_char_height
     attr_accessor :cells
-    def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
+    def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil, spatial_index=nil)
       super(0, 0, width, height)
       @rotation = rotation
       if number < 1
@@ -19,10 +19,16 @@ module Tabula
       @spreadsheets = nil
       @min_char_width = min_char_width
       @min_char_height = min_char_height
-      @spatial_index = TextElementIndex.new
       self.texts = texts
-      self.texts.each { |te| @spatial_index << te }
+      if spatial_index.nil?
+        @spatial_index = TextElementIndex.new
+        self.texts.each { |te| @spatial_index << te }
+      else
+        @spatial_index = spatial_index
+      end
     end
     def min_char_width
@@ -49,7 +55,8 @@ module Tabula
                                texts,
                                Ruling.crop_rulings_to_area(@ruling_lines, area),
                                texts.map(&:width).min,
-                               texts.map(&:height).min)
+                               texts.map(&:height).min,
+                               @spatial_index)
       return page_area
     end
@@ -60,28 +67,33 @@ module Tabula
         return Tabula::Table.new(0, [])
       end
-      text_chunks = TextElement.merge_words(self.texts.sort, options).sort
+      texts = self.texts.sort
+      text_chunks = TextElement.merge_words(texts, options)
-      lines = TextChunk.group_by_lines(text_chunks)
+      lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)
-      unless options[:vertical_rulings].empty?
-        columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
-        separators = columns.sort.reverse
-      else
-        columns = TextChunk.column_positions(lines.first.text_elements.min_by(&:top).top,
-                                             text_chunks)
-        separators = columns[1..-1].sort.reverse
-      end
+      columns = unless options[:vertical_rulings].empty?
+                  options[:vertical_rulings].map(&:left).sort #pixel locations, not entities
+                else
+                  TextChunk.column_positions(lines).sort
+                end
-      table = Table.new(lines.count, separators)
+      table = Table.new(lines.count, columns)
       lines.each_with_index do |line, i|
-        line.text_elements.each do |te|
-          j = separators.find_index { |s| te.left > s } || separators.count
-          table.add_text_element(te, i, separators.count - j)
+        line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
+          j = columns.find_index { |s| te.left <= s } || columns.count
+          table.add_text_element(te, i, j)
         end
       end
-      table.lstrip_lines!
+      # fixes up the table a little bit, replacing nils with empty TextElements
+      # and sorting the lines.
+      # table.rows.each do |l|
+      #   l.text_elements = l.text_elements.map do |te|
+      #     te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
+      #   end
+      # end
+      # table.rows.sort_by!(&:top)
       table
     end
@@ -96,7 +108,7 @@ module Tabula
         return @spreadsheets
       end
       get_ruling_lines!(options)
-      self.find_cells!(options)
+      self.find_cells!(self.horizontal_ruling_lines, self.vertical_ruling_lines, options)
       spreadsheet_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. lol sorry. polygons.
@@ -157,14 +169,18 @@ module Tabula
     #returns ruling lines, memoizes them in
     def get_ruling_lines!(options={})
-      if !@ruling_lines.nil? && !@ruling_lines.empty?
-        self.snap_points!
-        @vertical_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
-        @horizontal_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
-        @vertical_ruling_lines + @horizontal_ruling_lines
-      else
-        []
+      if @ruling_lines.nil? || @ruling_lines.empty?
+        return []
       end
+      self.snap_points!
+      @ruling_lines.select! { |l| !(l.width == 0 && l.height == 0) }
+      @vertical_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
+      @horizontal_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
+      @vertical_ruling_lines + @horizontal_ruling_lines
     end
     ##
@@ -252,29 +268,6 @@ module Tabula
         l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
       end
     end
-    def collapse_oriented_rulings(lines)
-      # lines must all be of one orientation (i.e. horizontal, vertical)
-      if lines.empty?
-        return []
-      end
-      lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
-      lines = lines.inject([lines.shift]) do |memo, next_line|
-        last = memo.last
-        if next_line.position == last.position && last.nearlyIntersects?(next_line)
-          memo.last.start = next_line.start < last.start ? next_line.start : last.start
-          memo.last.end = next_line.end < last.end ? last.end : next_line.end
-          memo
-        elsif next_line.length == 0
-          memo
-        else
-          memo << next_line
-        end
-      end
-    end
   end
 end