RubyGems - tabula-extractor - Versions diffs - 0.5.1-java → 0.6.1-java - Mend

tabula-extractor 0.5.1-java → 0.6.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

data/bin/tabula +10 -6
data/lib/geom/point.rb +21 -0
data/lib/geom/rectangle.rb +101 -0
data/lib/geom/segment.rb +82 -0
data/lib/tabula.rb +1 -0
data/lib/tabula/entities.rb +16 -5
data/lib/tabula/line_segment_detector.rb +1 -1
data/lib/tabula/pdf_dump.rb +2 -1
data/lib/tabula/pdf_render.rb +2 -1
data/lib/tabula/table_extractor.rb +127 -15
data/lib/tabula/table_guesser.rb +199 -0
data/lib/tabula/version.rb +1 -1
data/tabula-extractor.gemspec +2 -2
data/test/data/frx_2012_disclosure.pdf +0 -0
data/test/tests.rb +73 -0
metadata +10 -2

data/bin/tabula CHANGED Viewed

@@ -34,6 +34,7 @@ EOS
     opt :pages, 'Comma separated list of ranges. Examples: --pages 1-3,5-7 or --pages 3. Default is --pages 1', :default => '1', :type => String
     opt :area, 'Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page', :type => String, :default => nil
+    opt :guess, 'Guess the portion of the page to analyze per page. Slow.'
     opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
     opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
   end
@@ -51,7 +52,6 @@ EOS
   Trollop::die 'file does not exist' unless File.exists? pdf_filename
   return opts, pdf_filename
 end
 def main
@@ -60,11 +60,15 @@ def main
   area = opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)
   out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
   extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
-  extractor.extract.each do |page|
-    text = page.get_text(area)
-    Tabula::Writers.send(opts[:format].to_sym,
-                         Tabula.make_table(text),
-                         out)
+  extractor.extract.each_with_index do |page, page_index|
+    page_areas = opts[:guess] ? Tabula::TableGuesser::find_rects_on_page(Tabula::TableGuesser::load_pdf(filename), page_index) : [area]
+    page_areas.each do |page_area|
+      text = page.get_text( page_area )
+      Tabula::Writers.send(opts[:format].to_sym,
+                           Tabula.make_table(text),
+                           out)
+    end
   end
   out.close
 end

data/lib/geom/point.rb ADDED Viewed

@@ -0,0 +1,21 @@
+#
+# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
+# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
+#
+module Geometry
+  class Point < Struct.new(:x, :y)
+    def self.new_by_array(array)
+      self.new(array[0], array[1])
+    end
+    def ==(another_point)
+      x === another_point.x && y === another_point.y
+    end
+  end
+end
+def Point(x, y)
+  Geometry::Point.new(x, y)
+end

data/lib/geom/rectangle.rb ADDED Viewed

@@ -0,0 +1,101 @@
+#
+# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
+# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
+#
+module Geometry
+  class Rectangle < Struct.new(:point1, :point2)
+    SIMILARITY_DIVISOR = 20
+    def Rectangle.unionize(non_overlapping_rectangles, next_rect)
+      #if next_rect doesn't overlap any of non_overlapping_rectangles
+      if (overlapping = non_overlapping_rectangles.select{|r| next_rect.overlaps? r}) && !non_overlapping_rectangles.empty?
+        #remove all of those that it overlaps from non_overlapping_rectangles and
+        non_overlapping_rectangles -= overlapping
+        #add to non_overlapping_rectangles the bounding box of the overlapping rectangles.
+        non_overlapping_rectangles << overlapping.inject(next_rect){|memo, overlap| memo.bounding_box(overlap) }
+      else
+        non_overlapping_rectangles << next_rect
+      end
+    end
+    def self.new_by_x_y_dims(x, y, width, height)
+      self.new( Point.new_by_array([x, y]), Point.new_by_array([x + width, y + height]) )
+    end
+    def x
+      [point1.x, point2.x].min
+    end
+    alias_method :left, :x
+    def y
+      #puts "y: [#{point1.y} #{point2.y}].min"
+      [point1.y, point2.y].min
+    end
+    alias_method :top, :y
+    def x2
+      [point1.x, point2.x].max
+    end
+    alias_method :right, :x2
+    def y2
+      #puts "y2: [#{point1.y} #{point2.y}].max"
+      [point1.y, point2.y].max
+    end
+    alias_method :bottom, :y2
+    def width
+      (point1.x - point2.x).abs
+    end
+    def height
+      (point1.y - point2.y).abs
+    end
+    def area
+      self.width * self.height
+    end
+    def similarity_hash
+      [self.x.to_i / SIMILARITY_DIVISOR, self.y.to_i / SIMILARITY_DIVISOR, self.width.to_i / SIMILARITY_DIVISOR, self.height.to_i / SIMILARITY_DIVISOR].to_s
+    end
+    def dims(*format)
+      if format
+        format.map{|method| self.send(method)}
+      else
+        [self.x, self.y, self.width, self.height]
+      end
+    end
+    def contains?(other_x, other_y)
+      (other_x <= x2 && other_x >= x ) && (other_y <= y2 && other_y > y)
+    end
+    def overlaps?(other_rect)
+      return contains?(other_rect.x, other_rect.y) || contains?(other_rect.x2, other_rect.y2) ||
+                contains?(other_rect.x, other_rect.y2) || contains?(other_rect.x2, other_rect.y) ||
+                other_rect.contains?(x, y) || other_rect.contains?(x2, y2) ||
+                other_rect.contains?(x, y2) || other_rect.contains?(x2, y)
+    end
+    def bounding_box(other_rect)
+      #new rect with bounding box of these two
+      new_x1 = [x, other_rect.x].min
+      new_y1 = [x, other_rect.y].min
+      new_x2 = [x2, other_rect.x2].max
+      new_y2 = [y2, other_rect.y2].max
+      new_width = (new_x2 - new_x1).abs
+      new_height = (new_y2 - new_y1).abs
+      Rectangle.new_by_x_y_dims(new_x1, new_y1, new_width, new_height)
+    end
+  end
+end

data/lib/geom/segment.rb ADDED Viewed

@@ -0,0 +1,82 @@
+#
+# Cribbed shamelessly from Daniel Vartanov's [ruby-geometry](https://github.com/DanielVartanov/ruby-geometry/)
+# MIT License (c) 2008 Daniel Vartanov, modifications (c) 2013 Jeremy B. Merrill
+#
+module Geometry
+  include Math
+  extend Math
+  def Geometry.distance(point1, point2)
+    hypot point1.x - point2.x, point1.y - point2.y
+  end
+  class Segment < Struct.new(:point1, :point2)
+    def self.new_by_arrays(point1_coordinates, point2_coordinates)
+      self.new(Point.new_by_array(point1_coordinates),
+               Point.new_by_array(point2_coordinates))
+    end
+    def scale!(scale_factor)
+      self.point1.x = self.point1.x * scale_factor
+      self.point1.y = self.point1.y * scale_factor
+      self.point2.x = self.point2.x * scale_factor
+      self.point2.y = self.point2.y * scale_factor
+    end
+    def vertical?
+      point1.x == point2.x
+    end
+    def horizontal?
+      point1.y == point2.y
+    end
+    def leftmost_endpoint
+      ((point1.x <=> point2.x) == -1) ? point1 : point2
+    end
+    def rightmost_endpoint
+      ((point1.x <=> point2.x) == 1) ? point1 : point2
+    end
+    def topmost_endpoint
+      ((point1.y <=> point2.y) == 1) ? point1 : point2
+    end
+    def bottommost_endpoint
+      ((point1.y <=> point2.y) == -1) ? point1 : point2
+    end
+    def top
+      topmost_endpoint.y
+    end
+    def bottom
+      bottommost_endpoint.y
+    end
+    def width
+      (left - right).abs
+    end
+    def height
+      (bottom - top).abs
+    end
+    def left
+      leftmost_endpoint.x
+    end
+    def right
+      rightmost_endpoint.x
+    end
+    def length
+      Geometry.distance(point1, point2)
+    end
+  end
+end
+def Segment(point1, point2)
+  Geometry::Segment.new point1, point2
+end

data/lib/tabula.rb CHANGED Viewed

@@ -7,5 +7,6 @@ require_relative './tabula/entities'
 require_relative './tabula/pdf_dump'
 require_relative './tabula/table_extractor'
 require_relative './tabula/writers'
+require_relative './tabula/table_guesser'
 require_relative './tabula/line_segment_detector'
 require_relative './tabula/pdf_render'

data/lib/tabula/entities.rb CHANGED Viewed

@@ -99,10 +99,10 @@ module Tabula
       # spaces are not detected, b/c they have height == 0
       # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
-      # self.texts.select { |t| t.overlaps? ze }
-      self.texts.select { |t|
+      # self.texts.select { |t| t.overlaps? ze }
+      self.texts.select do |t|
         t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
-      }
+      end
     end
     def to_json(options={})
@@ -120,7 +120,7 @@ module Tabula
     attr_accessor :font, :font_size, :text, :width_of_space
     CHARACTER_DISTANCE_THRESHOLD = 1.5
-    TOLERANCE_FACTOR = 0.25
+    TOLERANCE_FACTOR = 0.25 #25
     def initialize(top, left, width, height, font, font_size, text, width_of_space)
       super(top, left, width, height)
@@ -149,7 +149,7 @@ module Tabula
       overlaps = self.vertically_overlaps?(other)
       up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
-      down_tolerance = 0.95
+      down_tolerance = 0.90 #90?
       dist = self.horizontal_distance(other).abs
@@ -261,6 +261,10 @@ module Tabula
       r >= 0 and r < 1 and s >= 0 and s < 1
     end
+    def length
+      Math.sqrt( (self.right - self.left).abs ** 2 + (self.bottom - self.top).abs ** 2 )
+    end
     def vertical?
       left == right
     end
@@ -269,6 +273,13 @@ module Tabula
       top == bottom
     end
+    def right
+      left + width
+    end
+    def bottom
+      top + height
+    end
     def to_json(arg)
       [left, top, right, bottom].to_json
     end

data/lib/tabula/line_segment_detector.rb CHANGED Viewed

@@ -46,7 +46,7 @@ module Tabula
       options = DETECT_LINES_DEFAULTS.merge(options)
       pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
-      page = pdf_file.getDocumentCatalog.getAllPages[page_number - 1]
+      page = pdf_file.getDocumentCatalog.getAllPages[page_number]
       bi = Tabula::Render.pageToBufferedImage(page,
                                               options[:image_size])
       pdf_file.close

data/lib/tabula/pdf_dump.rb CHANGED Viewed

@@ -77,11 +77,12 @@ module Tabula
     class CharacterExtractor
       include Observable
+      #N.B. pages can be :all, a list of pages or a range.
       def initialize(pdf_filename, pages=[1])
         raise Errno::ENOENT unless File.exists?(pdf_filename)
         @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
         @all_pages = @pdf_file.getDocumentCatalog.getAllPages
-        @pages = pages
+        @pages = pages == :all ?  (1..@all_pages.size) : pages
         @extractor = TextExtractor.new
       end

data/lib/tabula/pdf_render.rb CHANGED Viewed

@@ -20,8 +20,9 @@ module Tabula
       end
     end
-    TRANSPARENT_WHITE = Color.new(255, 255, 255, 0)
+    TRANSPARENT_WHITE = java.awt.Color.new(255, 255, 255, 0)
+    # 2048 width is important, if this is too small, thin lines won't be drawn.
     def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
       cropbox = page.findCropBox
       widthPt, heightPt = cropbox.getWidth, cropbox.getHeight

data/lib/tabula/table_extractor.rb CHANGED Viewed

@@ -27,21 +27,38 @@ module Tabula
     # (ie, take into account vertical ruling lines if available)
     def group_by_columns
       columns = []
-      tes = self.text_elements.sort_by(&:left)
+      tes = self.text_elements.sort_by &:left
       # we don't have vertical rulings
-      tes.each do |te|
-        if column = columns.detect { |c| te.horizontally_overlaps?(c) }
-          column << te
-        else
-          columns << Column.new(te.left, te.width, [te])
+      if self.options[:vertical_rulings].empty?
+        tes.each do |te|
+          if column = columns.detect { |c| te.horizontally_overlaps?(c) }
+            column << te
+          else
+            columns << Column.new(te.left, te.width, [te])
+          end
+        end
+      else
+        self.options[:vertical_rulings].sort_by! &:left
+        1.upto(self.options[:vertical_rulings].size - 1) do |i|
+          left_ruling_line =  self.options[:vertical_rulings][i - 1]
+          right_ruling_line = self.options[:vertical_rulings][i]
+          columns << Column.new(left_ruling_line.left, right_ruling_line.left - left_ruling_line.left, []) if (right_ruling_line.left - left_ruling_line.left > 10)
+        end
+        tes.each do |te|
+          if column = columns.detect { |c| te.horizontally_overlaps?(c) }
+            column << te
+          else
+            puts "couldn't find a place for #{te.inspect}"
+            #columns << Column.new(te.left, te.width, [te])
+          end
         end
       end
       columns
     end
     def get_columns
-      Tabula.group_by_columns(text_elements).map do |c|
+      TableExtractor.new(text_elements).group_by_columns.map do |c|
         {'left' => c.left, 'right' => c.right, 'width' => c.width}
       end
     end
@@ -87,6 +104,7 @@ module Tabula
     private
+    #this is where spaces come from!
     def merge_words!
       return self.text_elements if @merged # only merge once. awful hack.
       @merged = true
@@ -97,9 +115,12 @@ module Tabula
         char2 = self.text_elements[i+1]
         next if char2.nil? or char1.nil?
         if self.text_elements[current_word_index].should_merge?(char2)
+          #puts "merging: #{self.text_elements[current_word_index].text}/#{self.text_elements[current_word_index].width}"
           self.text_elements[current_word_index].merge!(char2)
           char1 = char2
           self.text_elements[i+1] = nil
@@ -107,13 +128,14 @@ module Tabula
           # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
           if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
             self.text_elements[current_word_index].text += " "
-            self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
+            #self.text_elements[current_word_index].width += self.text_elements[current_word_index].width_of_space
           end
           current_word_index = i+1
         end
         i += 1
       end
-      return self.text_elements.compact!
+      self.text_elements.compact!
+      return self.text_elements
     end
   end
@@ -174,7 +196,7 @@ module Tabula
     lines.sort_by!(&:top)
-    columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
+    columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words]}).group_by_columns.sort_by(&:left)
     # # insert empty cells if needed
     lines.each_with_index do |l, line_index|
@@ -183,23 +205,21 @@ module Tabula
       l.text_elements.uniq!  # TODO WHY do I have to do this?
       l.text_elements.sort_by!(&:left)
-      next unless l.text_elements.size < columns.size
+      #next unless l.text_elements.size < columns.size
       columns.each_with_index do |c, i|
-        if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
+        if (i > l.text_elements.size - 1) or (!l.text_elements[i].nil? and !c.text_elements.include?(l.text_elements[i]))
           l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
         end
       end
     end
     # # merge elements that are in the same column
-    columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
     lines.each_with_index do |l, line_index|
       next if l.text_elements.nil?
       (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
-        next if l.text_elements[t1].nil? or l.text_elements[t2].nil?
+        next if l.text_elements[t1].nil? or l.text_elements[t2].nil? or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
         # if same column...
         if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
@@ -233,4 +253,96 @@ module Tabula
       line.text_elements.sort_by(&:left)
     end
   end
+  # Builds a table from +text_elements+, handing options[:vertical_rulings]
+  # to TableExtractor#group_by_columns for the column grouping.
+  #
+  # Steps, in order:
+  # 1. bucket the text elements into lines using the extractor's line boundaries;
+  # 2. group the elements of all lines into columns;
+  # 3. insert an empty TextElement wherever a line has no text inside a column;
+  # 4. merge non-empty elements of one line that belong to the same column;
+  # 5. drop a line when it shares an element, at the same index, with the line before it.
+  #
+  # Returns an array with one entry per remaining line: that line's text
+  # elements sorted by their left coordinate.
+  def Tabula.make_table_with_vertical_rulings(text_elements, options={})
+    extractor = TableExtractor.new(text_elements, options)
+    # group by lines
+    lines = []
+    line_boundaries = extractor.get_line_boundaries
+    # find all the text elements
+    # contained within each detected line (table row) boundary
+    line_boundaries.each do |lb|
+      line = Line.new
+      line_members = text_elements.find_all do |te|
+        te.vertically_overlaps?(lb)
+      end
+      # elements claimed by this boundary are removed from the pool, so no
+      # element is assigned to more than one line
+      text_elements -= line_members
+      line_members.sort_by(&:left).each do |te|
+        # skip text_elements that only contain spaces
+        next if te.text =~ ONLY_SPACES_RE
+        line << te
+      end
+      lines << line if line.text_elements.size > 0
+    end
+    lines.sort_by!(&:top)
+    vertical_rulings = options[:vertical_rulings]
+    columns = TableExtractor.new(lines.map(&:text_elements).flatten.compact.uniq, {:merge_words => options[:merge_words], :vertical_rulings => vertical_rulings}).group_by_columns.sort_by(&:left)
+    # insert empty cells if needed
+    lines.each_with_index do |l, line_index|
+      next if l.text_elements.nil?
+      l.text_elements.compact! # TODO WHY do I have to do this?
+      l.text_elements.uniq!  # TODO WHY do I have to do this?
+      l.text_elements.sort_by!(&:left)
+      columns.each_with_index do |c, i|
+        # a column counts as empty on this line when no element lies fully
+        # between the column's left edge and left + width
+        if (l.text_elements.select{|te| te && te.left >= c.left && te.right <= (c.left + c.width)}.empty?)
+          l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, '', 0))
+        end
+      end
+    end
+    # merge elements that are in the same column
+    lines.each_with_index do |l, line_index|
+      next if l.text_elements.nil?
+      (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|  #don't remove a string of empty cells
+        next if l.text_elements[t1].nil? or l.text_elements[t2].nil?    or l.text_elements[t1].text.empty? or l.text_elements[t2].text.empty?
+        # if same column...
+        if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
+          == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
+          # the element whose bottom is not lower absorbs the other one
+          if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
+            l.text_elements[t1].merge!(l.text_elements[t2])
+            l.text_elements[t2] = nil
+          else
+            l.text_elements[t2].merge!(l.text_elements[t1])
+            l.text_elements[t1] = nil
+          end
+        end
+      end
+      l.text_elements.compact!
+    end
+    # remove duplicate lines
+    # TODO this shouldn't have happened here, check why we have to do
+    # this (maybe duplication is happening in the column merging phase?)
+    (0..lines.size - 2).each do |i|
+      next if lines[i].nil?
+      # if any of the elements on the next line is duplicated, kill
+      # the next line
+      if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
+        lines[i+1] = nil
+      end
+    end
+    lines.compact.map do |line|
+      line.text_elements.sort_by(&:left)
+    end
+  end
 end

data/lib/tabula/table_guesser.rb ADDED Viewed

@@ -0,0 +1,199 @@
+require 'java'
+require 'json'
+require_relative '../geom/point'
+require_relative '../geom/segment'
+require_relative '../geom/rectangle'
+require_relative './pdf_render'
+#CLASSPATH=:./target/javacpp.jar:./target/javacv.jar:./target/javacv-macosx-x86_64.jar:./target/PDFRenderer-0.9.1.jar
+module Tabula
+  module TableGuesser
+    def TableGuesser.find_and_write_rects(filename, output_dir)
+      #writes to JSON the rectangles on each page in the specified PDF.
+      open(File.join(output_dir, "tables.json"), 'w') do |f|
+        f.write( JSON.dump(find_rects(filename).map{|a| a.map{|r| r.dims.map &:to_i }} ))
+      end
+    end
+    def TableGuesser.find_rects(filename)
+      pdf = load_pdfbox_pdf(filename)
+      if pdf.getNumberOfPages == 0
+        puts "not a pdf!"
+        exit
+      end
+      puts "pages: " + pdf.getNumberOfPages.to_s
+      tables = []
+      pdf.getNumberOfPages.times do |i|
+        #gotcha: with PDFView, PDF pages are 1-indexed. If you ask for page 0 and then page 1, you'll get the first page twice. So start with index 1.
+        tables << find_rects_on_page(pdf, i + 1)
+      end
+      tables
+    end
+    def TableGuesser.find_lines(filename)
+      if pdf.getNumberOfPages == 0
+        puts "not a pdf!"
+        exit
+      end
+      puts "pages: " + pdf.getNumberOfPages.to_s
+      lines = []
+      pdf.getNumberOfPages.times do |i|
+        lines << detect_lines_in_pdf_page(filename, i)
+      end
+      lines
+    end
+    def TableGuesser.find_lines_on_page(pdf, page_index)
+      Tabula::LSD.detect_lines_in_pdf_page(pdf, page_index)
+    end
+    def TableGuesser.find_rects_on_page(pdf, page_index)
+      find_rects_from_lines(find_lines_on_page(pdf, page_index, 10))
+    end
+    def TableGuesser.find_rects_from_lines(lines)
+      horizontal_lines = lines.select &:horizontal?
+      vertical_lines = lines.select &:vertical?
+      find_tables(vertical_lines, horizontal_lines).inject([]){|memo, next_rect| Geometry::Rectangle.unionize(memo, next_rect )}.sort_by(&:area).reverse
+    end
+    def TableGuesser.euclidean_distance(x1, y1, x2, y2)
+      return Math.sqrt( ((x1 - x2) ** 2) + ((y1 - y2) ** 2) )
+    end
+    def TableGuesser.is_upward_oriented(line, y_value)
+      #return true if this line is oriented upwards, i.e. if the majority of it's length is above y_value.
+      return (y_value - line.top > line.bottom - y_value);
+    end
+    def TableGuesser.find_tables(verticals, horizontals)
+      # /*
+      #  * Find all the rectangles in the vertical and horizontal lines given.
+      #  *
+      #  * Rectangles are deduped with hashRectangle, which considers two rectangles identical if each point rounds to the same tens place as the other.
+      #  *
+      #  * TODO: generalize this.
+      #  */
+      corner_proximity_threshold = 0.10;
+      rectangles = []
+      #find rectangles with one horizontal line and two vertical lines that end within $threshold to the ends of the horizontal line.
+      [true, false].each do |up_or_down_lines|
+        horizontals.each do |horizontal_line|
+          horizontal_line_length = horizontal_line.length
+          has_vertical_line_from_the_left = false
+          left_vertical_line = nil
+          #for the left vertical line.
+          verticals.each do |vertical_line|
+            #1. if it is correctly oriented (up or down) given the outer loop here. (We don't want a false-positive rectangle with one "arm" going down, and one going up.)
+            next unless is_upward_oriented(vertical_line, horizontal_line.top) == up_or_down_lines
+            vertical_line_length = vertical_line.length
+            longer_line_length = [horizontal_line_length, vertical_line_length].max
+            corner_proximity = corner_proximity_threshold * longer_line_length
+            #make this the left vertical line:
+            #2. if it begins near the left vertex of the horizontal line.
+            if euclidean_distance(horizontal_line.left, horizontal_line.top, vertical_line.left, vertical_line.top) < corner_proximity ||
+               euclidean_distance(horizontal_line.left, horizontal_line.top, vertical_line.left, vertical_line.bottom) < corner_proximity
+              #3. if it is farther to the left of the line we already have.
+              if left_vertical_line.nil? || left_vertical_line.left> vertical_line.left #is this line is more to the left than left_vertical_line. #"What's your opinion on Das Kapital?"
+                has_vertical_line_from_the_left = true
+                left_vertical_line = vertical_line
+              end
+            end
+          end
+          has_vertical_line_from_the_right = false;
+          right_vertical_line = nil
+          #for the right vertical line.
+          verticals.each do |vertical_line|
+            next unless is_upward_oriented(vertical_line, horizontal_line.top) == up_or_down_lines
+            vertical_line_length = vertical_line.length
+            longer_line_length = [horizontal_line_length, vertical_line_length].max
+            corner_proximity = corner_proximity_threshold * longer_line_length
+            if euclidean_distance(horizontal_line.right, horizontal_line.top, vertical_line.left, vertical_line.top) < corner_proximity ||
+              euclidean_distance(horizontal_line.right, horizontal_line.top, vertical_line.left, vertical_line.bottom) < corner_proximity
+              if right_vertical_line.nil? || right_vertical_line.right > vertical_line.right  #is this line is more to the right than right_vertical_line. #"Can you recite all of John Galt's speech?"
+                #do two passes to guarantee we don't get a horizontal line with a upwards and downwards line coming from each of its corners.
+                #i.e. ensuring that both "arms" of the rectangle have the same orientation (up or down).
+                has_vertical_line_from_the_right = true
+                right_vertical_line = vertical_line
+              end
+            end
+          end
+          if has_vertical_line_from_the_right && has_vertical_line_from_the_left
+            #in case we eventually tolerate not-quite-vertical lines, this computers the distance in Y directly, rather than depending on the vertical lines' lengths.
+            height = [left_vertical_line.bottom - left_vertical_line.top, right_vertical_line.bottom - right_vertical_line.top].max
+            y = [left_vertical_line.top, right_vertical_line.top].min
+            width = horizontal_line.right - horizontal_line.left
+            r = Geometry::Rectangle.new_by_x_y_dims(horizontal_line.left, y, width, height ) #x, y, w, h
+            #rectangles.put(hashRectangle(r), r); #TODO: I dont' think I need this now that I'm in Rubyland
+            rectangles << r
+          end
+        end
+        #find rectangles with one vertical line and two horizontal lines that end within $threshold to the ends of the vertical line.
+        verticals.each do |vertical_line|
+          vertical_line_length = vertical_line.length
+          has_horizontal_line_from_the_top = false
+          top_horizontal_line = nil
+          #for the top horizontal line.
+          horizontals.each do |horizontal_line|
+            horizontal_line_length = horizontal_line.length
+            longer_line_length = [horizontal_line_length, vertical_line_length].max
+            corner_proximity = corner_proximity_threshold * longer_line_length
+            if euclidean_distance(vertical_line.left, vertical_line.top, horizontal_line.left, horizontal_line.top) < corner_proximity ||
+                euclidean_distance(vertical_line.left, vertical_line.top, horizontal_line.right, horizontal_line.top) < corner_proximity
+                if top_horizontal_line.nil? || top_horizontal_line.top > horizontal_line.top #is this line is more to the top than the one we've got already.
+                  has_horizontal_line_from_the_top = true;
+                  top_horizontal_line = horizontal_line;
+                end
+            end
+          end
+          has_horizontal_line_from_the_bottom = false;
+          bottom_horizontal_line = nil
+          #for the bottom horizontal line.
+          horizontals.each do |horizontal_line|
+            horizontal_line_length = horizontal_line.length
+            longer_line_length = [horizontal_line_length, vertical_line_length].max
+            corner_proximity = corner_proximity_threshold * longer_line_length
+            if euclidean_distance(vertical_line.left, vertical_line.bottom, horizontal_line.left, horizontal_line.top) < corner_proximity ||
+              euclidean_distance(vertical_line.left, vertical_line.bottom, horizontal_line.left, horizontal_line.top) < corner_proximity
+              if bottom_horizontal_line.nil? || bottom_horizontal_line.bottom > horizontal_line.bottom  #is this line is more to the bottom than the one we've got already.
+                has_horizontal_line_from_the_bottom = true;
+                bottom_horizontal_line = horizontal_line;
+              end
+            end
+          end
+          if has_horizontal_line_from_the_bottom && has_horizontal_line_from_the_top
+            x = [top_horizontal_line.left, bottom_horizontal_line.left].min
+            y = vertical_line.top
+            width = [top_horizontal_line.right - top_horizontal_line.left, bottom_horizontal_line.right - bottom_horizontal_line.right].max
+            height = vertical_line.bottom - vertical_line.top
+            r = Geometry::Rectangle.new_by_x_y_dims(x, y, width, height); #x, y, w, h
+            #rectangles.put(hashRectangle(r), r);
+            rectangles << r
+          end
+        end
+      end
+      return rectangles.uniq &:similarity_hash
+    end
+  end
+end

data/lib/tabula/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Tabula
-  VERSION = '0.5.1'
+  VERSION = '0.6.1'
 end

data/tabula-extractor.gemspec CHANGED Viewed

@@ -6,7 +6,7 @@ require 'tabula/version'
 Gem::Specification.new do |s|
   s.name        = "tabula-extractor"
   s.version     = Tabula::VERSION
-  s.authors     = ["Manuel Aristarán"]
+  s.authors     = ["Manuel Aristarán", "Jeremy B. Merill", "Mike Tigas"]
   s.email       = ["manuel@jazzido.com"]
   s.homepage    = "https://github.com/jazzido/tabula-extractor"
   s.summary     = %q{extract tables from PDF files}
@@ -14,7 +14,7 @@ Gem::Specification.new do |s|
   s.platform = 'java'
-  shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll'].map { |f| 'ext/' + f }
+  shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll', 'liblsd64.dll'].map { |f| 'ext/' + f }
   s.files         = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
   s.test_files    = `git ls-files -- {test,features}/*`.split("\n")
   s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }

data/test/data/frx_2012_disclosure.pdf ADDED Viewed

Binary file

data/test/tests.rb CHANGED Viewed

@@ -23,6 +23,8 @@ class TestPagesInfoExtractor < Minitest::Test
   end
 end
+class TestTableGuesser < MiniTest::Unit::TestCase
+end
 class TestDumper < Minitest::Test
@@ -60,6 +62,77 @@ class TestExtractor < Minitest::Test
     assert_equal expected, lines_to_array(Tabula.make_table(characters))
   end
+  def test_forest_disclosure_report_dont_regress
+    # this is the current state of the expected output. Ideally the output should be like
+    # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
+    # and a solution for half-x-height-offset lines.
+    pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
+    character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
+    lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
+    vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
+    characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
+                                                           #top left bottom right
+    expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
+                ['TOTAL', '', '', '','$85.00'],
+                ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
+                ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
+                ['TOTAL', '', '', '', '$471.25'],
+                ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
+                ['TOTAL', '', '', '','$20.39'],
+                ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
+                ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
+                ['TOTAL', '', '', '', '$5,010.33'],
+                ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
+                ['TOTAL', '', '', '', '$193.67'],
+                ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
+    assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
+  end
+  def test_missing_spaces_around_an_ampersand
+    pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
+    character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
+    lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
+    vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
+    characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
+                                                           #top left bottom right
+    expected = [
+                 ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
+                ]
+    assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
+  end
+  def test_forest_disclosure_report
+    pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
+    character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
+    lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
+    vertical_rulings = lines.select(&:vertical?).uniq{|line| (line.left / 10).round }
+    characters = character_extractor.extract.next.get_text([110, 28, 218, 833])
+                                                           #top left bottom right
+    expected = [['AANONSEN, DEBORAH, A', '', 'STATEN ISLAND, NY', 'MEALS', '$85.00'],
+                ['TOTAL', '', '', '','$85.00'],
+                ['AARON, CAREN, T', '', 'RICHMOND, VA', 'EDUCATIONAL ITEMS', '$78.80'],
+                ['AARON, CAREN, T', '', 'RICHMOND, VA', 'MEALS', '$392.45'],
+                ['TOTAL', '', '', '', '$471.25'],
+                ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
+                ['TOTAL', '', '', '','$20.39'],
+                ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
+                ['AARON, JOSHUA, N', 'REGIONAL PULMONARY & SLEEP MEDICINE', 'WEST GROVE, PA', 'SPEAKING FEES', '$4,700.00'],
+                ['TOTAL', '', '', '', '$5,010.33'],
+                ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
+                ['TOTAL', '', '', '', '$193.67'],
+                ['AARON, MICHAEL, L', '', 'WEST ISLIP, NY', 'MEALS', '$19.50']]
+    assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
+  end
   # TODO Spaces inserted in words - fails
   def test_bo_page24
     character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))

metadata CHANGED Viewed

@@ -2,14 +2,16 @@
 name: tabula-extractor
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.5.1
+  version: 0.6.1
 platform: java
 authors:
 - Manuel Aristarán
+- Jeremy B. Merill
+- Mike Tigas
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-06-14 00:00:00.000000000 Z
+date: 2013-06-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: minitest
@@ -93,6 +95,9 @@ files:
 - ext/liblsd64.dll
 - ext/lsd.c
 - ext/lsd.h
+- lib/geom/point.rb
+- lib/geom/rectangle.rb
+- lib/geom/segment.rb
 - lib/tabula.rb
 - lib/tabula/core_ext.rb
 - lib/tabula/entities.rb
@@ -100,6 +105,7 @@ files:
 - lib/tabula/pdf_dump.rb
 - lib/tabula/pdf_render.rb
 - lib/tabula/table_extractor.rb
+- lib/tabula/table_guesser.rb
 - lib/tabula/version.rb
 - lib/tabula/whitespace.rb
 - lib/tabula/writers.rb
@@ -108,6 +114,7 @@ files:
 - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
 - test/data/argentina_diputados_voting_record.pdf
 - test/data/bo_page24.pdf
+- test/data/frx_2012_disclosure.pdf
 - test/data/gre.pdf
 - test/data/tabla_subsidios.pdf
 - test/tests.rb
@@ -147,6 +154,7 @@ test_files:
 - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
 - test/data/argentina_diputados_voting_record.pdf
 - test/data/bo_page24.pdf
+- test/data/frx_2012_disclosure.pdf
 - test/data/gre.pdf
 - test/data/tabla_subsidios.pdf
 - test/tests.rb