RubyGems - tabula-extractor - Versions diffs - 0.0.1-java → 0.5.0-java - Mend

tabula-extractor 0.0.1-java → 0.5.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

data/.travis.yml +6 -0
data/Gemfile +0 -3
data/README.md +19 -2
data/Rakefile +4 -5
data/bin/tabula +27 -7
data/ext/COPYING +661 -0
data/ext/Makefile.OSX +15 -0
data/ext/Makefile.defaults +9 -0
data/ext/Makefile.linux32 +11 -0
data/ext/Makefile.linux64 +12 -0
data/ext/Makefile.mingw +10 -0
data/ext/liblsd-linux32.so +0 -0
data/ext/liblsd-linux64.so +0 -0
data/ext/liblsd.def +3 -0
data/ext/liblsd.dll +0 -0
data/ext/liblsd.dylib +0 -0
data/ext/lsd.c +2270 -0
data/ext/lsd.h +283 -0
data/lib/tabula.rb +6 -0
data/lib/tabula/core_ext.rb +21 -0
data/lib/tabula/entities.rb +141 -20
data/lib/tabula/line_segment_detector.rb +99 -0
data/lib/tabula/pdf_dump.rb +10 -8
data/lib/tabula/pdf_render.rb +64 -0
data/lib/tabula/table_extractor.rb +19 -20
data/lib/tabula/version.rb +1 -1
data/lib/tabula/writers.rb +1 -1
data/tabula-extractor.gemspec +3 -2
data/target/{pdfbox-app-1.8.0.jar → pdfbox-app-2.0.0-SNAPSHOT.jar} +0 -0
data/test/tests.rb +7 -6
metadata +22 -5

data/ext/lsd.h ADDED Viewed

@@ -0,0 +1,283 @@
+/*----------------------------------------------------------------------------
+  LSD - Line Segment Detector on digital images
+  This code is part of the following publication and was subject
+  to peer review:
+    "LSD: a Line Segment Detector" by Rafael Grompone von Gioi,
+    Jeremie Jakubowicz, Jean-Michel Morel, and Gregory Randall,
+    Image Processing On Line, 2012. DOI:10.5201/ipol.2012.gjmr-lsd
+    http://dx.doi.org/10.5201/ipol.2012.gjmr-lsd
+  Copyright (c) 2007-2011 rafael grompone von gioi <grompone@gmail.com>
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Affero General Public License as
+  published by the Free Software Foundation, either version 3 of the
+  License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU Affero General Public License for more details.
+  You should have received a copy of the GNU Affero General Public License
+  along with this program. If not, see <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------------*/
+/*----------------------------------------------------------------------------*/
+/** @file lsd.h
+    LSD module header
+    @author rafael grompone von gioi <grompone@gmail.com>
+ */
+/*----------------------------------------------------------------------------*/
+#ifndef LSD_HEADER
+#define LSD_HEADER
+/*----------------------------------------------------------------------------*/
+/** LSD Full Interface
+    @param n_out       Pointer to an int where LSD will store the number of
+                       line segments detected.
+    @param img         Pointer to input image data. It must be an array of
+                       doubles of size X x Y, and the pixel at coordinates
+                       (x,y) is obtained by img[x+y*X].
+    @param X           X size of the image: the number of columns.
+    @param Y           Y size of the image: the number of rows.
+    @param scale       When different from 1.0, LSD will scale the input image
+                       by 'scale' factor by Gaussian filtering, before detecting
+                       line segments.
+                       Example: if scale=0.8, the input image will be subsampled
+                       to 80% of its size, before the line segment detector
+                       is applied.
+                       Suggested value: 0.8
+    @param sigma_scale When scale!=1.0, the sigma of the Gaussian filter is:
+                       sigma = sigma_scale / scale,   if scale <  1.0
+                       sigma = sigma_scale,           if scale >= 1.0
+                       Suggested value: 0.6
+    @param quant       Bound to the quantization error on the gradient norm.
+                       Example: if gray levels are quantized to integer steps,
+                       the gradient (computed by finite differences) error
+                       due to quantization will be bounded by 2.0, as the
+                       worst case is when the errors are 1 and -1, which
+                       gives an error of 2.0.
+                       Suggested value: 2.0
+    @param ang_th      Gradient angle tolerance in the region growing
+                       algorithm, in degrees.
+                       Suggested value: 22.5
+    @param log_eps     Detection threshold, accept if -log10(NFA) > log_eps.
+                       The larger the value, the more strict the detector is,
+                       and the fewer detections it will produce.
+                       (Note that the 'minus sign' makes this behavior
+                       the opposite of that of NFA.)
+                       The value -log10(NFA) is equivalent but more
+                       intuitive than NFA:
+                       - -1.0 gives an average of 10 false detections on noise
+                       -  0.0 gives an average of 1 false detection on noise
+                       -  1.0 gives an average of 0.1 false detections on noise
+                       -  2.0 gives an average of 0.01 false detections on noise
+                       .
+                       Suggested value: 0.0
+    @param density_th  Minimal proportion of 'supporting' points in a rectangle.
+                       Suggested value: 0.7
+    @param n_bins      Number of bins used in the pseudo-ordering of gradient
+                       modulus.
+                       Suggested value: 1024
+    @param reg_img     Optional output: if desired, LSD will return an
+                       int image where each pixel indicates the line segment
+                       to which it belongs. Unused pixels have the value '0',
+                       while the used ones have the number of the line segment,
+                       numbered 1,2,3,..., in the same order as in the
+                       output list. If desired, a non-NULL int** pointer must
+                       be passed, and LSD will make that pointer point
+                       to an int array of size reg_x x reg_y, where the pixel
+                       value at (x,y) is obtained with (*reg_img)[x+y*reg_x].
+                       Note that the resulting image has the size of the image
+                       used for the processing, that is, the size of the input
+                       image scaled by the given factor 'scale'. If scale!=1
+                       this size differs from XxY and that is the reason why
+                       its value is given by reg_x and reg_y.
+                       Suggested value: NULL
+    @param reg_x       Pointer to an int where LSD will put the X size
+                       of the 'reg_img' image, when asked for.
+                       Suggested value: NULL
+    @param reg_y       Pointer to an int where LSD will put the Y size
+                       of the 'reg_img' image, when asked for.
+                       Suggested value: NULL
+    @return            A double array of size 7 x n_out, containing the list
+                       of line segments detected. The array contains first
+                       7 values of line segment number 1, then the 7 values
+                       of line segment number 2, and so on, and it finishes
+                       with the 7 values of line segment number n_out.
+                       The seven values are:
+                       - x1,y1,x2,y2,width,p,-log10(NFA)
+                       .
+                       for a line segment from coordinates (x1,y1) to (x2,y2),
+                       a width 'width', an angle precision of p in (0,1) given
+                       by angle_tolerance/180 degree, and NFA value 'NFA'.
+                       If 'out' is the returned pointer, the 7 values of
+                       line segment number 'n+1' are obtained with
+                       'out[7*n+0]' to 'out[7*n+6]'.
+ */
+double * LineSegmentDetection( int * n_out,
+                               double * img, int X, int Y,
+                               double scale, double sigma_scale, double quant,
+                               double ang_th, double log_eps, double density_th,
+                               int n_bins,
+                               int ** reg_img, int * reg_x, int * reg_y );
+/*----------------------------------------------------------------------------*/
+/** LSD Simple Interface with Scale and Region output.
+    @param n_out       Pointer to an int where LSD will store the number of
+                       line segments detected.
+    @param img         Pointer to input image data. It must be an array of
+                       doubles of size X x Y, and the pixel at coordinates
+                       (x,y) is obtained by img[x+y*X].
+    @param X           X size of the image: the number of columns.
+    @param Y           Y size of the image: the number of rows.
+    @param scale       When different from 1.0, LSD will scale the input image
+                       by 'scale' factor by Gaussian filtering, before detecting
+                       line segments.
+                       Example: if scale=0.8, the input image will be subsampled
+                       to 80% of its size, before the line segment detector
+                       is applied.
+                       Suggested value: 0.8
+    @param reg_img     Optional output: if desired, LSD will return an
+                       int image where each pixel indicates the line segment
+                       to which it belongs. Unused pixels have the value '0',
+                       while the used ones have the number of the line segment,
+                       numbered 1,2,3,..., in the same order as in the
+                       output list. If desired, a non-NULL int** pointer must
+                       be passed, and LSD will make that pointer point
+                       to an int array of size reg_x x reg_y, where the pixel
+                       value at (x,y) is obtained with (*reg_img)[x+y*reg_x].
+                       Note that the resulting image has the size of the image
+                       used for the processing, that is, the size of the input
+                       image scaled by the given factor 'scale'. If scale!=1
+                       this size differs from XxY and that is the reason why
+                       its value is given by reg_x and reg_y.
+                       Suggested value: NULL
+    @param reg_x       Pointer to an int where LSD will put the X size
+                       of the 'reg_img' image, when asked for.
+                       Suggested value: NULL
+    @param reg_y       Pointer to an int where LSD will put the Y size
+                       of the 'reg_img' image, when asked for.
+                       Suggested value: NULL
+    @return            A double array of size 7 x n_out, containing the list
+                       of line segments detected. The array contains first
+                       7 values of line segment number 1, then the 7 values
+                       of line segment number 2, and so on, and it finishes
+                       with the 7 values of line segment number n_out.
+                       The seven values are:
+                       - x1,y1,x2,y2,width,p,-log10(NFA)
+                       .
+                       for a line segment from coordinates (x1,y1) to (x2,y2),
+                       a width 'width', an angle precision of p in (0,1) given
+                       by angle_tolerance/180 degree, and NFA value 'NFA'.
+                       If 'out' is the returned pointer, the 7 values of
+                       line segment number 'n+1' are obtained with
+                       'out[7*n+0]' to 'out[7*n+6]'.
+ */
+double * lsd_scale_region( int * n_out,
+                           double * img, int X, int Y, double scale,
+                           int ** reg_img, int * reg_x, int * reg_y );
+/*----------------------------------------------------------------------------*/
+/** LSD Simple Interface with Scale
+    @param n_out       Pointer to an int where LSD will store the number of
+                       line segments detected.
+    @param img         Pointer to input image data. It must be an array of
+                       doubles of size X x Y, and the pixel at coordinates
+                       (x,y) is obtained by img[x+y*X].
+    @param X           X size of the image: the number of columns.
+    @param Y           Y size of the image: the number of rows.
+    @param scale       When different from 1.0, LSD will scale the input image
+                       by 'scale' factor by Gaussian filtering, before detecting
+                       line segments.
+                       Example: if scale=0.8, the input image will be subsampled
+                       to 80% of its size, before the line segment detector
+                       is applied.
+                       Suggested value: 0.8
+    @return            A double array of size 7 x n_out, containing the list
+                       of line segments detected. The array contains first
+                       7 values of line segment number 1, then the 7 values
+                       of line segment number 2, and so on, and it finishes
+                       with the 7 values of line segment number n_out.
+                       The seven values are:
+                       - x1,y1,x2,y2,width,p,-log10(NFA)
+                       .
+                       for a line segment from coordinates (x1,y1) to (x2,y2),
+                       a width 'width', an angle precision of p in (0,1) given
+                       by angle_tolerance/180 degree, and NFA value 'NFA'.
+                       If 'out' is the returned pointer, the 7 values of
+                       line segment number 'n+1' are obtained with
+                       'out[7*n+0]' to 'out[7*n+6]'.
+ */
+double * lsd_scale(int * n_out, double * img, int X, int Y, double scale);
+/*----------------------------------------------------------------------------*/
+/** LSD Simple Interface
+    @param n_out       Pointer to an int where LSD will store the number of
+                       line segments detected.
+    @param img         Pointer to input image data. It must be an array of
+                       doubles of size X x Y, and the pixel at coordinates
+                       (x,y) is obtained by img[x+y*X].
+    @param X           X size of the image: the number of columns.
+    @param Y           Y size of the image: the number of rows.
+    @return            A double array of size 7 x n_out, containing the list
+                       of line segments detected. The array contains first
+                       7 values of line segment number 1, then the 7 values
+                       of line segment number 2, and so on, and it finishes
+                       with the 7 values of line segment number n_out.
+                       The seven values are:
+                       - x1,y1,x2,y2,width,p,-log10(NFA)
+                       .
+                       for a line segment from coordinates (x1,y1) to (x2,y2),
+                       a width 'width', an angle precision of p in (0,1) given
+                       by angle_tolerance/180 degree, and NFA value 'NFA'.
+                       If 'out' is the returned pointer, the 7 values of
+                       line segment number 'n+1' are obtained with
+                       'out[7*n+0]' to 'out[7*n+6]'.
+ */
+double * lsd(int * n_out, double * img, int X, int Y);
+void free_values(double * p);
+#endif /* !LSD_HEADER */
+/*----------------------------------------------------------------------------*/

data/lib/tabula.rb CHANGED Viewed

@@ -1,5 +1,11 @@
+module Tabula
+  PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
+end
 require_relative './tabula/version'
 require_relative './tabula/entities'
 require_relative './tabula/pdf_dump'
 require_relative './tabula/table_extractor'
 require_relative './tabula/writers'
+require_relative './tabula/line_segment_detector'
+require_relative './tabula/pdf_render'

data/lib/tabula/core_ext.rb ADDED Viewed

@@ -0,0 +1,21 @@
+module Enumerable
+  def sum
+    self.inject(0){|accum, i| accum + i }
+  end
+  def mean
+    self.sum/self.length.to_f
+  end
+  def sample_variance
+    m = self.mean
+    sum = self.inject(0){|accum, i| accum +(i-m)**2 }
+    sum/(self.length - 1).to_f
+  end
+  def standard_deviation
+    return Math.sqrt(self.sample_variance)
+  end
+end

data/lib/tabula/entities.rb CHANGED Viewed

@@ -96,8 +96,13 @@ module Tabula
     # get text, optionally from a provided area in the page [top, left, bottom, right]
     def get_text(area=nil)
       area = [0, 0, width, height] if area.nil?
-      ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
-      self.texts.select { |t| t.overlaps? ze }
+      # spaces are not detected, b/c they have height == 0
+      # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
+      # self.texts.select { |t| t.overlaps? ze }
+      self.texts.select { |t|
+        t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
+      }
     end
     def to_json(options={})
@@ -112,15 +117,17 @@ module Tabula
   end
   class TextElement < ZoneEntity
-    attr_accessor :font, :font_size, :text
+    attr_accessor :font, :font_size, :text, :width_of_space
     CHARACTER_DISTANCE_THRESHOLD = 1.5
+    TOLERANCE_FACTOR = 0.25
-    def initialize(top, left, width, height, font, font_size, text)
+    def initialize(top, left, width, height, font, font_size, text, width_of_space)
       super(top, left, width, height)
       self.font = font
       self.font_size = font_size
       self.text = text
+      self.width_of_space = width_of_space
     end
     # more or less returns True if distance < tolerance
@@ -128,7 +135,7 @@ module Tabula
       raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
       overlaps = self.vertically_overlaps?(other)
-      tolerance = ((self.font_size + other.font_size) / 2) * 0.25
+      tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
       overlaps or
         (self.height == 0 and other.height != 0) or
@@ -141,13 +148,13 @@ module Tabula
       raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
       overlaps = self.vertically_overlaps?(other)
-      tolerance = ((self.font_size + other.font_size) / 2) * 0.25
+      up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
+      down_tolerance = 0.95
-      dist = self.horizontal_distance(other)
-      overlaps or
-        (self.height == 0 and other.height != 0) or
-        (other.height == 0 and self.height != 0) and
-        ((tolerance <= dist) and (dist < tolerance*CHARACTER_DISTANCE_THRESHOLD))
+      dist = self.horizontal_distance(other).abs
+      rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
+      rv
     end
     def merge!(other)
@@ -235,25 +242,139 @@ module Tabula
     def inspect
       vars = (self.instance_variables - [:@text_elements]).map{ |v| "#{v}=#{instance_variable_get(v).inspect}" }
       texts = self.text_elements.sort_by { |te| te.top }.map { |te| te.text }
-      "<#{self.class}: #{vars.join(', ')}, @text_elements=#{texts.join(', ')}>"
+      "<#{self.class}: #{vars.join(', ')}, @text_elements=[#{texts.join('], [')}]>"
     end
   end
+  require_relative './core_ext'
   class Ruling < ZoneEntity
-    attr_accessor :color
+    # 2D line intersection test taken from comp.graphics.algorithms FAQ
+    def intersects?(other)
+      r = ((self.top-other.top)*(other.right-other.left) - (self.left-other.left)*(other.bottom-other.top)) \
+      / ((self.right-self.left)*(other.bottom-other.top)-(self.bottom-self.top)*(other.right-other.left))
-    def initialize(top, left, width, height, color)
-      super(top, left, width, height)
-      self.color = color
+        s = ((self.top-other.top)*(self.right-self.left) - (self.left-other.left)*(self.bottom-self.top)) \
+            / ((self.right-self.left)*(other.bottom-other.top) - (self.bottom-self.top)*(other.right-other.left))
+      r >= 0 and r < 1 and s >= 0 and s < 1
     end
-    def to_h
-      hash = super
-      hash[:color] = self.color
-      hash
+    def vertical?
+      left == right
+    end
+    def horizontal?
+      top == bottom
+    end
+    def to_json(arg)
+      [left, top, right, bottom].to_json
+    end
+    def to_xml
+      "<ruling x1=\"%.2f\" y1=\"%.2f\" x2=\"%.2f\" y2=\"%.2f\" />" \
+      % [left, top, right, bottom]
+    end
+    def self.clean_rulings(rulings, max_distance=4)
+      # merge horizontal and vertical lines
+      # TODO this should be iterative
+      skip = false
+      horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
+        .group_by(&:top)
+        .values.reduce([]) { |memo, rs|
+        rs = rs.sort_by(&:left)
+        memo << if rs.size > 1
+                  Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
+                else
+                  rs.first
+                end
+      }
+      .sort_by(&:top)
+      h = []
+      horiz.size.times do |i|
+        if i == horiz.size - 1
+          h << horiz[-1]
+          break
+        end
+        if skip
+          skip = false;
+          next
+        end
+        d = (horiz[i+1].top - horiz[i].top).abs
+        h << if d < 4 # THRESHOLD DISTANCE between horizontal lines
+               skip = true
+               Tabula::Ruling.new(horiz[i].top + d / 2, [horiz[i].left, horiz[i+1].left].min, [horiz[i+1].width.abs, horiz[i].width.abs].max, 0)
+             else
+               horiz[i]
+             end
+      end
+      horiz = h
+      vert = rulings.select { |r| r.vertical? && r.height > max_distance }
+        .group_by(&:left)
+        .values.reduce([]) { |memo, rs|
+        rs = rs.sort_by(&:top)
+        memo << if rs.size > 1
+                  Tabula::Ruling.new(rs[0].top, rs[0].left, 0, rs[-1].bottom - rs[0].top)
+                else rs.first
+                  rs.first
+                end
+        }
+        .sort_by(&:left)
+      v = []
+      vert.size.times do |i|
+        if i == vert.size - 1
+          v << vert[-1]
+          break
+        end
+        if skip
+          skip = false;
+          next
+        end
+        d = (vert[i+1].left - vert[i].left).abs
+        v << if d < 4 # THRESHOLD DISTANCE between vertical lines
+               skip = true
+               Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
+             else
+               vert[i]
+             end
+      end
+      vert = v
+      # - only keep horizontal rulings that intersect with at least one vertical ruling
+      # - only keep vertical rulings that intersect with at least one horizontal ruling
+      # yeah, it's a naive heuristic. but hey, it works.
+      # h_mean =  horiz.reduce(0) { |accum, i| accum + i.width } / horiz.size
+      # horiz.reject { |h| h.width < h_mean }
+      #vert.delete_if  { |v| !horiz.any? { |h| h.intersects?(v) } } unless horiz.empty?
+      #horiz.delete_if { |h| !vert.any?  { |v| v.intersects?(h) } } unless vert.empty?
+      return horiz += vert
     end
   end
 end