RubyGems - tabula-extractor - Versions diffs - 0.7.2-java → 0.7.4-java - Mend

tabula-extractor 0.7.2-java → 0.7.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)

checksums.yaml +4 -4
data/.gitignore +1 -0
data/README.md +4 -8
data/bin/tabula +3 -3
data/lib/tabula.rb +9 -5
data/lib/tabula/entities.rb +1 -0
data/lib/tabula/entities/cell.rb +6 -4
data/lib/tabula/entities/has_cells.rb +22 -78
data/lib/tabula/entities/line.rb +52 -6
data/lib/tabula/entities/page.rb +43 -50
data/lib/tabula/entities/ruling.rb +83 -105
data/lib/tabula/entities/spreadsheet.rb +74 -11
data/lib/tabula/entities/table.rb +55 -37
data/lib/tabula/entities/tabular.rb +42 -0
data/lib/tabula/entities/text_chunk.rb +55 -52
data/lib/tabula/entities/text_element.rb +129 -62
data/lib/tabula/entities/zone_entity.rb +15 -6
data/lib/tabula/extraction.rb +114 -49
data/lib/tabula/line_segment_detector.rb +0 -5
data/lib/tabula/table_extractor.rb +32 -37
data/lib/tabula/version.rb +1 -1
data/tabula-extractor.gemspec +2 -5
metadata +13 -95
data/ext/COPYING +0 -661
data/ext/Makefile.OSX +0 -18
data/ext/Makefile.defaults +0 -9
data/ext/Makefile.linux32 +0 -11
data/ext/Makefile.linux64 +0 -12
data/ext/Makefile.mingw +0 -10
data/ext/Makefile.mingw64 +0 -10
data/ext/liblsd-linux32.so +0 -0
data/ext/liblsd-linux64.so +0 -0
data/ext/liblsd.def +0 -3
data/ext/liblsd.dll +0 -0
data/ext/liblsd.dylib +0 -0
data/ext/liblsd64.dll +0 -0
data/ext/lsd.c +0 -2270
data/ext/lsd.h +0 -283
data/test/data/47008204D_USA.page4.pdf +0 -0
data/test/data/560015757GV_China.page1.pdf +0 -0
data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
data/test/data/GSK_2012_Q4.page437.pdf +0 -0
data/test/data/S2MNCEbirdisland.pdf +0 -0
data/test/data/argentina_diputados_voting_record.pdf +0 -0
data/test/data/bo_page24.pdf +0 -0
data/test/data/campaign_donors.pdf +0 -0
data/test/data/frx_2012_disclosure.pdf +0 -0
data/test/data/frx_2012_disclosure.tsv +0 -88
data/test/data/gre.pdf +0 -0
data/test/data/no_tables.pdf +0 -0
data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
data/test/data/puertos1.pdf +0 -0
data/test/data/spanning_cells.csv +0 -21
data/test/data/spanning_cells.pdf +0 -0
data/test/data/strongschools.pdf +0 -0
data/test/data/sydney_disclosure_contract.pdf +0 -0
data/test/data/tabla_subsidios.pdf +0 -0
data/test/data/vertical_rulings_bug.pdf +0 -0
data/test/data/vietnam3.pdf +0 -0
data/test/data/wc2012.pdf +0 -0
data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
data/test/heuristic.rb +0 -50
data/test/test_bin_tabula.sh +0 -7
data/test/tests.rb +0 -603

data/lib/tabula/entities/tabular.rb ADDED

@@ -0,0 +1,42 @@
+module Tabula
+  module AbstractInterface
+    class InterfaceNotImplementedError < NoMethodError
+    end
+    def self.included(klass)
+      klass.send(:include, AbstractInterface::Methods)
+      klass.send(:extend, AbstractInterface::Methods)
+    end
+    module Methods
+      def api_not_implemented(klass)
+        caller.first.match(/in \`(.+)\'/)
+        method_name = $1
+        raise AbstractInterface::InterfaceNotImplementedError.new("#{klass.class.name} needs to implement '#{method_name}' for interface #{self.name}!")
+      end
+    end
+  end
+  module Tabular
+    include AbstractInterface
+    # this is a pseudo-interface as described here:
+    # http://metabates.com/2011/02/07/building-interfaces-and-abstract-classes-in-ruby/
+    # Table and Spreadsheet implement this interface, so should any class
+    # intended to represent tabular data from a PDF, e.g. if another extraction
+    # method were created, so that Tabula GUI and API can correctly handle
+    # its data.
+    def extraction_method; raise Tabular.api_not_implemented(self); end
+    def page; Tabular.api_not_implemented(self); end
+    def rows; Tabular.api_not_implemented(self); end
+    def cols; Tabular.api_not_implemented(self); end
+    def to_csv; Tabular.api_not_implemented(self); end
+    def to_tsv; Tabular.api_not_implemented(self); end
+    def to_a; Tabular.api_not_implemented(self); end
+    def to_json; Tabular.api_not_implemented(self); end
+  end
+end

data/lib/tabula/entities/text_chunk.rb CHANGED

@@ -8,46 +8,71 @@ module Tabula
     # initialize a new TextChunk from a TextElement
     def self.create_from_text_element(text_element)
       raise TypeError, "argument is not a TextElement" unless text_element.instance_of?(TextElement)
-      tc = self.new(text_element.top, text_element.left, text_element.width, text_element.height)
+      tc = self.new(*text_element.tlwh)
       tc.text_elements = [text_element]
       return tc
     end
-    ##
-    # group an iterable of TextChunk into a list of Line
     def self.group_by_lines(text_chunks)
-      lines = []
-      text_chunks.each do |te|
-        next if te.text =~ ONLY_SPACES_RE
-        l = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
-        if l.nil?
-          l = Line.new
-          lines << l
+      bbwidth = text_chunks.max_by(&:right).right - text_chunks.min_by(&:left).left
+      l = Line.new
+      l << text_chunks.first
+      lines = text_chunks[1..-1].inject([l]) do |lines, te|
+        if lines.last.horizontal_overlap_ratio(te) < 0.01
+          # skip lines such that:
+          # - are wider than the 90% of the width of the text_chunks bounding box
+          # - it contains a single repeated character
+          if lines.last.width / bbwidth > 0.9 \
+            && l.text_elements.all? { |te| te.text =~  SAME_CHAR_RE }
+            lines.pop
+          end
+          lines << Line.new
         end
-        l << te
+        lines.last << te
+        lines
       end
-      lines
+      if lines.last.width / bbwidth > 0.9 \
+         && l.text_elements.all? { |te| te.text =~ SAME_CHAR_RE }
+        lines.pop
+      end
+      lines.map!(&:remove_sequential_spaces!)
     end
     ##
-    # calculate estimated columns from an iterable of TextChunk
-    def self.column_positions(top, text_chunks)
-      right = 0
-      columns = []
-      text_chunks.each do |te|
-        next if te.text =~ ONLY_SPACES_RE
-        if te.top >= top
-          left = te.left
-          if (left > right)
-            columns << right
-            right = te.right
-          elsif te.right > right
-            right = te.right
+    # returns a list of column boundaries (x axis)
+    # +lines+ must be an array of lines sorted by their +top+ attribute
+    def self.column_positions(lines)
+      init = lines.first.text_elements.inject([]) { |memo, text_chunk|
+        next memo if text_chunk.text =~ ONLY_SPACES_RE
+        memo << Tabula::ZoneEntity.new(*text_chunk.tlwh)
+        memo
+      }
+      regions = lines[1..-1]
+        .inject(init) do |column_regions, line|
+        line_text_elements = line.text_elements.clone.select { |te| te.text !~ ONLY_SPACES_RE }
+        column_regions.each do |cr|
+          overlaps = line_text_elements
+            .select { |te| te.text !~ ONLY_SPACES_RE && cr.horizontally_overlaps?(te) }
+          overlaps.inject(cr) do |memo, te|
+            cr.merge!(te)
           end
+          line_text_elements = line_text_elements - overlaps
         end
+        column_regions += line_text_elements.map { |te| Tabula::ZoneEntity.new(*te.tlwh) }
       end
-      columns
+      regions.map { |r| r.right.round(2) }.uniq
     end
     ##
@@ -59,10 +84,10 @@ module Tabula
     def merge!(other)
       if other.instance_of?(TextChunk)
-        if self.horizontally_overlaps?(other) && other.top < self.top
-          self.text_elements = other.text_elements + self.text_elements
-        else
+        if (self <=> other) < 0
           self.text_elements = self.text_elements + other.text_elements
+        else
+          self.text_elements = other.text_elements + self.text_elements
         end
       end
       super(other)
@@ -75,28 +100,6 @@ module Tabula
       raise "Not Implemented"
     end
-    ##
-    # remove leading and trailing whitespace
-    # (changes geometry accordingly)
-    # TODO horrible implementation - fix.
-    def strip!
-      acc = 0
-      new_te = self.text_elements.drop_while { |te|
-        te.text == ' ' && acc += 1
-      }
-      self.left += self.text_elements.take(acc).inject(0) { |m, te| m += te.width }
-      self.text_elements = new_te
-      self.text_elements.reverse!
-      acc = 0
-      new_te = self.text_elements.drop_while { |te|
-        te.text == ' ' && acc += 1
-      }
-      self.right -= self.text_elements.take(acc).inject(0) { |m, te| m += te.width }
-      self.text_elements = new_te.reverse
-      self
-    end
     def text
       self.text_elements.map(&:text).join
     end

data/lib/tabula/entities/text_element.rb CHANGED

@@ -1,4 +1,6 @@
+# -*- coding: utf-8 -*-
 module Tabula
   ##
   # a Glyph
   class TextElement < ZoneEntity
@@ -17,8 +19,20 @@ module Tabula
     EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
+    def self.within(first, second, variance )
+      second < first + variance && second > first - variance
+    end
+    def self.overlap(y1, height1, y2, height2, variance=0.1)
+      within( y1, y2, variance) || (y2 <= y1 && y2 >= y1 - height1) \
+      || (y1 <= y2 && y1 >= y2-height2)
+    end
     ##
     # heuristically merge an iterable of TextElement into a list of TextChunk
+    # lots of ideas taken from PDFBox's PDFTextStripper.writePage
+    # here be dragons
     def self.merge_words(text_elements, options={})
       default_options = {:vertical_rulings => []}
       options = default_options.merge(options)
@@ -28,74 +42,138 @@ module Tabula
       text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
+      previousAveCharWidth = text_chunks.first.width
+      endOfLastTextX = text_chunks.first.right
+      maxYForLine = text_chunks.first.bottom
+      maxHeightForLine = text_chunks.first.height
+      minYTopForLine = text_chunks.first.top
+      lastWordSpacing = -1
+      sp = nil
       text_elements.inject(text_chunks) do |chunks, char|
         current_chunk = chunks.last
         prev_char = current_chunk.text_elements.last
+        # Resets the average character width when we see a change in font
+        # or a change in the font size
+        if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
+          previousAveCharWidth = -1;
+        end
         # if same char AND overlapped, skip
-        if prev_char.text == char.text && prev_char.overlaps_with_ratio?(char, 0.85)
-          chunks
+        if (prev_char.text == char.text) && prev_char.overlaps_with_ratio?(char, 0.5)
+          next chunks
+        end
+        # if char is a space that overlaps with the prev_char, skip
+        if char.text == ' ' && prev_char.left == char.left && prev_char.top == char.top
+          next chunks
+        end
+        # any vertical ruling goes across prev_char and char?
+        across_vertical_ruling = vertical_ruling_locations.any? { |loc|
+          prev_char.left < loc && char.left > loc
+        }
+        # Estimate the expected width of the space based on the
+        # space character with some margin.
+        wordSpacing = char.width_of_space
+        deltaSpace  = 0
+        deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
+                       ::Float::MAX
+                     elsif lastWordSpacing < 0
+                       wordSpacing * 0.5 # 0.5 == spacingTolerance
+                     else
+                       ((wordSpacing + lastWordSpacing) / 2.0) * 0.5
+                     end
+        # Estimate the expected width of the space based on the
+        # average character width with some margin. This calculation does not
+        # make a true average (average of averages) but we found that it gave the
+        # best results after numerous experiments. Based on experiments we also found that
+        # .3 worked well.
+        averageCharWidth = if previousAveCharWidth < 0
+                             char.width / char.text.size
+                           else
+                             (previousAveCharWidth + (char.width / char.text.size)) / 2.0
+                           end
+        deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance
+        # Compares the values obtained by the average method and the wordSpacing method and picks
+        # the smaller number.
+        expectedStartOfNextWordX = -::Float::MAX
+        if endOfLastTextX != -1
+          expectedStartOfNextWordX = endOfLastTextX + [deltaCharWidth, deltaSpace].min
+        end
+        sameLine = true
+        if !overlap(char.bottom, char.height, maxYForLine, maxHeightForLine)
+          endOfLastTextX = -1
+          expectedStartOfNextWordX = -::Float::MAX
+          maxYForLine = -::Float::MAX
+          maxHeightForLine = -1
+          minYTopForLine = ::Float::MAX
+          sameLine = false
+        end
+        endOfLastTextX = char.right
+        # should we add a space?
+        if !across_vertical_ruling \
+          && sameLine \
+          && expectedStartOfNextWordX < char.left \
+          && !prev_char.text.end_with?(' ')
+          sp = self.new(prev_char.top,
+                        prev_char.right,
+                        expectedStartOfNextWordX - prev_char.right,
+                        prev_char.height,
+                        prev_char.font,
+                        prev_char.font_size,
+                        ' ',
+                        prev_char.width_of_space)
+          current_chunk << sp
         else
-          # any vertical ruling goes across prev_char and char?
-          across_vertical_ruling = vertical_ruling_locations.any? { |loc|
-            prev_char.left < loc && char.left > loc
-          }
-          # should we add a space?
-          if (prev_char.text != " ") && (char.text != " ") \
-            && !across_vertical_ruling \
-            && prev_char.should_add_space?(char)
-            sp = self.new(prev_char.top,
-                          prev_char.right,
-                          prev_char.width_of_space,
-                          prev_char.width_of_space, # width == height for spaces
-                          prev_char.font,
-                          prev_char.font_size,
-                          ' ',
-                          prev_char.width_of_space)
-            chunks.last << sp
-            prev_char = sp
-          end
-          # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
-          # that they ought to be merged by that account.
-          # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
-          # Why are both of those `.left`?, you might ask. The intuition is that a letter
-          # that starts on the left of a vertical ruling ought to remain on the left of it.
-          if !across_vertical_ruling && prev_char.should_merge?(char)
-            chunks.last << char
-          else
-            # create a new chunk
-            chunks << TextChunk.create_from_text_element(char)
-          end
-          chunks
+          sp = nil
         end
-      end
-    end
-    # more or less returns True if distance < tolerance
-    def should_merge?(other)
-      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
-      self.vertically_overlaps?(other) && self.horizontal_distance(other) < width_of_space * (1 + TOLERANCE_FACTOR) && !self.should_add_space?(other)
-    end
+        maxYForLine = [char.bottom, maxYForLine].max
+        maxHeightForLine = [maxHeightForLine, char.height].max
+        minYTopForLine = [minYTopForLine, char.top].min
+        # if sameLine
+        #   puts "prev: #{prev_char.text} - char: #{char.text} - diff: #{char.left - prev_char.right} - space: #{[deltaCharWidth, deltaSpace].min} - spacing: #{wordSpacing} - sp: #{!sp.nil?}"
+        # else
+        #   puts
+        # end
-    # more or less returns True if (tolerance <= distance < CHARACTER_DISTANCE_THRESHOLD*tolerance)
-    def should_add_space?(other)
-      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
-      return false if self.width_of_space.nan?
+        dist = (char.left - (sp ? sp.right : prev_char.right))
-      (self.vertically_overlaps?(other) &&
-        self.horizontal_distance(other).abs.between?(self.width_of_space * (1 - TOLERANCE_FACTOR), self.width_of_space * (1 + TOLERANCE_FACTOR))) ||
-      (self.vertical_distance(other) > self.height)
+        if !across_vertical_ruling \
+           && sameLine \
+           && (dist < 0 ? current_chunk.vertically_overlaps?(char) : dist < wordSpacing)
+          current_chunk << char
+        else
+          # create a new chunk
+          chunks << TextChunk.create_from_text_element(char)
+        end
+        lastWordSpacing = wordSpacing
+        previousAveCharWidth = sp ? (averageCharWidth + sp.width) / 2.0 : averageCharWidth
+        chunks
+      end
     end
     ##
     # merge this TextElement with another (adjust size and text content accordingly)
     def merge!(other)
       raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
-      if self.horizontally_overlaps?(other) and other.top < self.top
+      if (self <=> other) < 0
         self.text = other.text + self.text
       else
         self.text << other.text
@@ -115,16 +193,5 @@ module Tabula
       self.text.strip == other.text.strip
     end
-    # sort in lexicographic (reading) order
-    def <=>(other)
-      if self.vertically_overlaps?(other)
-        self.left <=> other.left
-      elsif self.top < other.top
-        -1
-      else
-        1
-      end
-    end
   end
 end

data/lib/tabula/entities/zone_entity.rb CHANGED

@@ -4,6 +4,7 @@ module Tabula
   class ZoneEntity < java.awt.geom.Rectangle2D::Float
+    # TODO used? remove if not.
     attr_accessor :texts
     def initialize(top, left, width, height)
@@ -11,6 +12,7 @@ module Tabula
       if left && top && width && height
         self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], left, top, width, height
       end
+      # TODO used? remove if not.
       self.texts = []
     end
@@ -21,18 +23,21 @@ module Tabula
       self.height = [self.bottom, other.bottom].max - top
       self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.left, self.top, self.width, self.height
+      self
     end
     ##
     # default sorting order for ZoneEntity objects
     # is lexicographical (left to right, top to bottom)
     def <=>(other)
-      return  1 if self.left > other.left
-      return -1 if self.left < other.left
-      return  0 if self.vertically_overlaps?(other)
-      return  1 if self.top  > other.top
-      return -1 if self.top  < other.top
-      return  0
+      yDifference = (self.bottom - other.bottom).abs
+      if yDifference < 0.1 ||
+          (other.bottom >= self.top && other.bottom <= self.bottom) ||
+          (self.bottom >= other.top && self.bottom <= other.bottom)
+        self.left <=> other.left
+      else
+        self.bottom <=> other.bottom
+      end
     end
     def to_json(options={})
@@ -47,6 +52,10 @@ module Tabula
       [top, left, bottom, right]
     end
+    def tlwh
+      [top, left, width, height]
+    end
     def points
       [ Point2D::Float.new(left, top),
         Point2D::Float.new(right, top),