RubyGems - tabula-extractor - Versions diffs - 0.6.6-java → 0.7.0-java - Mend

tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

checksums.yaml +7 -0
data/AUTHORS.md +1 -0
data/README.md +27 -11
data/bin/tabula +61 -19
data/ext/liblsd-linux32.so +0 -0
data/ext/liblsd-linux64.so +0 -0
data/ext/liblsd.dll +0 -0
data/ext/liblsd.dylib +0 -0
data/ext/liblsd64.dll +0 -0
data/ext/lsd.c +137 -137
data/ext/lsd.h +9 -9
data/lib/tabula.rb +20 -3
data/lib/tabula/core_ext.rb +261 -0
data/lib/tabula/entities.rb +11 -456
data/lib/tabula/entities/cell.rb +42 -0
data/lib/tabula/entities/has_cells.rb +244 -0
data/lib/tabula/entities/line.rb +39 -0
data/lib/tabula/entities/page.rb +269 -0
data/lib/tabula/entities/page_area.rb +7 -0
data/lib/tabula/entities/ruling.rb +300 -0
data/lib/tabula/entities/spreadsheet.rb +92 -0
data/lib/tabula/entities/table.rb +81 -0
data/lib/tabula/entities/text_chunk.rb +114 -0
data/lib/tabula/entities/text_element.rb +112 -0
data/lib/tabula/entities/zone_entity.rb +57 -0
data/lib/tabula/extraction.rb +327 -0
data/lib/tabula/line_segment_detector.rb +9 -7
data/lib/tabula/pdf_line_extractor.rb +319 -0
data/lib/tabula/pdf_render.rb +1 -5
data/lib/tabula/spreadsheet_extractor.rb +52 -0
data/lib/tabula/table_extractor.rb +50 -348
data/lib/tabula/table_guesser.rb +21 -23
data/lib/tabula/version.rb +1 -1
data/lib/tabula/writers.rb +5 -6
data/tabula-extractor.gemspec +1 -0
data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
data/test/data/47008204D_USA.page4.pdf +0 -0
data/test/data/560015757GV_China.page1.pdf +0 -0
data/test/data/GSK_2012_Q4.page437.pdf +0 -0
data/test/data/S2MNCEbirdisland.pdf +0 -0
data/test/data/campaign_donors.pdf +0 -0
data/test/data/frx_2012_disclosure.tsv +88 -0
data/test/data/no_tables.pdf +0 -0
data/test/data/puertos1.pdf +0 -0
data/test/data/spanning_cells.csv +21 -0
data/test/data/spanning_cells.pdf +0 -0
data/test/data/strongschools.pdf +0 -0
data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
data/test/data/vietnam3.pdf +0 -0
data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
data/test/heuristic.rb +50 -0
data/test/test_bin_tabula.sh +7 -0
data/test/tests.rb +476 -63
metadata +79 -28
data/lib/geom/point.rb +0 -21
data/lib/geom/rectangle.rb +0 -101
data/lib/geom/segment.rb +0 -82
data/lib/tabula/pdf_dump.rb +0 -132
data/lib/tabula/whitespace.rb +0 -50
data/vertical_rulings_bug.rb +0 -29

data/lib/tabula/entities/text_chunk.rb ADDED Viewed

@@ -0,0 +1,114 @@
+module Tabula
+  ##
+  # a "collection" of TextElements
+  class TextChunk < ZoneEntity
+    attr_accessor :font, :font_size, :text_elements, :width_of_space
+    ##
+    # initialize a new TextChunk from a TextElement
+    def self.create_from_text_element(text_element)
+      raise TypeError, "argument is not a TextElement" unless text_element.instance_of?(TextElement)
+      tc = self.new(text_element.top, text_element.left, text_element.width, text_element.height)
+      tc.text_elements = [text_element]
+      return tc
+    end
+    ##
+    # group an iterable of TextChunk into a list of Line
+    def self.group_by_lines(text_chunks)
+      lines = []
+      text_chunks.each do |te|
+        next if te.text =~ ONLY_SPACES_RE
+        l = lines.find { |line| line.horizontal_overlap_ratio(te) >= 0.01 }
+        if l.nil?
+          l = Line.new
+          lines << l
+        end
+        l << te
+      end
+      lines
+    end
+    ##
+    # calculate estimated columns from an iterable of TextChunk
+    def self.column_positions(text_chunks)
+      right = 0
+      columns = []
+      lines = TextChunk.group_by_lines(text_chunks)
+      top = lines.first.text_elements.map(&:top).min
+      text_chunks.each do |te|
+        next if te.text =~ ONLY_SPACES_RE
+        if te.top >= top
+          left = te.left
+          if (left > right)
+            columns << right
+            right = te.right
+          elsif te.right > right
+            right = te.right
+          end
+        end
+      end
+      columns
+    end
+    ##
+    # add a TextElement to this TextChunk
+    def <<(text_element)
+      self.text_elements << text_element
+      self.merge!(text_element)
+    end
+    def merge!(other)
+      if other.instance_of?(TextChunk)
+        if self.horizontally_overlaps?(other) && other.top < self.top
+          self.text_elements = other.text_elements + self.text_elements
+        else
+          self.text_elements = self.text_elements + other.text_elements
+        end
+      end
+      super(other)
+    end
+    ##
+    # split this TextChunk vertically
+    # (in place, returns the remaining chunk)
+    #
+    # Placeholder: the method is not implemented yet and always raises a
+    # RuntimeError with the message "Not Implemented"; `y` is ignored.
+    def split_vertically!(y)
+      raise "Not Implemented"
+    end
+    ##
+    # remove leading and trailing whitespace
+    # (changes geometry accordingly)
+    # TODO horrible implementation - fix.
+    def strip!
+      acc = 0
+      new_te = self.text_elements.drop_while { |te|
+        te.text == ' ' && acc += 1
+      }
+      self.left += self.text_elements.take(acc).inject(0) { |m, te| m += te.width }
+      self.text_elements = new_te
+      self.text_elements.reverse!
+      acc = 0
+      new_te = self.text_elements.drop_while { |te|
+        te.text == ' ' && acc += 1
+      }
+      self.right -= self.text_elements.take(acc).inject(0) { |m, te| m += te.width }
+      self.text_elements = new_te.reverse
+      self
+    end
+    def text
+      self.text_elements.map(&:text).join
+    end
+    ##
+    # debugging representation: top, left, bottom and right (each rounded to
+    # two decimals) followed by the chunk's text
+    def inspect
+      "#<TextChunk: #{self.top.round(2)},#{self.left.round(2)},#{self.bottom.round(2)},#{right.round(2)} '#{self.text}'>"
+    end
+    def to_h
+      super.merge(:text => self.text)
+    end
+  end
+end

data/lib/tabula/entities/text_element.rb ADDED Viewed

@@ -0,0 +1,112 @@
+module Tabula
+  ##
+  # a Glyph
+  class TextElement < ZoneEntity
+    attr_accessor :font, :font_size, :text, :width_of_space
+    TOLERANCE_FACTOR = 0.25
+    ##
+    # top, left, width, height are forwarded to ZoneEntity#initialize; font,
+    # font_size, text and width_of_space are stored through the accessors
+    # declared above.
+    def initialize(top, left, width, height, font, font_size, text, width_of_space)
+      super(top, left, width, height)
+      self.font = font
+      self.font_size = font_size
+      self.text = text
+      self.width_of_space = width_of_space
+    end
+    EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
+    ##
+    # heuristically merge an iterable of TextElement into a list of TextChunk
+    def self.merge_words(text_elements, options={})
+      default_options = {:vertical_rulings => []}
+      options = default_options.merge(options)
+      vertical_ruling_locations = options[:vertical_rulings].map(&:left) if options[:vertical_rulings]
+      return [] if text_elements.empty?
+      text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
+      text_elements.inject(text_chunks) do |chunks, char|
+        current_chunk = chunks.last
+        prev_char = current_chunk.text_elements.last
+        # any vertical ruling goes across prev_char and char?
+        across_vertical_ruling = vertical_ruling_locations.any? { |loc|
+          prev_char.left < loc && char.left > loc
+        }
+        # should we add a space?
+        if (prev_char.text != " ") && (char.text != " ") \
+          && !across_vertical_ruling \
+          && prev_char.should_add_space?(char)
+          sp = self.new(prev_char.top,
+                        prev_char.right,
+                        prev_char.width_of_space,
+                        prev_char.width_of_space, # width == height for spaces
+                        prev_char.font,
+                        prev_char.font_size,
+                        ' ',
+                        prev_char.width_of_space)
+          chunks.last << sp
+          prev_char = sp
+        end
+        # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
+        # that they ought to be merged by that account.
+        # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
+        # Why are both of those `.left`?, you might ask. The intuition is that a letter
+        # that starts on the left of a vertical ruling ought to remain on the left of it.
+        if !across_vertical_ruling && prev_char.should_merge?(char)
+          chunks.last << char
+        else
+          # create a new chunk
+          chunks << TextChunk.create_from_text_element(char)
+        end
+        chunks
+      end
+    end
+    # more or less returns True if distance < tolerance
+    def should_merge?(other)
+      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
+      self.vertically_overlaps?(other) && self.horizontal_distance(other) < width_of_space * (1 + TOLERANCE_FACTOR) && !self.should_add_space?(other)
+    end
+    # more or less returns True if (tolerance <= distance < CHARACTER_DISTANCE_THRESHOLD*tolerance)
+    def should_add_space?(other)
+      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
+      return false if self.width_of_space.nan?
+      (self.vertically_overlaps?(other) &&
+        self.horizontal_distance(other).abs.between?(self.width_of_space * (1 - TOLERANCE_FACTOR), self.width_of_space * (1 + TOLERANCE_FACTOR))) ||
+      (self.vertical_distance(other) > self.height)
+    end
+    ##
+    # merge this TextElement with another (adjust size and text content accordingly)
+    def merge!(other)
+      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
+      if self.horizontally_overlaps?(other) and other.top < self.top
+        self.text = other.text + self.text
+      else
+        self.text << other.text
+      end
+      super(other)
+    end
+    def to_h
+      super.merge({:font => self.font, :text => self.text })
+    end
+    ##
+    # debugging representation: top, left, bottom and right (each rounded to
+    # two decimals) followed by the element's text
+    def inspect
+      "#<TextElement: #{self.top.round(2)},#{self.left.round(2)},#{self.bottom.round(2)},#{right.round(2)} '#{self.text}'>"
+    end
+    def ==(other)
+      self.text.strip == other.text.strip
+    end
+  end
+end

data/lib/tabula/entities/zone_entity.rb ADDED Viewed

@@ -0,0 +1,57 @@
+java_import java.awt.geom.Point2D
+module Tabula
+  class ZoneEntity < java.awt.geom.Rectangle2D::Float
+    attr_accessor :texts
+    ##
+    # Create the underlying rectangle and, only when all four arguments are
+    # truthy, set it with setRect. Note the argument order: this constructor
+    # takes (top, left, ...) while setRect receives (left, top, ...).
+    # `texts` starts out as an empty array.
+    def initialize(top, left, width, height)
+      super()
+      if left && top && width && height
+        # java_send with an explicit signature selects the float overload of setRect
+        self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], left, top, width, height
+      end
+      self.texts = []
+    end
+    ##
+    # Extend this entity using the bounds of `other`: top and left become the
+    # minimum of both, width and height are derived from the larger right and
+    # bottom, and the result is written back with setRect.
+    # NOTE(review): the top=/left=/width=/height= setters are defined outside
+    # this file, and `self.right` / `self.bottom` are read after top and left
+    # were reassigned. The statement order below therefore matters; confirm
+    # against those setters before reordering anything.
+    def merge!(other)
+      self.top    = [self.top, other.top].min
+      self.left   = [self.left, other.left].min
+      self.width  = [self.right, other.right].max - left
+      self.height = [self.bottom, other.bottom].max - top
+      self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.left, self.top, self.width, self.height
+    end
+    ##
+    # default sorting order for ZoneEntity objects
+    # is lexicographical (left to right, top to bottom)
+    def <=>(other)
+      return  1 if self.left > other.left
+      return -1 if self.left < other.left
+      return  0 if self.vertically_overlaps?(other)
+      return  1 if self.top  > other.top
+      return -1 if self.top  < other.top
+      return  0
+    end
+    ##
+    # JSON form of to_h. `options` is accepted for signature compatibility
+    # but is not forwarded to Hash#to_json.
+    def to_json(options={})
+      self.to_h.to_json
+    end
+    ##
+    # debugging representation: class name plus the result of
+    # dims(:top, :left, :width, :height); `dims` is defined outside this file.
+    def inspect
+      "#<#{self.class} dims: #{self.dims(:top, :left, :width, :height)}>"
+    end
+    ##
+    # the bounds as a four-element array: [top, left, bottom, right]
+    def tlbr
+      [top, left, bottom, right]
+    end
+    def points
+      [ Point2D::Float.new(left, top),
+        Point2D::Float.new(right, top),
+        Point2D::Float.new(right, bottom),
+        Point2D::Float.new(left, bottom) ]
+    end
+  end
+end

data/lib/tabula/extraction.rb ADDED Viewed

@@ -0,0 +1,327 @@
+java_import org.apache.pdfbox.pdfparser.PDFParser
+java_import org.apache.pdfbox.util.TextPosition
+java_import org.apache.pdfbox.pdmodel.PDDocument
+java_import org.apache.pdfbox.util.PDFTextStripper
+java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
+java_import java.awt.geom.AffineTransform
+module Tabula
+  module Extraction
+    def Extraction.openPDF(pdf_filename, password='')
+      raise Errno::ENOENT unless File.exists?(pdf_filename)
+      document = PDDocument.load(pdf_filename)
+      if document.isEncrypted
+        sdm = StandardDecryptionMaterial.new(password)
+        document.openProtection(sdm)
+      end
+      document
+    end
+    class ObjectExtractor < org.apache.pdfbox.pdfviewer.PageDrawer
+      attr_accessor :characters, :debug_text, :debug_clipping_paths, :clipping_paths, :options
+      field_accessor :pageSize, :page
+      PRINTABLE_RE = /[[:print:]]/
+      DEFAULT_OPTIONS = {
+        :line_color_filter => nil,
+        :extract_ruling_lines => true
+      }
+      def initialize(pdf_filename, pages=[1], password='', options={})
+        raise Errno::ENOENT unless File.exists?(pdf_filename)
+        @pdf_filename = pdf_filename
+        @pdf_file = Extraction.openPDF(pdf_filename, password)
+        @all_pages = @pdf_file.getDocumentCatalog.getAllPages
+        @pages = pages == :all ?  (1..@all_pages.size) : pages
+        super()
+        self.options = DEFAULT_OPTIONS.merge(options)
+        self.characters = []
+        @debug_clipping_paths = false
+        @clipping_path = nil
+        @transformed_clipping_path = nil
+        self.clipping_paths = []
+        @rulings = []
+        @min_char_width = @min_char_height = 1000000
+      end
+      def extract
+        Enumerator.new do |y|
+          begin
+            @pages.each do |i|
+              page = @all_pages.get(i-1)
+              contents = page.getContents
+              next if contents.nil?
+              self.clear!
+              self.drawPage(page)
+              p = Tabula::Page.new(@pdf_filename,
+                                   page.findCropBox.width,
+                                   page.findCropBox.height,
+                                   page.getRotation.to_i,
+                                   i, #one-indexed, just like `i` is.
+                                   self.characters,
+                                   self.rulings,
+                                   @min_char_width,
+                                   @min_char_height)
+              y.yield p
+            end
+          ensure
+            @pdf_file.close
+          end # begin
+        end
+      end
+      def clear!
+        self.characters.clear
+        self.clipping_paths.clear
+        @page_transform = nil
+        @rulings.clear
+      end
+      def ensurePageSize!
+        if self.pageSize.nil? && !self.page.nil?
+          mediaBox = self.page.findMediaBox
+          self.pageSize = (mediaBox == nil ? nil : mediaBox.createDimension)
+        end
+      end
+      def drawPage(page)
+        self.page = page
+        if !self.page.getContents.nil?
+          ensurePageSize!
+          self.processStream(self.page,
+                             self.page.findResources,
+                             self.page.getContents.getStream)
+        end
+      end
+      ##
+      # remember the given stroke in @basicStroke (read back by getStroke)
+      # NOTE(review): presumably overrides the PageDrawer setter - confirm
+      def setStroke(stroke)
+        @basicStroke = stroke
+      end
+      ##
+      # the stroke last stored by setStroke (nil if none was set)
+      def getStroke
+        @basicStroke
+      end
+      def strokePath(filter_by_color=nil)
+        unless self.options[:extract_ruling_lines]
+          self.getLinePath.reset
+          return
+        end
+        path = self.pathToList(self.getLinePath)
+        if path[0][0] != java.awt.geom.PathIterator::SEG_MOVETO \
+          || path[1..-1].any? { |p| p.first != java.awt.geom.PathIterator::SEG_LINETO && p.first != java.awt.geom.PathIterator::SEG_MOVETO && p.first != java.awt.geom.PathIterator::SEG_CLOSE }
+          self.getLinePath.reset
+          return
+        end
+        ccp_bounds = self.currentClippingPath
+        strokeColorComps = filter_by_color || self.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
+        color_filter = self.options[:line_color_filter]
+        first = path.shift
+        start_pos = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])
+        path.each do |p|
+          end_pos = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
+          line = (start_pos <=> end_pos) == -1 \
+            ? java.awt.geom.Line2D::Float.new(start_pos, end_pos) \
+            : java.awt.geom.Line2D::Float.new(end_pos, start_pos)
+          if p[0] == java.awt.geom.PathIterator::SEG_LINETO \
+            && (color_filter.nil? ? true : color_filter.call(strokeColorComps)) \
+            && line.intersects(ccp_bounds)
+            # convert line to rectangle for clipping it to the current clippath
+            # sucks, but awt doesn't have methods for this
+            tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
+            @rulings << ::Tabula::Ruling.new(tmp.getY,
+                                             tmp.getX,
+                                             tmp.getWidth,
+                                             tmp.getHeight,
+                                             filter_by_color.to_a)
+          end
+          start_pos = end_pos
+        end
+        self.getLinePath.reset
+      end
+      ##
+      # Treat a filled path like a stroked one: delegate to strokePath, passing
+      # the RGB components of the current non-stroking color as its color.
+      # `windingRule` is not used.
+      def fillPath(windingRule)
+        self.strokePath(self.getGraphicsState.getNonStrokingColor.getJavaColor.getRGBColorComponents(nil))
+      end
+      ##
+      # intentionally empty: both arguments are ignored and nothing is drawn
+      # NOTE(review): presumably overrides PageDrawer#drawImage so that images
+      # are skipped during extraction - confirm
+      def drawImage(image, at)
+      end
+      ##
+      # returns `path` transformed by the current page transform (see pageTransform)
+      def transformPath(path)
+        self.pageTransform.createTransformedShape(path)
+      end
+      def pageTransform
+        unless @page_transform.nil?
+          return @page_transform
+        end
+        cb = page.findCropBox
+        if !([90, -270, -90, 270].include?(page.getRotation))
+          @page_transform = AffineTransform.getScaleInstance(1, -1)
+          @page_transform.translate(0, -cb.getHeight)
+        else
+          @page_transform = AffineTransform.getScaleInstance(-1, 1)
+          @page_transform.rotate(page.getRotation * (Math::PI/180.0),
+                                 cb.getLowerLeftX, cb.getLowerLeftY)
+        end
+        @page_transform
+      end
+      def currentClippingPath
+        cp = self.getGraphicsState.getCurrentClippingPath
+        if cp == @clipping_path
+          return @transformed_clipping_path_bounds
+        end
+        @clipping_path = cp
+        @transformed_clipping_path = self.transformPath(cp)
+        @transformed_clipping_path_bounds = @transformed_clipping_path.getBounds
+        return @transformed_clipping_path_bounds
+      end
+      def processTextPosition(text)
+        c = text.getCharacter
+        h = c == ' ' ? text.getWidthDirAdj.round(2) : text.getHeightDir.round(2)
+        te = Tabula::TextElement.new(text.getYDirAdj.round(2) - h,
+                                     text.getXDirAdj.round(2),
+                                     text.getWidthDirAdj.round(2),
+                                     # ugly hack follows: we need spaces to have a height, so we can
+                                     # test for vertical overlap. height == width seems a safe bet.
+                                     h,
+                                     text.getFont,
+                                     text.getFontSize.round(2),
+                                     c,
+                                     # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
+                                     text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace)
+        ccp_bounds = self.currentClippingPath
+        if self.debug_clipping_paths && !self.clipping_paths.include?(ccp_bounds)
+          self.clipping_paths << ::Tabula::ZoneEntity.new(ccp_bounds.getMinY,
+                                                          ccp_bounds.getMinX,
+                                                          ccp_bounds.getWidth,
+                                                          ccp_bounds.getHeight)
+        end
+        if te.width < @min_char_width
+          @min_char_width = te.width
+        end
+        if te.height < @min_char_height
+          @min_char_height = te.height
+        end
+        if c =~ PRINTABLE_RE && ccp_bounds.intersects(te)
+          self.characters << te
+        end
+      end
+      ##
+      # total number of pages in the document (not just the requested ones)
+      def page_count
+        @all_pages.size
+      end
+      def rulings
+        return [] if @rulings.empty?
+        @rulings.reject { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } }
+      end
+      protected
+      # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
+      def currentSpaceWidth
+        gs = self.getGraphicsState
+        font = gs.getTextState.getFont
+        fontSizeText = gs.getTextState.getFontSize
+        horizontalScalingText = gs.getTextState.getHorizontalScalingPercent / 100.0
+        if font.java_kind_of?(org.apache.pdfbox.pdmodel.font.PDType3Font)
+          puts "TYPE3"
+        end
+        # idea from pdf.js
+        # https://github.com/mozilla/pdf.js/blob/master/src/core/fonts.js#L4418
+        spaceWidthText = spaceWidthText = [' ', '-', '1', 'i'] \
+          .map { |c| font.getFontWidth(c.ord) } \
+          .find { |w| w > 0 } || 1000
+        ctm00 = gs.getCurrentTransformationMatrix.getValue(0, 0)
+        return (spaceWidthText/1000.0) * fontSizeText * horizontalScalingText * (ctm00 == 0 ? 1 : ctm00)
+      end
+      def pathToList(path)
+        iterator = path.getPathIterator(self.pageTransform)
+        rv = []
+        while !iterator.isDone do
+          coords = Java::double[6].new
+          segType = iterator.currentSegment(coords)
+          rv << [segType, coords]
+          iterator.next
+        end
+        rv
+      end
+      def debugPath(path)
+        rv = ''
+        pathToList(path).each do |segType, coords|
+          case segType
+          when java.awt.geom.PathIterator::SEG_MOVETO
+            rv += "MOVE: #{coords[0]} #{coords[1]}\n"
+          when java.awt.geom.PathIterator::SEG_LINETO
+            rv += "LINE: #{coords[0]} #{coords[1]}\n"
+          when java.awt.geom.PathIterator::SEG_CLOSE
+            rv += "CLOSE\n\n"
+          end
+        end
+        rv
+      end
+    end
+    class PagesInfoExtractor
+      ##
+      # Open the PDF through Extraction.openPDF (using `password` if given)
+      # and keep its list of pages; the document stays open until the
+      # enumerator returned by `pages` has finished.
+      def initialize(pdf_filename, password='')
+        @pdf_filename = pdf_filename
+        @pdf_file = Extraction.openPDF(pdf_filename, password)
+        @all_pages = @pdf_file.getDocumentCatalog.getAllPages
+      end
+      def pages
+        Enumerator.new do |y|
+          begin
+            @all_pages.each_with_index do |page, i|
+              contents = page.getContents
+              y.yield Tabula::Page.new(@pdf_filename,
+                                       page.findCropBox.width,
+                                       page.findCropBox.height,
+                                       page.getRotation.to_i,
+                                       i+1) #remember, these are one-indexed
+            end
+          ensure
+            @pdf_file.close
+          end
+        end
+      end
+    end
+  end
+end