RubyGems - tabula-extractor - Versions diffs - 0.0.1-java - Mend

tabula-extractor 0.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

data/.gitignore +18 -0
data/AUTHORS.md +15 -0
data/Gemfile +6 -0
data/LICENSE.md +7 -0
data/NOTICE.txt +6 -0
data/README.md +24 -0
data/Rakefile +13 -0
data/bin/tabula +53 -0
data/lib/tabula.rb +5 -0
data/lib/tabula/entities.rb +259 -0
data/lib/tabula/pdf_dump.rb +118 -0
data/lib/tabula/table_extractor.rb +237 -0
data/lib/tabula/version.rb +3 -0
data/lib/tabula/whitespace.rb +50 -0
data/lib/tabula/writers.rb +30 -0
data/tabula-extractor.gemspec +26 -0
data/target/pdfbox-app-1.8.0.jar +0 -0
data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
data/test/data/argentina_diputados_voting_record.pdf +0 -0
data/test/data/bo_page24.pdf +0 -0
data/test/data/gre.pdf +0 -0
data/test/data/tabla_subsidios.pdf +0 -0
data/test/tests.rb +72 -0
metadata +133 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,18 @@
+*.gem
+*.rbc
+.bundle
+.config
+coverage
+InstalledFiles
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+# YARD artifacts
+.yardoc
+_yardoc
+doc/

data/AUTHORS.md ADDED Viewed

@@ -0,0 +1,15 @@
+Tabula was originally started by Manuel Aristarán in late 2012
+The PRIMARY AUTHORS are (and/or have been):
+* Manuel Aristarán - La Nación (Buenos Aires, Argentina), Knight-Mozilla OpenNews
+* Mike Tigas - ProPublica, Knight-Mozilla OpenNews
+* Jeremy Merrill - ProPublica
+* David Frackman
+* Travis Swicegood - Texas Tribune
+Special thanks to these organizations:
+* Knight-Mozilla OpenNews <http://www.mozillaopennews.org/>
+* ProPublica <http://propublica.org>
+* La Nación <http://www.lanacion.com.ar>

data/Gemfile ADDED Viewed

@@ -0,0 +1,6 @@
+source "http://rubygems.org"
+# Specify your gem's dependencies in jruby-memcached.gemspec
+gemspec
+gem "rake"

data/LICENSE.md ADDED Viewed

@@ -0,0 +1,7 @@
+Copyright (C) 2012-2013 Manuel Aristarán <jazzido@jazzido.com>
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/NOTICE.txt ADDED Viewed

@@ -0,0 +1,6 @@
+Tabula
+© 2012-2013 Manuel Aristarán. Available under MIT License. See `AUTHORS.md`
+and `LICENSE.md`.
+This product includes software (target/pdfbox-app-1.8.0.jar) developed at
+The Apache Software Foundation (http://www.apache.org/).

data/README.md ADDED Viewed

@@ -0,0 +1,24 @@
+tabula-extractor
+================
+Extract tables from PDF files
+## Usage
+```
+$ tabula --help
+Tabula helps you extract tables from PDFs
+Usage:
+       tabula [options] <pdf_file>
+where [options] are:
+     --page, -p <i>:   Page number (default: 1)
+     --area, -a <s>:   Portion of the page to analyze (top, left, bottom,
+                       right). Example: --area 269.875, 12.75, 790.5, 561.
+                       Default is entire page
+   --format, -f <s>:   Output format (CSV,TSV,HTML,JSON) (default: CSV)
+  --outfile, -o <s>:   Write output to <file> instead of STDOUT (default: -)
+      --version, -v:   Print version and exit
+         --help, -h:   Show this message
+```

data/Rakefile ADDED Viewed

@@ -0,0 +1,13 @@
+#!/usr/bin/env rake
+require 'bundler'
+require 'rake'
+require 'rake/testtask'
+Bundler::GemHelper.install_tasks
+task :test do
+  Rake::TestTask.new do |t|
+    t.test_files = Dir.glob('test/*.rb')
+    t.verbose = true
+  end
+end

data/bin/tabula ADDED Viewed

@@ -0,0 +1,53 @@
+#!/usr/bin/env jruby
+# encoding: utf-8
+require 'trollop'
+require_relative '../lib/tabula'
+FORMATS = ['CSV', 'TSV', 'HTML', 'JSON']
+def parse_command_line
+  opts = Trollop::options do
+    version "tabula #{Tabula::VERSION} (c) 2012-2013 Manuel Aristarán"
+    banner <<-EOS
+Tabula helps you extract tables from PDFs
+Usage:
+       tabula [options] <pdf_file>
+where [options] are:
+EOS
+    opt :page, 'Page number', :default => 1, :type => Integer
+    opt :area, 'Portion of the page to analyze (top, left, bottom, right). Example: --area 269.875, 12.75, 790.5, 561. Default is entire page', :type => String, :default => nil
+    opt :format, "Output format (#{FORMATS.join(",")})", :default => 'CSV'
+    opt :outfile, 'Write output to <file> instead of STDOUT', :default => '-'
+  end
+  if !opts[:area].nil?
+    unless opts[:area].split(',').size == 4 \
+      && opts[:area].split(',').all? { |x| x.strip =~ /(\d+\.?\d*)/ }
+      Trollop::die :area, "is invalid"
+    end
+  end
+  Trollop::die :format, "is unknown" unless FORMATS.include?(opts[:format])
+  Trollop::die "need one filename" if ARGV.empty?
+  pdf_filename = ARGV.shift
+  Trollop::die 'file does not exist' unless File.exists? pdf_filename
+  return opts, pdf_filename
+end
+def main
+  opts, filename = parse_command_line
+  extractor = Tabula::Extraction::CharacterExtractor.new(filename, [opts[:page]])
+  table = Tabula.make_table(extractor.extract.next.get_text(opts[:area].nil? ? nil : opts[:area].split(',').map(&:to_f)))
+  out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
+  Tabula::Writers.send(opts[:format].to_sym, table, out)
+  out.close
+end
+main

data/lib/tabula.rb ADDED Viewed

@@ -0,0 +1,5 @@
+require_relative './tabula/version'
+require_relative './tabula/entities'
+require_relative './tabula/pdf_dump'
+require_relative './tabula/table_extractor'
+require_relative './tabula/writers'

data/lib/tabula/entities.rb ADDED Viewed

@@ -0,0 +1,259 @@
+module Tabula
+  class ZoneEntity
+    attr_accessor :top, :left, :width, :height
+    attr_accessor :texts
+    def initialize(top, left, width, height)
+      self.top = top
+      self.left = left
+      self.width = width
+      self.height = height
+      self.texts = []
+    end
+    def bottom
+      self.top + self.height
+    end
+    def right
+      self.left + self.width
+    end
+    # [x, y]
+    def midpoint
+      [self.left + (self.width / 2), self.top + (self.height / 2)]
+    end
+    def area
+      self.width * self.height
+    end
+    def merge!(other)
+      self.top    = [self.top, other.top].min
+      self.left   = [self.left, other.left].min
+      self.width  = [self.right, other.right].max - left
+      self.height = [self.bottom, other.bottom].max - top
+    end
+    def horizontal_distance(other)
+      (other.left - self.right).abs
+    end
+    def vertical_distance(other)
+      (other.bottom - self.bottom).abs
+    end
+    # Roughly, detects if self and other belong to the same line
+    def vertically_overlaps?(other)
+      vertical_overlap = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
+      vertical_overlap > 0
+    end
+    # detects if self and other belong to the same column
+    def horizontally_overlaps?(other)
+      horizontal_overlap = [0, [self.right, other.right].min  - [self.left, other.left].max].max
+      horizontal_overlap > 0
+    end
+    def overlaps?(other, ratio_tolerance=0.00001)
+      self.overlap_ratio(other) > ratio_tolerance
+    end
+    def overlap_ratio(other)
+      intersection_width = [0, [self.right, other.right].min  - [self.left, other.left].max].max
+      intersection_height = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
+      intersection_area = [0, intersection_height * intersection_width].max
+      union_area = self.area + other.area - intersection_area
+      intersection_area / union_area
+    end
+    def to_h
+      hash = {}
+      [:top, :left, :width, :height].each do |m|
+        hash[m] = self.send(m)
+      end
+      hash
+    end
+    def to_json(options={})
+      self.to_h.to_json
+    end
+  end
+  class Page < ZoneEntity
+    attr_reader :rotation, :number
+    def initialize(width, height, rotation, number, texts=[])
+      super(0, 0, width, height)
+      @rotation = rotation
+      @number = number
+      self.texts = texts
+    end
+    # get text, optionally from a provided area in the page [top, left, bottom, right]
+    def get_text(area=nil)
+      area = [0, 0, width, height] if area.nil?
+      ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
+      self.texts.select { |t| t.overlaps? ze }
+    end
+    def to_json(options={})
+      { :width => self.width,
+        :height => self.height,
+        :number => self.number,
+        :rotation => self.rotation,
+        :texts => self.texts
+      }.to_json(options)
+    end
+  end
+  # A piece of text (a single character, or a word once characters have
+  # been merged) together with its bounding zone, font and font size.
+  class TextElement < ZoneEntity
+    attr_accessor :font, :font_size, :text
+    # Multiplier applied to the merge tolerance in #should_add_space?
+    CHARACTER_DISTANCE_THRESHOLD = 1.5
+    def initialize(top, left, width, height, font, font_size, text)
+      super(top, left, width, height)
+      self.font = font
+      self.font_size = font_size
+      self.text = text
+    end
+    # more or less returns True if distance < tolerance
+    #
+    # The tolerance is a quarter of the average font size of both elements.
+    # Raises TypeError unless +other+ is exactly a TextElement.
+    def should_merge?(other)
+      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
+      overlaps = self.vertically_overlaps?(other)
+      tolerance = ((self.font_size + other.font_size) / 2) * 0.25
+      # `and` and `or` share one precedence level and associate to the left,
+      # so this is evaluated as
+      #   ((overlaps or A) or B) and (horizontal distance < tolerance)
+      # where A and B are the two "exactly one height is zero" tests
+      overlaps or
+        (self.height == 0 and other.height != 0) or
+        (other.height == 0 and self.height != 0) and
+        self.horizontal_distance(other) < tolerance
+    end
+    # more or less returns True if (tolerance <= distance < CHARACTER_DISTANCE_THRESHOLD*tolerance)
+    #
+    # Raises TypeError unless +other+ is exactly a TextElement.
+    def should_add_space?(other)
+      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
+      overlaps = self.vertically_overlaps?(other)
+      tolerance = ((self.font_size + other.font_size) / 2) * 0.25
+      dist = self.horizontal_distance(other)
+      # same left-to-right evaluation as in should_merge?: the distance test
+      # applies to the result of the whole `or` chain
+      overlaps or
+        (self.height == 0 and other.height != 0) or
+        (other.height == 0 and self.height != 0) and
+        ((tolerance <= dist) and (dist < tolerance*CHARACTER_DISTANCE_THRESHOLD))
+    end
+    # Merges +other+ into self: combines the texts and then grows the zone
+    # through ZoneEntity#merge!.
+    # Raises TypeError unless +other+ is exactly a TextElement.
+    def merge!(other)
+      raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
+      # unless self.horizontally_overlaps?(other) or self.vertically_overlaps?(other)
+      #   raise ArgumentError, "won't merge TextElements that don't overlap"
+      # end
+      # text of an element that sits above self in the same horizontal span
+      # goes in front; any other text is appended (<< changes the string in place)
+      if self.horizontally_overlaps?(other) and other.top < self.top
+        self.text = other.text + self.text
+      else
+        self.text << other.text
+      end
+      super(other)
+    end
+    # Geometry hash of ZoneEntity#to_h plus :font and :text
+    # (:font_size is not included).
+    def to_h
+      hash = super
+      [:font, :text].each do |m|
+        hash[m] = self.send(m)
+      end
+      hash
+    end
+  end
+  class Line < ZoneEntity
+    attr_accessor :text_elements
+    def initialize
+      self.text_elements = []
+    end
+    def <<(t)
+      if self.text_elements.size == 0
+        self.text_elements << t
+        self.top = t.top
+        self.left = t.left
+        self.width = t.width
+        self.height = t.height
+      else
+        if in_same_column = self.text_elements.find { |te| te.horizontally_overlaps?(t) }
+          in_same_column.merge!(t)
+        else
+          self.text_elements << t
+          self.merge!(t)
+        end
+      end
+    end
+  end
+  class Column < ZoneEntity
+    attr_accessor :text_elements
+    def initialize(left, width, text_elements=[])
+      super(0, left, width, 0)
+      @text_elements = text_elements
+    end
+    def <<(te)
+      self.text_elements << te
+      self.update_boundaries!(te)
+      self.text_elements.sort_by! { |t| t.top }
+    end
+    def update_boundaries!(text_element)
+      self.merge!(text_element)
+    end
+    # this column can be merged with other_column?
+    def contains?(other_column)
+      self.horizontally_overlaps?(other_column)
+    end
+    def average_line_distance
+      # avg distance between lines
+      # this might help to MERGE lines that are shouldn't be split
+      # e.g. cells with > 1 lines of text
+      1.upto(self.text_elements.size - 1).map { |i|
+        self.text_elements[i].top - self.text_elements[i - 1].top
+      }.inject{ |sum, el| sum + el }.to_f / self.text_elements.size
+    end
+    def inspect
+      vars = (self.instance_variables - [:@text_elements]).map{ |v| "#{v}=#{instance_variable_get(v).inspect}" }
+      texts = self.text_elements.sort_by { |te| te.top }.map { |te| te.text }
+      "<#{self.class}: #{vars.join(', ')}, @text_elements=#{texts.join(', ')}>"
+    end
+  end
+  class Ruling < ZoneEntity
+    attr_accessor :color
+    def initialize(top, left, width, height, color)
+      super(top, left, width, height)
+      self.color = color
+    end
+    def to_h
+      hash = super
+      hash[:color] = self.color
+      hash
+    end
+  end
+end

data/lib/tabula/pdf_dump.rb ADDED Viewed

@@ -0,0 +1,118 @@
+require 'observer'
+require_relative './entities.rb'
+require 'java'
+require File.join(File.dirname(__FILE__), '../../target/pdfbox-app-1.8.0.jar')
+java_import org.apache.pdfbox.pdfparser.PDFParser
+java_import org.apache.pdfbox.pdmodel.PDDocument
+java_import org.apache.pdfbox.util.PDFTextStripper
+module Tabula
+  module Extraction
+    # PDFTextStripper subclass that collects the printable text positions
+    # handed to #processTextPosition into +characters+.
+    class TextExtractor < org.apache.pdfbox.util.PDFTextStripper
+      attr_accessor :characters, :fonts
+      # matches a string containing at least one printable character
+      PRINTABLE_RE = /[[:print:]]/
+      def initialize
+        super
+        self.fonts = {}
+        self.characters = []
+        self.setSortByPosition(true)
+      end
+      # Forgets the characters and fonts collected so far.
+      def clear!
+        self.characters = []; self.fonts = {}
+      end
+      # Keeps +text+ when its character is printable.
+      # NOTE(review): presumably called back by PDFBox for every text
+      # position while a content stream is processed - confirm against the
+      # PDFTextStripper documentation.
+      def processTextPosition(text)
+        #    return if text.getCharacter == ' '
+        # text_font = text.getFont
+        # text_size = text.getFontSize
+        # font_plus_size = self.fonts.select { |k, v| v == text_font }.first.first + "-" + text_size.to_i.to_s
+        # $fonts[$current_page].merge!({
+        #   font_plus_size => { :family => text_font.getBaseFont, :size => text_size }
+        # })
+        #    $page_contents[$current_page] += "  <text top=\"%.2f\" left=\"%.2f\" width=\"%.2f\" height=\"%.2f\" font=\"#{font_plus_size}\" dir=\"#{text.getDir}\">#{text.getCharacter}</text>\n" % [text.getYDirAdj - text.getHeightDir, text.getXDirAdj, text.getWidthDirAdj, text.getHeightDir]
+        c = text.getCharacter
+        # probably not the fastest way of detecting printable chars
+        self.characters << text  if c =~ PRINTABLE_RE
+      end
+    end
+    class PagesInfoExtractor
+      def initialize(pdf_filename)
+        raise Errno::ENOENT unless File.exists?(pdf_filename)
+        @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
+        @all_pages = @pdf_file.getDocumentCatalog.getAllPages
+      end
+      def pages
+        Enumerator.new do |y|
+          begin
+            @all_pages.each_with_index do |page, i|
+              contents = page.getContents
+              next if contents.nil?
+              y.yield Tabula::Page.new(page.findCropBox.width,
+                                       page.findCropBox.height,
+                                       page.getRotation.to_i,
+                                       i+1)
+            end
+          ensure
+            @pdf_file.close
+          end
+        end
+      end
+    end
+    class CharacterExtractor
+      include Observable
+      def initialize(pdf_filename, pages=[1])
+        raise Errno::ENOENT unless File.exists?(pdf_filename)
+        @pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil)
+        @all_pages = @pdf_file.getDocumentCatalog.getAllPages
+        @pages = pages
+        @extractor = TextExtractor.new
+      end
+      def extract
+        Enumerator.new do |y|
+          begin
+            @pages.each do |i|
+              page = @all_pages.get(i-1)
+              contents = page.getContents
+              next if contents.nil?
+              @extractor.clear!
+              @extractor.processStream(page, page.findResources, contents.getStream)
+              y.yield Tabula::Page.new(page.findCropBox.width,
+                                       page.findCropBox.height,
+                                       page.getRotation.to_i,
+                                       i+1,
+                                       @extractor.characters.map { |char|
+                                         Tabula::TextElement.new(char.getYDirAdj,
+                                                                 char.getXDirAdj,
+                                                                 char.getWidthDirAdj,
+                                                                 char.getHeightDir,
+                                                                 nil,
+                                                                 char.getFontSize,
+                                                                 char.getCharacter)
+                                       })
+            end
+          ensure
+            @pdf_file.close
+          end # begin
+        end
+      end
+    end
+  end
+end

data/lib/tabula/table_extractor.rb ADDED Viewed

@@ -0,0 +1,237 @@
+require 'csv'
+module Tabula
+  class TableExtractor
+    attr_accessor :text_elements, :options
+    DEFAULT_OPTIONS = {
+      :horizontal_rulings => [],
+      :vertical_rulings => [],
+      :merge_words => true,
+      :split_multiline_cells => false
+    }
+    def initialize(text_elements, options = {})
+      self.text_elements = text_elements
+      self.options = DEFAULT_OPTIONS.merge(options)
+      @merged = false
+      merge_words! if self.options[:merge_words]
+    end
+    def get_rows
+      hg = self.get_line_boundaries
+      hg.sort_by(&:top).map { |r| {'top' => r.top, 'bottom' => r.bottom, 'text' => r.texts} }
+    end
+    # TODO finish writing this method
+    # it should be analogous to get_line_boundaries
+    # (ie, take into account vertical ruling lines if available)
+    def group_by_columns
+      columns = []
+      tes = self.text_elements.sort_by(&:left)
+      # we don't have vertical rulings
+      tes.each do |te|
+        if column = columns.detect { |c| te.horizontally_overlaps?(c) }
+          column << te
+        else
+          columns << Column.new(te.left, te.width, [te])
+        end
+      end
+      columns
+    end
+    def get_columns
+      Tabula.group_by_columns(text_elements).map { |c|
+        {'left' => c.left, 'right' => c.right, 'width' => c.width}
+      }
+    end
+    def get_line_boundaries
+      boundaries = []
+      if self.options[:horizontal_rulings].empty?
+        # we don't have rulings
+        # iteratively grow boundaries to construct lines
+        self.text_elements.each do |te|
+          row = boundaries.detect { |l| l.vertically_overlaps?(te) }
+          ze = ZoneEntity.new(te.top, te.left, te.width, te.height)
+          if row.nil?
+            boundaries << ze
+            ze.texts << te.text
+          else
+            row.merge!(ze)
+            row.texts << te.text
+          end
+        end
+      else
+        self.options[:horizontal_rulings].sort_by!(&:top)
+        1.upto(self.options[:horizontal_rulings].size - 1) do |i|
+          above = self.options[:horizontal_rulings][i - 1]
+          below = self.options[:horizontal_rulings][i]
+          # construct zone between a horizontal ruling and the next
+          ze = ZoneEntity.new(above.top,
+                              [above.left, below.left].min,
+                              [above.width, below.width].max,
+                              below.top - above.top)
+          # skip areas shorter than some threshold
+          # TODO: this should be the height of the shortest character, or something like that
+          next if ze.height < 2
+          boundaries << ze
+        end
+      end
+      boundaries
+    end
+    private
+    def merge_words!
+      return self.text_elements if @merged # only merge once. awful hack.
+      @merged = true
+      current_word_index = i = 0
+      char1 = self.text_elements[i]
+      while i < self.text_elements.size-1 do
+        char2 = self.text_elements[i+1]
+        next if char2.nil? or char1.nil?
+        if self.text_elements[current_word_index].should_merge?(char2)
+          self.text_elements[current_word_index].merge!(char2)
+          char1 = char2
+          self.text_elements[i+1] = nil
+        else
+          # is there a space? is this within `CHARACTER_DISTANCE_THRESHOLD` points of previous char?
+          if (char1.text != " ") and (char2.text != " ") and self.text_elements[current_word_index].should_add_space?(char2)
+            self.text_elements[current_word_index].text += " "
+          end
+          current_word_index = i+1
+        end
+        i += 1
+      end
+      return self.text_elements.compact!
+    end
+  end
+  # TODO next four module methods are deprecated
+  def Tabula.group_by_columns(text_elements, merge_words=false)
+    TableExtractor.new(text_elements, :merge_words => merge_words).group_by_columns
+  end
+  def Tabula.get_line_boundaries(text_elements)
+    TableExtractor.new(text_elements).get_line_boundaries
+  end
+  def Tabula.get_columns(text_elements, merge_words=true)
+    TableExtractor.new(text_elements, :merge_words => merge_words).get_columns
+  end
+  def Tabula.get_rows(text_elements, merge_words=true)
+    TableExtractor.new(text_elements, :merge_words => merge_words).get_rows
+  end
+  def Tabula.lines_to_csv(lines)
+    CSV.generate { |csv|
+      lines.each { |l|
+        csv << l.map { |c| c.text.strip }
+      }
+    }
+  end
+  ONLY_SPACES_RE = Regexp.new('^\s+$')
+  # Returns an array of Tabula::Line
+  # Builds a table out of +text_elements+.
+  #
+  # +options+ is handed to TableExtractor.new (see
+  # TableExtractor::DEFAULT_OPTIONS). With the default options the extractor
+  # merges characters into words inside the +text_elements+ array itself.
+  #
+  # Returns an array of rows, ordered from top to bottom; each row is an
+  # array of TextElement ordered from left to right.
+  def Tabula.make_table(text_elements, options={})
+    extractor = TableExtractor.new(text_elements, options)
+    # group by lines
+    lines = []
+    line_boundaries = extractor.get_line_boundaries
+    # find all the text elements
+    # contained within each detected line (table row) boundary
+    line_boundaries.each { |lb|
+      line = Line.new
+      line_members = text_elements.find_all { |te|
+        te.vertically_overlaps?(lb)
+      }
+      # an element belongs to the first boundary it overlaps only
+      text_elements -= line_members
+      line_members.sort_by(&:left).each { |te|
+        # skip text_elements that only contain spaces
+        next if te.text =~ ONLY_SPACES_RE
+        line << te
+      }
+      lines << line if line.text_elements.size > 0
+    }
+    lines.sort_by!(&:top)
+    columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq).sort_by(&:left)
+    # # insert empty cells if needed
+    lines.each_with_index { |l, line_index|
+      next if l.text_elements.nil?
+      l.text_elements.compact! # TODO WHY do I have to do this?
+      l.text_elements.uniq!  # TODO WHY do I have to do this?
+      l.text_elements.sort_by!(&:left)
+      # l.text_elements = Tabula.merge_words(l.text_elements)
+      # only lines with fewer cells than there are columns need padding
+      next unless l.text_elements.size < columns.size
+      columns.each_with_index do |c, i|
+        # `or` and `and` share one precedence level, so the condition reads
+        # ((i is past the last cell) or (cell i exists)) and (cell i is not in column c).
+        # NOTE(review): the &:left block handed to the text_elements reader
+        # looks like it has no effect - confirm
+        if (i > l.text_elements.size - 1) or !l.text_elements(&:left)[i].nil? and !c.text_elements.include?(l.text_elements[i])
+          # empty placeholder cell with the geometry of the column and of the line
+          l.text_elements.insert(i, TextElement.new(l.top, c.left, c.width, l.height, nil, 0, ''))
+        end
+      end
+    }
+    # # merge elements that are in the same column
+    columns = Tabula.group_by_columns(lines.map(&:text_elements).flatten.compact.uniq)
+    lines.each_with_index do |l, line_index|
+      next if l.text_elements.nil?
+      # every pair of cell indexes of the line
+      (0..l.text_elements.size-1).to_a.combination(2).each do |t1, t2|
+        # cells merged away by an earlier pair have been set to nil
+        next if l.text_elements[t1].nil? or l.text_elements[t2].nil?
+        # if same column...
+        if columns.detect { |c| c.text_elements.include? l.text_elements[t1] } \
+          == columns.detect { |c| c.text_elements.include? l.text_elements[t2] }
+          # the cell whose bottom edge is higher (or equal) absorbs the other one
+          if l.text_elements[t1].bottom <= l.text_elements[t2].bottom
+            l.text_elements[t1].merge!(l.text_elements[t2])
+            l.text_elements[t2] = nil
+          else
+            l.text_elements[t2].merge!(l.text_elements[t1])
+            l.text_elements[t1] = nil
+          end
+        end
+      end
+      l.text_elements.compact!
+    end
+    # remove duplicate lines
+    # TODO this shouldn't have happened here, check why we have to do
+    # this (maybe duplication is happening in the column merging phase?)
+    (0..lines.size - 2).each do |i|
+      next if lines[i].nil?
+      # if any of the elements on the next line is duplicated, kill
+      # the next line
+      if (0..lines[i].text_elements.size-1).any? { |j| lines[i].text_elements[j] == lines[i+1].text_elements[j] }
+        lines[i+1] = nil
+      end
+    end
+    lines.compact.map { |line|
+      line.text_elements.sort_by(&:left)
+    }
+  end
+end

data/lib/tabula/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Tabula
+  # Version of the gem; read by the gemspec and shown by `tabula --version`.
+  VERSION = '0.0.1'
+end

data/lib/tabula/whitespace.rb ADDED Viewed

@@ -0,0 +1,50 @@
+require 'algorithms'
+module Tabula
+  module Whitespace
+    # Detect whitespace in a document (not yet used in Tabula)
+    # Described in "Two Geometric Algorithms for layout analysis" (Thomas Breuer)
+    # http://pdf.aminer.org/000/140/219/two_geometric_algorithms_for_layout_analysis.pdf
+    def self.find_closest(text_elements, x, y)
+      text_elements.sort_by { |te|
+        Math.sqrt((x - te.midpoint[0]) ** 2 + (y - te.midpoint[1]) ** 2)
+      }.first
+    end
+    # Searches +bounds+ for rectangles that none of +text_elements+ overlaps.
+    #
+    # Work items are [rectangle, obstacles] pairs kept in a priority queue
+    # keyed by the area of the rectangle.
+    # NOTE(review): presumably Containers::PriorityQueue#pop returns the
+    # entry with the highest priority (largest area) first - confirm against
+    # the `algorithms` gem.
+    #
+    # Returns a single rectangle as soon as a popped rectangle has no
+    # obstacles; otherwise, once the queue is exhausted, the array of
+    # obstacle-free sub-rectangles collected on the way.
+    # NOTE(review): the two return shapes differ (one zone vs. an array).
+    def self.find_whitespace(text_elements, bounds)
+      queue = Containers::PriorityQueue.new
+      queue.push([bounds, text_elements], bounds.width * bounds.height)
+      rv = []
+      while !queue.empty?
+        r, obstacles = queue.pop
+        if obstacles.empty?
+          return r
+        end
+        # split around the obstacle closest to the centre of the rectangle
+        pivot = self.find_closest(obstacles, *r.midpoint)
+        # four sub-rectangles around the pivot: above it to the right and to
+        # the left, then below it to the left and to the right
+        subrectangles = [
+                         ZoneEntity.new(r.top, pivot.right, r.right - pivot.right, pivot.top - r.top),
+                         ZoneEntity.new(r.top, r.left, pivot.left - r.left, pivot.top - r.top),
+                         ZoneEntity.new(pivot.bottom, r.left, pivot.left - r.left, r.bottom - pivot.bottom),
+                         ZoneEntity.new(pivot.bottom, pivot.right, r.right - pivot.right, r.bottom - pivot.bottom)
+                        ]
+        subrectangles.each do |sub_r|
+          obs = obstacles.select { |s|
+            s.overlaps?(sub_r)
+          }
+          if obs.empty?
+            # nothing overlaps this sub-rectangle: keep it as a result
+            rv << sub_r
+          else
+            # still obstructed: queue it for further splitting
+            queue.push([sub_r, obs], sub_r.width * sub_r.height)
+          end
+        end
+      end
+      return rv
+    end
+  end
+end

data/lib/tabula/writers.rb ADDED Viewed

@@ -0,0 +1,30 @@
+require 'csv'
+require 'json'
+module Tabula
+  module Writers
+    def Writers.CSV(lines, output=$stdout)
+      lines.each { |l|
+        output.write CSV.generate_line(l.map(&:text), row_sep: "\r\n")
+      }
+    end
+    def Writers.JSON(lines, output=$stdout)
+      output.write lines.to_json
+    end
+    def Writers.TSV(lines, output=$stdout)
+      tsv_string = lines.each { |l|
+        output.write(l.map(&:text).join("\t") + '\n')
+      }
+    end
+    # Placeholder for the HTML writer: always raises RuntimeError
+    # ("not implemented").
+    def Writers.HTML(lines, output=$stdout)
+      raise "not implemented"
+    end
+  end
+end

data/tabula-extractor.gemspec ADDED Viewed

@@ -0,0 +1,26 @@
+# encoding: utf-8
+$:.push File.expand_path("../lib", __FILE__)
+require 'tabula/version'
+# Gem specification of tabula-extractor. The file lists are taken from
+# `git ls-files`, so the gem has to be built from a git checkout.
+Gem::Specification.new do |s|
+  s.name        = "tabula-extractor"
+  s.version     = Tabula::VERSION
+  s.authors     = ["Manuel Aristarán"]
+  s.email       = ["manuel@jazzido.com"]
+  s.homepage    = "https://github.com/jazzido/tabula-extractor"
+  s.summary     = %q{extract tables from PDF files}
+  s.description = %q{extract tables from PDF files}
+  # the library loads a Java jar (PDFBox), hence the java platform
+  s.platform = 'java'
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  s.add_development_dependency 'minitest'
+  s.add_development_dependency 'bundler', '>= 1.3.5'
+  # command line parsing in bin/tabula
+  s.add_runtime_dependency "trollop", ["~> 2.0"]
+end

data/target/pdfbox-app-1.8.0.jar ADDED Viewed

Binary file

data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf ADDED Viewed

Binary file

data/test/data/argentina_diputados_voting_record.pdf ADDED Viewed

Binary file

data/test/data/bo_page24.pdf ADDED Viewed

Binary file

data/test/data/gre.pdf ADDED Viewed

Binary file

data/test/data/tabla_subsidios.pdf ADDED Viewed

Binary file

data/test/tests.rb ADDED Viewed

@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+require 'minitest/autorun'
+require_relative '../lib/tabula'
+def lines_to_array(lines)
+  lines.map { |l|
+    l.map { |te| te.text }
+  }
+end
+class TestPagesInfoExtractor < MiniTest::Unit::TestCase
+  def test_pages_info_extractor
+    extractor = Tabula::Extraction::PagesInfoExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
+    i = 0
+    extractor.pages.each do |page|
+      assert_instance_of Tabula::Page, page
+      i += 1
+    end
+    assert_equal 2, i
+  end
+end
+class TestDumper < MiniTest::Unit::TestCase
+  def test_extractor
+    extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
+    page = extractor.extract.first
+    assert_instance_of Tabula::Page, page
+  end
+  def test_get_by_area
+#    http://localhost:8080/debug/418b1d5698e5c7b724551d9610c071ab3063275c/characters?x1=57.921428571428564&x2=290.7&y1=107.1&y2=394.52142857142854&page=1&use_lines=false
+    extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
+    characters = extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
+    assert_equal characters.size, 206
+  end
+end
+class TestExtractor < MiniTest::Unit::TestCase
+  # Extracts the score table of the first page of gre.pdf and compares it,
+  # cell by cell, with the expected texts.
+  def test_table_extraction_1
+    character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/gre.pdf', File.dirname(__FILE__)))
+    # area is [top, left, bottom, right]
+    characters = character_extractor.extract.next.get_text([107.1, 57.9214, 394.5214, 290.7])
+    table = lines_to_array Tabula.make_table(characters)
+    expected = [["Prior Scale ", "New Scale ", "% Rank* "], ["800 ", "170 ", "99 "], ["790 ", "170 ", "99 "], ["780 ", "170 ", "99 "], ["770 ", "170 ", "99 "], ["760 ", "170 ", "99 "], ["750 ", "169 ", "99 "], ["740 ", "169 ", "99 "], ["730 ", "168 ", "98 "], ["720 ", "168 ", "98 "], ["710 ", "167 ", "97 "], ["700 ", "166 ", "96 "], ["690 ", "165 ", "95 "], ["680 ", "165 ", "95 "], ["670 ", "164 ", "93 "], ["660 ", "164 ", "93 "], ["650 ", "163 ", "91 "]]
+    assert_equal expected, table
+  end
+  # Table-extraction check against the sample PDF
+  # data/argentina_diputados_voting_record.pdf: the characters read from the PDF
+  # are passed through Tabula.make_table and lines_to_array, and the result must
+  # equal the hard-coded `expected` rows.
+  def test_diputados_voting_record
+    # The PDF path is resolved relative to the directory containing this test file.
+    character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/argentina_diputados_voting_record.pdf', File.dirname(__FILE__)))
+    # Only the first element produced by `extract` is used (presumably the first
+    # page - TODO confirm). The four numbers passed to get_text presumably describe
+    # the area of the page to read; verify their order against get_text.
+    characters = character_extractor.extract.next.get_text([269.875, 12.75, 790.5, 561])
+    # Expected rows: a header row followed by one four-column row per listed name.
+    # NOTE(review): the literal below contains a line break inside the string
+    # "Cdad. Aut. Bs. As." of the "BRAWER, Mara" row; this looks like a wrapping
+    # artifact rather than intended data - confirm against the original file.
+    expected = [["Apellido y Nombre", "Bloque político", "Provincia", ""], ["ABDALA de MATARAZZO, Norma Amanda", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["ALONSO, María Luz", "Frente para la Victoria - PJ", "La Pampa", "AFIRMATIVO"], ["ARENA, Celia Isabel", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["ARREGUI, Andrés Roberto", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["BALCEDO, María Ester", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BARRANDEGUY, Raúl Enrique", "Frente para la Victoria - PJ", "Entre Ríos", "AFIRMATIVO"], ["BASTERRA, Luis Eugenio", "Frente para la Victoria - PJ", "Formosa", "AFIRMATIVO"], ["BEDANO, Nora Esther", "Frente para la Victoria - PJ", "Córdoba", "AFIRMATIVO"], ["BERNAL, María Eugenia", "Frente para la Victoria - PJ", "Jujuy", "AFIRMATIVO"], ["BERTONE, Rosana Andrea", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["BIANCHI, María del Carmen", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. As.", "AFIRMATIVO"], ["BIDEGAIN, Gloria Mercedes", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["BRAWER, Mara", "Frente para la Victoria - PJ", "Cdad. Aut. Bs. 
As.", "AFIRMATIVO"], ["BRILLO, José Ricardo", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["BROMBERG, Isaac Benjamín", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["BRUE, Daniel Agustín", "Frente Cívico por Santiago", "Santiago del Estero", "AFIRMATIVO"], ["CALCAGNO, Eric", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARLOTTO, Remo Gerardo", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CARMONA, Guillermo Ramón", "Frente para la Victoria - PJ", "Mendoza", "AFIRMATIVO"], ["CATALAN MAGNI, Julio César", "Frente para la Victoria - PJ", "Tierra del Fuego", "AFIRMATIVO"], ["CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO"], ["CHIENO, María Elena", "Frente para la Victoria - PJ", "Corrientes", "AFIRMATIVO"], ["CIAMPINI, José Alberto", "Frente para la Victoria - PJ", "Neuquén", "AFIRMATIVO"], ["CIGOGNA, Luis Francisco Jorge", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CLERI, Marcos", "Frente para la Victoria - PJ", "Santa Fe", "AFIRMATIVO"], ["COMELLI, Alicia Marcela", "Movimiento Popular Neuquino", "Neuquén", "AFIRMATIVO"], ["CONTI, Diana Beatriz", "Frente para la Victoria - PJ", "Buenos Aires", "AFIRMATIVO"], ["CORDOBA, Stella Maris", "Frente para la Victoria - PJ", "Tucumán", "AFIRMATIVO"], ["CURRILEN, Oscar Rubén", "Frente para la Victoria - PJ", "Chubut", "AFIRMATIVO"]]
+    assert_equal expected, lines_to_array(Tabula.make_table(characters))
+  end
+  # Table-extraction check against the sample PDF data/bo_page24.pdf: the
+  # characters read from the PDF are passed through Tabula.make_table and
+  # lines_to_array, and the result must equal the hard-coded `expected` rows.
+  # TODO: this test is known to fail - spaces get inserted inside words
+  # (presumably in the extracted cell text; confirm where they are introduced).
+  def test_bo_page24
+    # The PDF path is resolved relative to the directory containing this test file.
+    character_extractor = Tabula::Extraction::CharacterExtractor.new(File.expand_path('data/bo_page24.pdf', File.dirname(__FILE__)))
+    # Only the first element produced by `extract` is used (presumably the first
+    # page - TODO confirm). The four numbers passed to get_text presumably describe
+    # the area of the page to read; verify their order against get_text.
+    characters = character_extractor.extract.next.get_text([435.625, 53.125, 570.7142857142857, 810.5357142857142])
+    # Expected rows, six string columns each.
+    expected = [["1", "UNICA", "CECILIA KANDUS", "16/12/2008", "PEDRO ALBERTO GALINDEZ", "60279/09"], ["1", "UNICA", "CECILIA KANDUS", "10/06/2009", "PASTORA FILOMENA NAVARRO", "60280/09"], ["13", "UNICA", "MIRTA S. BOTTALLO DE VILLA", "02/07/2009", "MARIO LUIS ANGELERI, DNI 4.313.138", "60198/09"], ["16", "UNICA", "LUIS PEDRO FASANELLI", "22/05/2009", "PETTER o PEDRO KAHRS", "60244/09"], ["18", "UNICA", "ALEJANDRA SALLES", "26/06/2009", "RAUL FERNANDO FORTINI", "60236/09"], ["31", "UNICA", "MARÍA CRISTINA GARCÍA", "17/06/2009", "DOMINGO TRIPODI Y PAULA LUPPINO", "60302/09"], ["34", "UNICA", "SUSANA B.MARZIONI", "11/06/2009", "JESUSA CARMEN VAZQUEZ", "60177/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "19/05/2009", "DANIEL DECUADRO", "60227/09"], ["51", "UNICA", "MARIA LUCRECIA SERRAT", "12/02/2009", "ELIZABETH LILIANA MANSILLA ROMERO", "60150/09"], ["75", "UNICA", "IGNACIO M. REBAUDI BASAVILBASO", "01/07/2009", "ABALSAMO ALFREDO DANIEL", "60277/09"], ["94", "UNICA", "GABRIELA PALÓPOLI", "02/07/2009", "ALVAREZ ALICIA ESTHER", "60360/09"], ["96", "UNICA", "DANIEL PAZ EYNARD", "16/06/2009", "NELIDA ALBORADA ALCARAZ SERRANO", "60176/09"]]
+    assert_equal expected, lines_to_array(Tabula.make_table(characters))
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,133 @@
+--- !ruby/object:Gem::Specification
+name: tabula-extractor
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 0.0.1
+platform: java
+authors:
+- Manuel Aristarán
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-05-09 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: minitest
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: !binary |-
+          MA==
+    none: false
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: !binary |-
+          MA==
+    none: false
+  prerelease: false
+  type: :development
+- !ruby/object:Gem::Dependency
+  name: bundler
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.3.5
+    none: false
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.3.5
+    none: false
+  prerelease: false
+  type: :development
+- !ruby/object:Gem::Dependency
+  name: trollop
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+    none: false
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+    none: false
+  prerelease: false
+  type: :runtime
+description: extract tables from PDF files
+email:
+- manuel@jazzido.com
+executables:
+- tabula
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- AUTHORS.md
+- Gemfile
+- LICENSE.md
+- NOTICE.txt
+- README.md
+- Rakefile
+- bin/tabula
+- lib/tabula.rb
+- lib/tabula/entities.rb
+- lib/tabula/pdf_dump.rb
+- lib/tabula/table_extractor.rb
+- lib/tabula/version.rb
+- lib/tabula/whitespace.rb
+- lib/tabula/writers.rb
+- tabula-extractor.gemspec
+- target/pdfbox-app-1.8.0.jar
+- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
+- test/data/argentina_diputados_voting_record.pdf
+- test/data/bo_page24.pdf
+- test/data/gre.pdf
+- test/data/tabla_subsidios.pdf
+- test/tests.rb
+homepage: https://github.com/jazzido/tabula-extractor
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      hash: 2
+      version: !binary |-
+        MA==
+  none: false
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      hash: 2
+      version: !binary |-
+        MA==
+  none: false
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: extract tables from PDF files
+test_files:
+- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
+- test/data/argentina_diputados_voting_record.pdf
+- test/data/bo_page24.pdf
+- test/data/gre.pdf
+- test/data/tabla_subsidios.pdf
+- test/tests.rb