RubyGems - mateusmaso-docsplit - Versions diffs - 0.6.4 - Mend

mateusmaso-docsplit 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

data/LICENSE +24 -0
data/README +22 -0
data/bin/docsplit +5 -0
data/docsplit.gemspec +24 -0
data/lib/docsplit/command_line.rb +122 -0
data/lib/docsplit/image_extractor.rb +103 -0
data/lib/docsplit/info_extractor.rb +39 -0
data/lib/docsplit/page_extractor.rb +36 -0
data/lib/docsplit/text_cleaner.rb +94 -0
data/lib/docsplit/text_extractor.rb +130 -0
data/lib/docsplit/transparent_pdfs.rb +26 -0
data/lib/docsplit.rb +130 -0
data/vendor/conf/document-formats.js +233 -0
data/vendor/jodconverter/commons-cli-1.1.jar +0 -0
data/vendor/jodconverter/commons-io-1.4.jar +0 -0
data/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar +0 -0
data/vendor/jodconverter/json-20090211.jar +0 -0
data/vendor/jodconverter/juh-3.2.1.jar +0 -0
data/vendor/jodconverter/jurt-3.2.1.jar +0 -0
data/vendor/jodconverter/ridl-3.2.1.jar +0 -0
data/vendor/jodconverter/unoil-3.2.1.jar +0 -0
data/vendor/logging.properties +1 -0
metadata +72 -0

data/LICENSE ADDED Viewed

@@ -0,0 +1,24 @@
+JODConverter ius licensed under the LGPL: gnu.org/licenses/lgpl.html
+Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.

data/README ADDED Viewed

@@ -0,0 +1,22 @@
+==
+         __                      ___ __
+    ____/ /___  ______________  / (_) /_
+   / __  / __ \/ ___/ ___/ __ \/ / / __/
+  / /_/ / /_/ / /__(__  ) /_/ / / / /_
+  \____/\____/\___/____/ .___/_/_/\__/
+                      /_/
+  Docsplit is a command-line utility and Ruby library for splitting apart
+  documents into their component parts: searchable UTF-8 plain text, page
+  images or thumbnails in any format, PDFs, single pages, and document
+  metadata (title, author, number of pages...)
+  Installation:
+  gem install docsplit
+  For documentation, usage, and examples, see:
+  http://documentcloud.github.com/docsplit/
+  To suggest a feature or report a bug:
+  http://github.com/documentcloud/docsplit/issues/

data/bin/docsplit ADDED Viewed

@@ -0,0 +1,5 @@
+#!/usr/bin/env ruby
+# Command-line entry point: load the CommandLine class relative to this
+# script's own location.
+require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"
+# Constructing a CommandLine parses ARGV and runs the requested command.
+Docsplit::CommandLine.new

data/docsplit.gemspec ADDED Viewed

@@ -0,0 +1,24 @@
+# Gem specification for the mateusmaso-docsplit package.
+Gem::Specification.new do |s|
+  s.name      = 'mateusmaso-docsplit'
+  # Docsplit::VERSION in lib/docsplit.rb is annotated "keep in sync" with this value.
+  s.version   = '0.6.4'
+  s.date      = '2013-02-05'
+  s.homepage    = "http://github.com/mateusmaso/docsplit"
+  s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
+  s.description = <<-EOS
+    Docsplit is a command-line utility and Ruby library for splitting apart
+    documents into their component parts: searchable UTF-8 plain text, page
+    images or thumbnails in any format, PDFs, single pages, and document
+    metadata (title, author, number of pages...)
+  EOS
+  s.authors           = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
+  s.email             = 'jeremy@documentcloud.org'
+  s.rubyforge_project = 'docsplit'
+  s.require_paths     = ['lib']
+  # Installs bin/docsplit as the `docsplit` executable.
+  s.executables       = ['docsplit']
+  # Packaged files are collected by glob, relative to the current directory.
+  s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
+                'docsplit.gemspec', 'LICENSE', 'README']
+end

data/lib/docsplit/command_line.rb ADDED Viewed

@@ -0,0 +1,122 @@
+require 'optparse'
+require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
+module Docsplit
+  # A single command-line utility to separate a PDF into all its component parts.
+  class CommandLine
+    # Help text installed as the option parser's banner; the individual
+    # switches are listed after it by OptionParser.
+    BANNER = <<-EOS
+docsplit breaks apart documents into images, text, or individual pages.
+It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
+Usage:
+  docsplit COMMAND [OPTIONS] path/to/doc.pdf
+  Main commands:
+    pages, images, text, pdf.
+  Metadata commands:
+    author, date, creator, keywords, producer, subject, title, length.
+Example:
+  docsplit images --size 700x --format jpg document.pdf
+Dependencies:
+  Ruby, Java, A working GraphicsMagick (gm) command,
+  and a headless OpenOffice server for non-PDF documents.
+Options:
+    (size, pages and format can take comma-separated values)
+    EOS
+    # Creating a CommandLine runs off of the contents of ARGV.
+    def initialize
+      parse_options
+      cmd = ARGV.shift
+      @command = cmd && cmd.to_sym
+      run
+    end
+    # Delegate to the Docsplit Ruby API to perform all extractions.
+    def run
+      begin
+        case @command
+        when :images  then Docsplit.extract_images(ARGV, @options)
+        when :pages   then Docsplit.extract_pages(ARGV, @options)
+        when :text    then Docsplit.extract_text(ARGV, @options)
+        when :pdf     then Docsplit.extract_pdf(ARGV, @options)
+        else
+          if METADATA_KEYS.include?(@command)
+            value = Docsplit.send("extract_#{@command}", ARGV, @options)
+            puts value unless value.nil?
+          else
+            usage
+          end
+        end
+      rescue ExtractionFailed => e
+        puts e.message.chomp
+        exit(1)
+      end
+    end
+    # Print out the usage help message.
+    def usage
+      puts "\n#{@option_parser}\n"
+      exit
+    end
+    private
+    # Use the OptionParser library to parse out all supported options. Return
+    # options formatted for the Ruby API.
+    def parse_options
+      @options = {:ocr => :default, :clean => true}
+      @option_parser = OptionParser.new do |opts|
+        opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
+          @options[:output] = d
+        end
+        opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
+          @options[:pages] = p
+        end
+        opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
+          @options[:size] = s.split(',')
+        end
+        opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
+          @options[:format] = t.split(',')
+        end
+        opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
+          @options[:density] = d
+        end
+        opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
+          @options[:ocr] = o
+        end
+        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
+          @options[:clean] = false
+        end
+        opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
+          @options[:language] = l
+        end
+        opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
+          @options[:rolling] = true
+        end
+        opts.on_tail('-v', '--version', 'display docsplit version') do
+          puts "Docsplit version #{Docsplit::VERSION}"
+          exit
+        end
+        opts.on_tail('-h', '--help', 'display this help message') do
+          usage
+        end
+      end
+      @option_parser.banner = BANNER
+      begin
+        @option_parser.parse!(ARGV)
+      rescue OptionParser::InvalidOption => e
+        puts e.message
+        exit(1)
+      end
+    end
+  end
+end

data/lib/docsplit/image_extractor.rb ADDED Viewed

@@ -0,0 +1,103 @@
+module Docsplit
+  # Delegates to GraphicsMagick in order to convert PDF documents into
+  # nicely sized images.
+  class ImageExtractor
+    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
+    DEFAULT_FORMAT  = :png
+    DEFAULT_DENSITY = '150'
+    # Extract a list of PDFs as rasterized page images, according to the
+    # configuration in options.
+    def extract(pdfs, options)
+      @pdfs = [pdfs].flatten
+      extract_options(options)
+      @pdfs.each do |pdf|
+        previous = nil
+        @sizes.each_with_index do |size, i|
+          @formats.each {|format| convert(pdf, size, format, previous) }
+          previous = size if @rolling
+        end
+      end
+    end
+    # Convert a single PDF into page images at the specified size and format.
+    # If `--rolling`, and we have a previous image at a larger size to work with,
+    # we simply downsample that image, instead of re-rendering the entire PDF.
+    # Now we generate one page at a time, a counterintuitive opimization
+    # suggested by the GraphicsMagick list, that seems to work quite well.
+    def convert(pdf, size, format, previous=nil)
+      tempdir   = Dir.mktmpdir
+      basename  = File.basename(pdf, File.extname(pdf))
+      directory = directory_for(size)
+      pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
+      escaped_pdf = ESCAPE[pdf]
+      FileUtils.mkdir_p(directory) unless File.exists?(directory)
+      common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+      if previous
+        FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
+        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
+        raise ExtractionFailed, result if $? != 0
+      else
+        page_list(pages).each do |page|
+          out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
+          result = `#{cmd}`.chomp
+          raise ExtractionFailed, result if $? != 0
+        end
+      end
+    ensure
+      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+    end
+    private
+    # Extract the relevant GraphicsMagick options from the options hash.
+    def extract_options(options)
+      @output  = options[:output]  || '.'
+      @pages   = options[:pages]
+      @density = options[:density] || DEFAULT_DENSITY
+      @formats = [options[:format] || DEFAULT_FORMAT].flatten
+      @sizes   = [options[:size]].flatten.compact
+      @sizes   = [nil] if @sizes.empty?
+      @rolling = !!options[:rolling]
+    end
+    # If there's only one size requested, generate the images directly into
+    # the output directory. Multiple sizes each get a directory of their own.
+    def directory_for(size)
+      path = @sizes.length == 1 ? @output : File.join(@output, size)
+      File.expand_path(path)
+    end
+    # Generate the resize argument.
+    def resize_arg(size)
+      size.nil? ? '' : "-resize #{size}"
+    end
+    # Generate the appropriate quality argument for the image format.
+    def quality_arg(format)
+      case format.to_s
+      when /jpe?g/ then "-quality 85"
+      when /png/   then "-quality 100"
+      else ""
+      end
+    end
+    # Generate the expanded list of requested page numbers.
+    def page_list(pages)
+      pages.split(',').map { |range|
+        if range.include?('-')
+          range = range.split('-')
+          Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
+        else
+          range.to_i
+        end
+      }.flatten.uniq.sort
+    end
+  end
+end

data/lib/docsplit/info_extractor.rb ADDED Viewed

@@ -0,0 +1,39 @@
+module Docsplit
+  # Delegates to **pdfinfo** in order to extract information about a PDF file.
+  class InfoExtractor
+    # Regex matchers for different bits of information.
+    MATCHERS = {
+      :author   => /^Author:\s+([^\n]+)/,
+      :date     => /^CreationDate:\s+([^\n]+)/,
+      :creator  => /^Creator:\s+([^\n]+)/,
+      :keywords => /^Keywords:\s+([^\n]+)/,
+      :producer => /^Producer:\s+([^\n]+)/,
+      :subject  => /^Subject:\s+([^\n]+)/,
+      :title    => /^Title:\s+([^\n]+)/,
+      :length   => /^Pages:\s+([^\n]+)/,
+    }
+    # Pull out a single datum from a pdf.
+    def extract(key, pdfs, opts)
+      pdf = [pdfs].flatten.first
+      cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
+      result = `#{cmd}`.chomp
+      raise ExtractionFailed, result if $? != 0
+      # ruby  1.8 (iconv) and 1.9 (String#encode) :
+      if String.method_defined?(:encode)
+        result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
+      else
+        ic = Iconv.new('UTF-8', 'UTF-8//IGNORE')
+        result = ic.iconv(result)
+      end
+      match = result.match(MATCHERS[key])
+      answer = match && match[1]
+      answer = answer.to_i if answer && key == :length
+      answer
+    end
+  end
+end

data/lib/docsplit/page_extractor.rb ADDED Viewed

@@ -0,0 +1,36 @@
+module Docsplit
+  # Delegates to **pdftk** in order to create bursted single pages from
+  # a PDF document.
+  class PageExtractor
+    # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
+    def extract(pdfs, opts)
+      extract_options opts
+      [pdfs].flatten.each do |pdf|
+        pdf_name = File.basename(pdf, File.extname(pdf))
+        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
+        FileUtils.mkdir_p @output unless File.exists?(@output)
+        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
+          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
+        else
+          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
+        end
+        result = `#{cmd}`.chomp
+        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
+        raise ExtractionFailed, result if $? != 0
+        result
+      end
+    end
+    private
+    def extract_options(options)
+      @output = options[:output] || '.'
+    end
+  end
+end

data/lib/docsplit/text_cleaner.rb ADDED Viewed

@@ -0,0 +1,94 @@
+require 'strscan'
+module Docsplit
+  # Cleans up OCR'd text by using a series of heuristics to remove garbage
+  # words. Algorithms taken from:
+  #
+  #     Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
+  #       -- Taghva, Nartker, Condit, and Borsack
+  #
+  #     Improving Search and Retrieval Performance through Shortening Documents,
+  #     Detecting Garbage, and Throwing out Jargon
+  #       -- Kulp
+  #
+  class TextCleaner
+    # Cached regexes we plan on using.
+    WORD        = /\S+/
+    SPACE       = /\s+/
+    NEWLINE     = /[\r\n]/
+    ALNUM       = /[a-z0-9]/i
+    PUNCT       = /[[:punct:]]/i
+    REPEAT      = /([^0-9])\1{2,}/
+    UPPER       = /[A-Z]/
+    LOWER       = /[a-z]/
+    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
+    ALL_ALPHA   = /^[a-z]+$/i
+    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
+    VOWEL       = /([aeiou]|y$)/i
+    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
+    VOWEL_5     = /[aeiou]{5}/i
+    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
+    SINGLETONS  = /^[AaIi]$/
+    # For the time being, `clean` uses the regular StringScanner, and not the
+    # multibyte-aware version, coercing to ASCII first.
+    def clean(text)
+      require 'iconv' unless defined?(Iconv)
+      text    = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
+      scanner = StringScanner.new(text)
+      cleaned = []
+      spaced  = false
+      loop do
+        if space = scanner.scan(SPACE)
+          cleaned.push(space) unless spaced && (space !~ NEWLINE)
+          spaced = true
+        elsif word = scanner.scan(WORD)
+          unless garbage(word)
+            cleaned.push(word)
+            spaced = false
+          end
+        elsif scanner.eos?
+          return cleaned.join('').gsub(REPEATED, '')
+        end
+      end
+    end
+    # Is a given word OCR garbage?
+    def garbage(w)
+      acronym = w =~ ACRONYM
+      # More than 30 bytes in length.
+      (w.length > 30) ||
+      # If there are three or more identical characters in a row in the string.
+      (w =~ REPEAT) ||
+      # More punctuation than alpha numerics.
+      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
+      # Ignoring the first and last characters in the string, if there are three or
+      # more different punctuation characters in the string.
+      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
+      # Four or more consecutive vowels, or five or more consecutive consonants.
+      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
+      # Number of uppercase letters greater than lowercase letters, but the word is
+      # not all uppercase + punctuation.
+      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
+      # Single letters that are not A or I.
+      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
+      # All characters are alphabetic and there are 8 times more vowels than
+      # consonants, or 8 times more consonants than vowels.
+      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
+        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
+          (cons > vows * 8)))
+    end
+  end
+end

data/lib/docsplit/text_extractor.rb ADDED Viewed

@@ -0,0 +1,130 @@
+module Docsplit
+  # Delegates to **pdftotext** and **tesseract** in order to extract text from
+  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
+  # forbid OCR extraction, but by default the heuristic works like this:
+  #
+  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
+  #    OCR is used automatically.
+  #  * Extract the text of each page with **pdftotext**, if the page has less
+  #    than 100 bytes of text (a scanned image page, or a page that just
+  #    contains a filename and a page number), then add it to the list of
+  #    `@pages_to_ocr`.
+  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
+  #
+  class TextExtractor
+    NO_TEXT_DETECTED = /---------\n\Z/
+    OCR_FLAGS   = '-density 400x400 -colorspace GRAY'
+    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
+    MIN_TEXT_PER_PAGE = 100 # in bytes
+    # Start out with an empty list of pages queued for OCR.
+    def initialize
+      @pages_to_ocr = []
+    end
+    # Extract text from a list of PDFs.
+    def extract(pdfs, opts)
+      extract_options opts
+      FileUtils.mkdir_p @output unless File.exists?(@output)
+      [pdfs].flatten.each do |pdf|
+        @pdf_name = File.basename(pdf, File.extname(pdf))
+        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
+        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
+          extract_from_ocr(pdf, pages)
+        else
+          extract_from_pdf(pdf, pages)
+          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
+            extract_from_ocr(pdf, @pages_to_ocr)
+          end
+        end
+      end
+    end
+    # Does a PDF have any text embedded?
+    def contains_text?(pdf)
+      fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
+      !fonts.match(NO_TEXT_DETECTED)
+    end
+    # Extract a page range worth of text from a PDF, directly.
+    def extract_from_pdf(pdf, pages)
+      return extract_full(pdf) unless pages
+      pages.each {|page| extract_page(pdf, page) }
+    end
+    # Extract a page range worth of text from a PDF via OCR.
+    def extract_from_ocr(pdf, pages)
+      tempdir = Dir.mktmpdir
+      base_path = File.join(@output, @pdf_name)
+      escaped_pdf = ESCAPE[pdf]
+      if pages
+        pages.each do |page|
+          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+          escaped_tiff = ESCAPE[tiff]
+          file = "#{base_path}_#{page}"
+          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
+          clean_text(file + '.txt') if @clean_ocr
+          FileUtils.remove_entry_secure tiff
+        end
+      else
+        tiff = "#{tempdir}/#{@pdf_name}.tif"
+        escaped_tiff = ESCAPE[tiff]
+        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
+        clean_text(base_path + '.txt') if @clean_ocr
+      end
+    ensure
+      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+    end
+    private
+    # Rewrite the file in place with its contents passed through
+    # Docsplit.clean_text.
+    def clean_text(file)
+      File.open(file, 'r+') do |f|
+        text = f.read
+        # Empty the file and move back to the start before writing, so the
+        # cleaned text replaces the old contents.
+        f.truncate(0)
+        f.rewind
+        f.write(Docsplit.clean_text(text))
+      end
+    end
+    # Run an external process and raise an exception if it fails.
+    def run(command)
+      result = `#{command}`
+      raise ExtractionFailed, result if $? != 0
+      result
+    end
+    # Extract the full contents of a pdf as a single file, directly.
+    def extract_full(pdf)
+      text_path = File.join(@output, "#{@pdf_name}.txt")
+      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+    end
+    # Extract the contents of a single page of text, directly, adding it to
+    # the `@pages_to_ocr` list if the text length is inadequate.
+    def extract_page(pdf, page)
+      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
+      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
+      unless @forbid_ocr
+        @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
+      end
+    end
+    def extract_options(options)
+      @output     = options[:output] || '.'
+      @pages      = options[:pages]
+      @force_ocr  = options[:ocr] == true
+      @forbid_ocr = options[:ocr] == false
+      @clean_ocr  = !(options[:clean] == false)
+      @language   = options[:language] || 'eng'
+    end
+  end
+end

data/lib/docsplit/transparent_pdfs.rb ADDED Viewed

@@ -0,0 +1,26 @@
+module Docsplit
+  # Include a method to transparently convert non-PDF arguments to temporary
+  # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
+  module TransparentPDFs
+    # Temporarily convert any non-PDF documents to PDFs before running them
+    # through further extraction.
+    def ensure_pdfs(docs)
+      [docs].flatten.map do |doc|
+        ext = File.extname(doc)
+        if ext.downcase == '.pdf'
+          doc
+        else
+          tempdir = File.join(Dir.tmpdir, 'docsplit')
+          extract_pdf([doc], {:output => tempdir})
+          File.join(tempdir, File.basename(doc, ext) + '.pdf')
+        end
+      end
+    end
+  end
+  extend TransparentPDFs
+end

data/lib/docsplit.rb ADDED Viewed

@@ -0,0 +1,130 @@
+require 'tmpdir'
+require 'fileutils'
+require 'shellwords'
+# The Docsplit module delegates to the Java PDF extractors.
+module Docsplit
+  VERSION       = '0.6.4' # Keep in sync with gemspec.
+  # Shell-escapes a single argument for interpolation into a command line.
+  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
+  # Root directory of the gem (the parent of lib/).
+  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
+  ESCAPED_ROOT  = ESCAPE[ROOT]
+  # Flags for the java command line assembled in `run`.
+  CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
+  LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
+  HEADLESS      = "-Djava.awt.headless=true"
+  # Look for an office installation, preferring OpenOffice. File.exist? is
+  # used because the File.exists? alias was removed in Ruby 3.2, where it
+  # would make loading this file fail outright.
+  office ||= "/usr/lib/openoffice" if File.exist? '/usr/lib/openoffice'
+  office ||= "/usr/lib/libreoffice" if File.exist? '/usr/lib/libreoffice'
+  # No office.home is passed on darwin. Elsewhere the flag is always passed,
+  # with an empty value when neither directory above exists.
+  OFFICE        = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
+  # Metadata fields for which an `extract_<key>` method is generated below.
+  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
+  # MIME types that `extract_pdf` converts with GraphicsMagick instead of JODConverter.
+  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
+  # External executables; each flag is switched to true below when the tool is found on the PATH.
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
+  # Check for all dependencies, and note their absence.
+  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
+  DEPENDENCIES.each_key do |dep|
+    dirs.each do |dir|
+      if File.executable?(File.join(dir, dep.to_s))
+        DEPENDENCIES[dep] = true
+        break
+      end
+    end
+  end
+  # Raised when an extraction command fails, for example because the PDF is
+  # encrypted or otherwise broken. The message carries the command's output.
+  class ExtractionFailed < StandardError; end
+  # Use the ExtractPages Java class to burst a PDF into single pages.
+  def self.extract_pages(pdfs, opts={})
+    pdfs = ensure_pdfs(pdfs)
+    PageExtractor.new.extract(pdfs, opts)
+  end
+  # Use the ExtractText Java class to write out all embedded text.
+  def self.extract_text(pdfs, opts={})
+    pdfs = ensure_pdfs(pdfs)
+    TextExtractor.new.extract(pdfs, opts)
+  end
+  # Use the ExtractImages Java class to rasterize a PDF into each page's image.
+  def self.extract_images(pdfs, opts={})
+    pdfs = ensure_pdfs(pdfs)
+    opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
+    ImageExtractor.new.extract(pdfs, opts)
+  end
+  # Use JODCConverter to extract the documents as PDFs.
+  # If the document is in an image format, use GraphicsMagick to extract the PDF.
+  def self.extract_pdf(docs, opts={})
+    out = opts[:output] || '.'
+    FileUtils.mkdir_p out unless File.exists?(out)
+    [docs].flatten.each do |doc|
+      ext = File.extname(doc)
+      basename = File.basename(doc, ext)
+      escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
+      if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
+        `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
+      else
+        options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
+        run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
+      end
+    end
+  end
+  # Define a module-level `extract_<key>` method for each of the metadata
+  # keys that we support. Each generated method converts its inputs to PDFs
+  # if needed and asks InfoExtractor for that single bit of metadata.
+  METADATA_KEYS.each do |key|
+    instance_eval <<-EOS
+      def self.extract_#{key}(pdfs, opts={})
+        pdfs = ensure_pdfs(pdfs)
+        InfoExtractor.new.extract(:#{key}, pdfs, opts)
+      end
+    EOS
+  end
+  # Utility method to clean OCR'd text with garbage characters.
+  # Returns the result of running `text` through a fresh TextCleaner.
+  def self.clean_text(text)
+    TextCleaner.new.clean(text)
+  end
+  private
+  # Runs a Java command, with quieted logging, and the classpath set properly.
+  def self.run(command, pdfs, opts, return_output=false)
+    pdfs    = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
+    cmd     = "java #{HEADLESS} #{LOGGING} #{OFFICE} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
+    result  = `#{cmd}`.chomp
+    raise ExtractionFailed, result if $? != 0
+    return return_output ? (result.empty? ? nil : result) : true
+  end
+  # Normalize a value in an options hash for the command line.
+  # Ranges look like: 1-10, Arrays like: 1,2,3.
+  def self.normalize_value(value)
+    case value
+    when Range then normalize_range(value)
+    when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
+    else            value.to_s
+    end
+  end
+end
+require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
+require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"

data/vendor/conf/document-formats.js ADDED Viewed

@@ -0,0 +1,233 @@
+[
+  {
+    "name": "Portable Document Format",
+    "extension": "pdf",
+    "mediaType": "application/pdf",
+    "storePropertiesByFamily": {
+      "DRAWING": {"FilterName": "draw_pdf_Export"},
+      "SPREADSHEET": {"FilterName": "calc_pdf_Export"},
+      "PRESENTATION": {"FilterName": "impress_pdf_Export"},
+      "TEXT": {"FilterName": "writer_pdf_Export"}
+    }
+  },
+  {
+    "name": "Macromedia Flash",
+    "extension": "swf",
+    "mediaType": "application/x-shockwave-flash",
+    "storePropertiesByFamily": {
+      "DRAWING": {"FilterName": "draw_flash_Export"},
+      "PRESENTATION": {"FilterName": "impress_flash_Export"}
+    }
+  },
+  {
+    "name": "HTML",
+    "extension": "html",
+    "mediaType": "text/html",
+    "inputFamily": "TEXT",
+    "storePropertiesByFamily": {
+      "SPREADSHEET": {"FilterName": "HTML (StarCalc)"},
+      "PRESENTATION": {"FilterName": "impress_html_Export"},
+      "TEXT": {"FilterName": "HTML (StarWriter)"}
+    }
+  },
+  {
+    "name": "OpenDocument Text",
+    "extension": "odt",
+    "mediaType": "application/vnd.oasis.opendocument.text",
+    "inputFamily": "TEXT",
+    "storePropertiesByFamily": {"TEXT": {"FilterName": "writer8"}}
+  },
+  {
+    "name": "OpenOffice.org 1.0 Text Document",
+    "extension": "sxw",
+    "mediaType": "application/vnd.sun.xml.writer",
+    "inputFamily": "TEXT",
+    "storePropertiesByFamily": {"TEXT": {"FilterName": "StarOffice XML (Writer)"}}
+  },
+  {
+    "name": "Microsoft Word",
+    "extension": "doc",
+    "mediaType": "application/msword",
+    "inputFamily": "TEXT",
+    "storePropertiesByFamily": {"TEXT": {"FilterName": "MS Word 97"}}
+  },
+  {
+    "name": "Microsoft Word 2007 XML",
+    "extension": "docx",
+    "mediaType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "inputFamily": "TEXT"
+  },
+  {
+    "name": "Rich Text Format",
+    "extension": "rtf",
+    "mediaType": "text/rtf",
+    "inputFamily": "TEXT",
+    "storePropertiesByFamily": {"TEXT": {"FilterName": "Rich Text Format"}}
+  },
+  {
+    "name": "WordPerfect",
+    "extension": "wpd",
+    "mediaType": "application/wordperfect",
+    "inputFamily": "TEXT"
+  },
+  {
+    "name": "Plain Text",
+    "extension": "txt",
+    "mediaType": "text/plain",
+    "inputFamily": "TEXT",
+    "loadProperties": {
+      "FilterName": "Text (encoded)",
+      "FilterOptions": "utf8"
+    },
+    "storePropertiesByFamily": {"TEXT": {
+      "FilterName": "Text (encoded)",
+      "FilterOptions": "utf8"
+    }}
+  },
+  {
+    "name": "MediaWiki wikitext",
+    "extension": "wiki",
+    "mediaType": "text/x-wiki",
+    "storePropertiesByFamily": {"TEXT": {"FilterName": "MediaWiki"}}
+  },
+  {
+    "name": "OpenDocument Spreadsheet",
+    "extension": "ods",
+    "mediaType": "application/vnd.oasis.opendocument.spreadsheet",
+    "inputFamily": "SPREADSHEET",
+    "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "calc8"}}
+  },
+  {
+    "name": "OpenOffice.org 1.0 Spreadsheet",
+    "extension": "sxc",
+    "mediaType": "application/vnd.sun.xml.calc",
+    "inputFamily": "SPREADSHEET",
+    "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "StarOffice XML (Calc)"}}
+  },
+  {
+    "name": "Microsoft Excel",
+    "extension": "xls",
+    "mediaType": "application/vnd.ms-excel",
+    "inputFamily": "SPREADSHEET",
+    "storePropertiesByFamily": {"SPREADSHEET": {"FilterName": "MS Excel 97"}}
+  },
+  {
+    "name": "Microsoft Excel 2007 XML",
+    "extension": "xlsx",
+    "mediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    "inputFamily": "SPREADSHEET"
+  },
+  {
+    "name": "Comma Separated Values",
+    "extension": "csv",
+    "mediaType": "text/csv",
+    "inputFamily": "SPREADSHEET",
+    "loadProperties": {
+      "FilterName": "Text - txt - csv (StarCalc)",
+      "FilterOptions": "44,34,0"
+    },
+    "storePropertiesByFamily": {"SPREADSHEET": {
+      "FilterName": "Text - txt - csv (StarCalc)",
+      "FilterOptions": "44,34,0"
+    }}
+  },
+  {
+    "name": "Tab Separated Values",
+    "extension": "tsv",
+    "mediaType": "text/tab-separated-values",
+    "inputFamily": "SPREADSHEET",
+    "loadProperties": {
+      "FilterName": "Text - txt - csv (StarCalc)",
+      "FilterOptions": "9,34,0"
+    },
+    "storePropertiesByFamily": {"SPREADSHEET": {
+      "FilterName": "Text - txt - csv (StarCalc)",
+      "FilterOptions": "9,34,0"
+    }}
+  },
+  {
+    "name": "OpenDocument Presentation",
+    "extension": "odp",
+    "mediaType": "application/vnd.oasis.opendocument.presentation",
+    "inputFamily": "PRESENTATION",
+    "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "impress8"}}
+  },
+  {
+    "name": "OpenOffice.org 1.0 Presentation",
+    "extension": "sxi",
+    "mediaType": "application/vnd.sun.xml.impress",
+    "inputFamily": "PRESENTATION",
+    "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "StarOffice XML (Impress)"}}
+  },
+  {
+    "name": "Microsoft PowerPoint",
+    "extension": "ppt",
+    "mediaType": "application/vnd.ms-powerpoint",
+    "inputFamily": "PRESENTATION",
+    "storePropertiesByFamily": {"PRESENTATION": {"FilterName": "MS PowerPoint 97"}}
+  },
+  {
+    "name": "Microsoft PowerPoint 2007 XML",
+    "extension": "pptx",
+    "mediaType": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    "inputFamily": "PRESENTATION"
+  },
+  {
+    "name": "OpenDocument Drawing",
+    "extension": "odg",
+    "mediaType": "application/vnd.oasis.opendocument.graphics",
+    "inputFamily": "DRAWING",
+    "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw8"}}
+  },
+  {
+    "name": "Scalable Vector Graphics",
+    "extension": "svg",
+    "mediaType": "image/svg+xml",
+    "storePropertiesByFamily": {"DRAWING": {"FilterName": "draw_svg_Export"}}
+  },
+  {
+    "name": "Portable Network Graphic",
+    "extension": "png",
+    "mediaType": "image/png",
+    "storePropertiesByFamily": {
+      "DRAWING": {"FilterName": "draw_png_Export"},
+      "PRESENTATION": {"FilterName": "impress_png_Export"}
+    }
+  },
+  {
+    "name": "Graphics Interchange Format",
+    "extension": "gif",
+    "mediaType": "image/gif",
+    "storePropertiesByFamily": {
+      "DRAWING": {"FilterName": "draw_gif_Export"},
+      "PRESENTATION": {"FilterName": "impress_gif_Export"}
+    }
+  },
+  {
+    "name": "Joint Photographic Experts Group",
+    "extension": "jpg",
+    "mediaType": "image/jpeg",
+    "storePropertiesByFamily": {
+      "DRAWING": {"FilterName": "draw_jpg_Export"},
+      "PRESENTATION": {"FilterName": "impress_jpg_Export"}
+    }
+  },
+  {
+    "name": "Windows Bitmap",
+    "extension": "bmp",
+    "mediaType": "image/bmp",
+    "storePropertiesByFamily": {
+      "DRAWING": {"FilterName": "draw_bmp_Export"},
+      "PRESENTATION": {"FilterName": "impress_bmp_Export"}
+    }
+  },
+  {
+    "name": "Tagged Image File Format",
+    "extension": "tif",
+    "mediaType": "image/tiff",
+    "storePropertiesByFamily": {
+      "DRAWING": {"FilterName": "draw_tif_Export"},
+      "PRESENTATION": {"FilterName": "impress_tif_Export"}
+    }
+  }
+]

data/vendor/jodconverter/commons-cli-1.1.jar ADDED Viewed

Binary file

data/vendor/jodconverter/commons-io-1.4.jar ADDED Viewed

Binary file

data/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar ADDED Viewed

Binary file

data/vendor/jodconverter/json-20090211.jar ADDED Viewed

Binary file

data/vendor/jodconverter/juh-3.2.1.jar ADDED Viewed

Binary file

data/vendor/jodconverter/jurt-3.2.1.jar ADDED Viewed

Binary file

data/vendor/jodconverter/ridl-3.2.1.jar ADDED Viewed

Binary file

data/vendor/jodconverter/unoil-3.2.1.jar ADDED Viewed

Binary file

data/vendor/logging.properties ADDED Viewed

@@ -0,0 +1 @@
+.level=WARNING

metadata ADDED Viewed

@@ -0,0 +1,72 @@
+--- !ruby/object:Gem::Specification
+name: mateusmaso-docsplit
+version: !ruby/object:Gem::Version
+  version: 0.6.4
+  prerelease:
+platform: ruby
+authors:
+- Jeremy Ashkenas
+- Samuel Clay
+- Ted Han
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-02-05 00:00:00.000000000 Z
+dependencies: []
+description: ! "    Docsplit is a command-line utility and Ruby library for splitting
+  apart\n    documents into their component parts: searchable UTF-8 plain text, page\n
+  \   images or thumbnails in any format, PDFs, single pages, and document\n    metadata
+  (title, author, number of pages...)\n"
+email: jeremy@documentcloud.org
+executables:
+- docsplit
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/docsplit/command_line.rb
+- lib/docsplit/image_extractor.rb
+- lib/docsplit/info_extractor.rb
+- lib/docsplit/page_extractor.rb
+- lib/docsplit/text_cleaner.rb
+- lib/docsplit/text_extractor.rb
+- lib/docsplit/transparent_pdfs.rb
+- lib/docsplit.rb
+- bin/docsplit
+- vendor/conf/document-formats.js
+- vendor/jodconverter/commons-cli-1.1.jar
+- vendor/jodconverter/commons-io-1.4.jar
+- vendor/jodconverter/jodconverter-core-3.0-beta-4.jar
+- vendor/jodconverter/json-20090211.jar
+- vendor/jodconverter/juh-3.2.1.jar
+- vendor/jodconverter/jurt-3.2.1.jar
+- vendor/jodconverter/ridl-3.2.1.jar
+- vendor/jodconverter/unoil-3.2.1.jar
+- vendor/logging.properties
+- docsplit.gemspec
+- LICENSE
+- README
+homepage: http://github.com/mateusmaso/docsplit
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: docsplit
+rubygems_version: 1.8.10
+signing_key:
+specification_version: 3
+summary: Break Apart Documents into Images, Text, Pages and PDFs
+test_files: []