RubyGems - docsplit - Versions diffs - 0.6.4 → 0.7.0 - Mend

docsplit 0.6.4 → 0.7.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (6)

data/docsplit.gemspec +3 -3
data/lib/docsplit.rb +10 -38
data/lib/docsplit/info_extractor.rb +23 -5
data/lib/docsplit/pdf_extractor.rb +132 -0
data/lib/docsplit/transparent_pdfs.rb +1 -1
metadata +25 -39

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.6.4'         # Keep version in sync with docsplit.rb
-  s.date      = '2012-11-12'
+  s.version   = '0.7.0'         # Keep version in sync with docsplit.rb
+  s.date      = '2013-02-21'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -13,7 +13,7 @@ Gem::Specification.new do |s|
   EOS
   s.authors           = ['Jeremy Ashkenas', 'Samuel Clay', 'Ted Han']
-  s.email             = 'jeremy@documentcloud.org'
+  s.email             = 'opensource@documentcloud.org'
   s.rubyforge_project = 'docsplit'
   s.require_paths     = ['lib']

data/lib/docsplit.rb CHANGED Viewed

@@ -5,24 +5,13 @@ require 'shellwords'
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.6.4' # Keep in sync with gemspec.
+  VERSION       = '0.7.0' # Keep in sync with gemspec.
   ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
   ESCAPED_ROOT  = ESCAPE[ROOT]
-  CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
-  LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
-  HEADLESS      = "-Djava.awt.headless=true"
-  office ||= "/usr/lib/openoffice" if File.exists? '/usr/lib/openoffice'
-  office ||= "/usr/lib/libreoffice" if File.exists? '/usr/lib/libreoffice'
-  OFFICE        = RUBY_PLATFORM.match(/darwin/i) ? '' : "-Doffice.home=#{office}"
   METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
   GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
@@ -66,20 +55,7 @@ module Docsplit
   # Use JODCConverter to extract the documents as PDFs.
   # If the document is in an image format, use GraphicsMagick to extract the PDF.
   def self.extract_pdf(docs, opts={})
-    out = opts[:output] || '.'
-    FileUtils.mkdir_p out unless File.exists?(out)
-    [docs].flatten.each do |doc|
-      ext = File.extname(doc)
-      basename = File.basename(doc, ext)
-      escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
-      if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
-        `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
-      else
-        options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
-        run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
-      end
-    end
+    PdfExtractor.new.extract(docs, opts)
   end
   # Define custom methods for each of the metadata keys that we support.
@@ -92,30 +68,25 @@ module Docsplit
       end
     EOS
   end
+  def self.extract_info(pdfs, opts={})
+    pdfs = ensure_pdfs(pdfs)
+    InfoExtractor.new.extract_all(pdfs, opts)
+  end
   # Utility method to clean OCR'd text with garbage characters.
   def self.clean_text(text)
     TextCleaner.new.clean(text)
   end
   private
-  # Runs a Java command, with quieted logging, and the classpath set properly.
-  def self.run(command, pdfs, opts, return_output=false)
-    pdfs    = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
-    cmd     = "java #{HEADLESS} #{LOGGING} #{OFFICE} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
-    result  = `#{cmd}`.chomp
-    raise ExtractionFailed, result if $? != 0
-    return return_output ? (result.empty? ? nil : result) : true
-  end
   # Normalize a value in an options hash for the command line.
   # Ranges look like: 1-10, Arrays like: 1,2,3.
   def self.normalize_value(value)
     case value
-    when Range then normalize_range(value)
-    when Array then value.map! {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
+    when Range then value.to_a.join(',')
+    when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
     else            value.to_s
     end
   end
@@ -126,5 +97,6 @@ require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/pdf_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"

data/lib/docsplit/info_extractor.rb CHANGED Viewed

@@ -17,16 +17,34 @@ module Docsplit
     # Pull out a single datum from a pdf.
     def extract(key, pdfs, opts)
+      extract_all(pdfs, opts)[key]
+    end
+    def extract_all(pdfs, opts)
       pdf = [pdfs].flatten.first
       cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
       result = `#{cmd}`.chomp
       raise ExtractionFailed, result if $? != 0
-      match = result.match(MATCHERS[key])
-      answer = match && match[1]
-      answer = answer.to_i if answer && key == :length
-      answer
+      # ruby  1.8 (iconv) and 1.9 (String#encode) :
+      if String.method_defined?(:encode)
+        result.encode!('UTF-8', 'UTF-8', :invalid => :replace)
+      else
+        require 'iconv' unless defined?(Iconv)
+        ic = Iconv.new('UTF-8//IGNORE','UTF-8')
+        result = ic.iconv(result)
+      end
+      info = {}
+      MATCHERS.each do |key, matcher|
+        match = result.match(matcher)
+        answer = match && match[1]
+        if answer
+          answer = answer.to_i if key == :length
+          info[key] = answer
+        end
+      end
+      info
     end
   end
-end
+end

data/lib/docsplit/pdf_extractor.rb ADDED Viewed

@@ -0,0 +1,132 @@
+require 'rbconfig'
+module Docsplit
+  class PdfExtractor
+    @@executable = nil
+    HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
+    def windows?
+      !!HOST_OS.match(/mswin|windows|cygwin/i)
+    end
+    def osx?
+      !!HOST_OS.match(/darwin/i)
+    end
+    def linux?
+      !!HOST_OS.match(/linux/i)
+    end
+    def version_string
+      @@help ||= `#{office_executable} -h 2>&1`.split("\n").first
+    end
+    def libre_office?
+      !!version_string.match(/^LibreOffice/)
+    end
+    def open_office?
+      !!version_string.match(/^OpenOffice.org/)
+    end
+    def office_search_paths
+      if windows?
+        office_names       = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
+        program_files_path = ENV["CommonProgramFiles"]
+        search_paths       = office_name.map{ |program| File.join(program_files_path, program) }
+      elsif osx?
+        search_paths = %w(
+          /Applications/LibreOffice.app/Contents
+          /Applications/OpenOffice.org.app/Contents
+        )
+      else # probably linux/unix
+        search_paths = %w(
+          /usr/lib/libreoffice
+          /opt/libreoffice
+          /usr/lib/openoffice
+          /opt/openoffice.org3
+        )
+      end
+      search_paths
+    end
+    def office_executable
+      paths = office_search_paths
+      if ENV['OFFICE_PATH']
+        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
+        paths.unshift(ENV['OFFICE_PATH'])
+      end
+      path_pieces = ["soffice"]
+      if windows?
+        path_pieces += [["program", "soffice.bin"]]
+      elsif osx?
+        path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
+      else
+        path_pieces += [["program", "soffice"]]
+      end
+      paths.each do |path|
+        if File.exists? path
+          @@executable ||= path unless File.directory? path
+          path_pieces.each do |pieces|
+            check_path = File.join(path, pieces)
+            @@executable ||= check_path if File.exists? check_path
+          end
+        end
+        break if @@executable
+      end
+      raise OfficeNotFound, "No office software found" unless @@executable
+      @@executable
+    end
+    def office_path
+      File.dirname(File.dirname(office_executable))
+    end
+    def extract(docs, opts)
+      out = opts[:output] || '.'
+      FileUtils.mkdir_p out unless File.exists?(out)
+      [docs].flatten.each do |doc|
+        ext = File.extname(doc)
+        basename = File.basename(doc, ext)
+        escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
+        if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
+          `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
+        else
+          if libre_office?
+            options = "--headless --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
+            cmd = "#{office_executable} #{options} 2>&1"
+            result = `#{cmd}`.chomp
+            raise ExtractionFailed, result if $? != 0
+            true
+          else # open office presumably
+            options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
+            run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
+          end
+        end
+      end
+    end
+    CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
+    LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
+    HEADLESS      = "-Djava.awt.headless=true"
+    private
+    # Runs a Java command, with quieted logging, and the classpath set properly.
+    def run_jod(command, pdfs, opts, return_output=false)
+      pdfs   = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
+      office = osx? ? "-Doffice.home=#{office_path}" : office_path
+      cmd    = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
+      result = `#{cmd}`.chomp
+      raise ExtractionFailed, result if $? != 0
+      return return_output ? (result.empty? ? nil : result) : true
+    end
+    class OfficeNotFound < StandardError; end
+  end
+end

data/lib/docsplit/transparent_pdfs.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module Docsplit
         if ext.downcase == '.pdf'
           doc
         else
-          tempdir = File.join(Dir.tmpdir, 'docsplit')
+          tempdir = Dir.mktmpdir
           extract_pdf([doc], {:output => tempdir})
           File.join(tempdir, File.basename(doc, ext) + '.pdf')
         end

metadata CHANGED Viewed

@@ -1,38 +1,33 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: docsplit
-version: !ruby/object:Gem::Version
-  hash: 15
+version: !ruby/object:Gem::Version
+  version: 0.7.0
   prerelease:
-  segments:
-  - 0
-  - 6
-  - 4
-  version: 0.6.4
 platform: ruby
-authors:
+authors:
 - Jeremy Ashkenas
 - Samuel Clay
 - Ted Han
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-11-12 00:00:00 Z
+date: 2013-02-21 00:00:00.000000000 Z
 dependencies: []
-description: "    Docsplit is a command-line utility and Ruby library for splitting apart\n    documents into their component parts: searchable UTF-8 plain text, page\n    images or thumbnails in any format, PDFs, single pages, and document\n    metadata (title, author, number of pages...)\n"
-email: jeremy@documentcloud.org
-executables:
+description: ! "    Docsplit is a command-line utility and Ruby library for splitting
+  apart\n    documents into their component parts: searchable UTF-8 plain text, page\n
+  \   images or thumbnails in any format, PDFs, single pages, and document\n    metadata
+  (title, author, number of pages...)\n"
+email: opensource@documentcloud.org
+executables:
 - docsplit
 extensions: []
 extra_rdoc_files: []
-files:
+files:
 - lib/docsplit/command_line.rb
 - lib/docsplit/image_extractor.rb
 - lib/docsplit/info_extractor.rb
 - lib/docsplit/page_extractor.rb
+- lib/docsplit/pdf_extractor.rb
 - lib/docsplit/text_cleaner.rb
 - lib/docsplit/text_extractor.rb
 - lib/docsplit/transparent_pdfs.rb
@@ -53,36 +48,27 @@ files:
 - README
 homepage: http://documentcloud.github.com/docsplit/
 licenses: []
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
 rubyforge_project: docsplit
 rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: Break Apart Documents into Images, Text, Pages and PDFs
 test_files: []
+has_rdoc: