RubyGems - docsplit - Versions diffs - 0.6.3 → 0.6.4 - Mend

docsplit 0.6.3 → 0.6.4

Files changed (6) hide show

data/docsplit.gemspec +2 -2
data/lib/docsplit.rb +12 -10
data/lib/docsplit/command_line.rb +3 -0
data/lib/docsplit/image_extractor.rb +1 -1
data/lib/docsplit/page_extractor.rb +6 -1
metadata +10 -7

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.6.3'         # Keep version in sync with docsplit.rb
-  s.date      = '2011-11-23'
+  s.version   = '0.6.4'         # Keep version in sync with docsplit.rb
+  s.date      = '2012-11-12'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"

data/lib/docsplit.rb CHANGED Viewed

@@ -1,13 +1,20 @@
+require 'tmpdir'
+require 'fileutils'
+require 'shellwords'
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.6.3' # Keep in sync with gemspec.
+  VERSION       = '0.6.4' # Keep in sync with gemspec.
+  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
+  ESCAPED_ROOT  = ESCAPE[ROOT]
-  CLASSPATH     = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"
+  CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
-  LOGGING       = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"
+  LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
   HEADLESS      = "-Djava.awt.headless=true"
@@ -20,9 +27,7 @@ module Docsplit
   GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
-  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -71,7 +76,7 @@ module Docsplit
       if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
         `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
       else
-        options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ROOT}/vendor/conf/document-formats.js"
+        options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
         run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
       end
     end
@@ -117,9 +122,6 @@ module Docsplit
 end
-require 'tmpdir'
-require 'fileutils'
-require 'shellwords'
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"

data/lib/docsplit/command_line.rb CHANGED Viewed

@@ -94,6 +94,9 @@ Options:
         opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
           @options[:clean] = false
         end
+        opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
+          @options[:language] = l
+        end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true
         end

data/lib/docsplit/image_extractor.rb CHANGED Viewed

@@ -42,7 +42,7 @@ module Docsplit
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
         end

data/lib/docsplit/page_extractor.rb CHANGED Viewed

@@ -11,7 +11,12 @@ module Docsplit
         pdf_name = File.basename(pdf, File.extname(pdf))
         page_path = File.join(@output, "#{pdf_name}_%d.pdf")
         FileUtils.mkdir_p @output unless File.exists?(@output)
-        cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
+        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
+          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
+        else
+          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
+        end
         result = `#{cmd}`.chomp
         FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
         raise ExtractionFailed, result if $? != 0

metadata CHANGED Viewed

@@ -1,12 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
-  prerelease: false
+  hash: 15
+  prerelease:
   segments:
   - 0
   - 6
-  - 3
-  version: 0.6.3
+  - 4
+  version: 0.6.4
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -16,8 +17,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-11-23 00:00:00 -06:00
-default_executable:
+date: 2012-11-12 00:00:00 Z
 dependencies: []
 description: "    Docsplit is a command-line utility and Ruby library for splitting apart\n    documents into their component parts: searchable UTF-8 plain text, page\n    images or thumbnails in any format, PDFs, single pages, and document\n    metadata (title, author, number of pages...)\n"
@@ -51,7 +51,6 @@ files:
 - docsplit.gemspec
 - LICENSE
 - README
-has_rdoc: true
 homepage: http://documentcloud.github.com/docsplit/
 licenses: []
@@ -61,23 +60,27 @@ rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"
 requirements: []
 rubyforge_project: docsplit
-rubygems_version: 1.3.6
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: Break Apart Documents into Images, Text, Pages and PDFs