RubyGems - docsplit - Versions diffs - 0.6.3 → 0.6.4 - Mend

docsplit 0.6.3 → 0.6.4

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (6)

data/docsplit.gemspec +2 -2
data/lib/docsplit.rb +12 -10
data/lib/docsplit/command_line.rb +3 -0
data/lib/docsplit/image_extractor.rb +1 -1
data/lib/docsplit/page_extractor.rb +6 -1
metadata +10 -7

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.6.3'         # Keep version in sync with docsplit.rb
-  s.date      = '2011-11-23'
+  s.version   = '0.6.4'         # Keep version in sync with docsplit.rb
+  s.date      = '2012-11-12'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"

data/lib/docsplit.rb CHANGED Viewed

@@ -1,13 +1,20 @@
+require 'tmpdir'
+require 'fileutils'
+require 'shellwords'
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.6.3' # Keep in sync with gemspec.
+  VERSION       = '0.6.4' # Keep in sync with gemspec.
+  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
+  ESCAPED_ROOT  = ESCAPE[ROOT]
-  CLASSPATH     = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"
+  CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
-  LOGGING       = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"
+  LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
   HEADLESS      = "-Djava.awt.headless=true"
@@ -20,9 +27,7 @@ module Docsplit
   GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
-  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -71,7 +76,7 @@ module Docsplit
       if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
         `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
       else
-        options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ROOT}/vendor/conf/document-formats.js"
+        options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
         run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
       end
     end
@@ -117,9 +122,6 @@ module Docsplit
 end
-require 'tmpdir'
-require 'fileutils'
-require 'shellwords'
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"

data/lib/docsplit/command_line.rb CHANGED Viewed

@@ -94,6 +94,9 @@ Options:
         opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
           @options[:clean] = false
         end
+        opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
+          @options[:language] = l
+        end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true
         end

data/lib/docsplit/image_extractor.rb CHANGED Viewed

@@ -42,7 +42,7 @@ module Docsplit
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
         end

data/lib/docsplit/page_extractor.rb CHANGED Viewed

@@ -11,7 +11,12 @@ module Docsplit
         pdf_name = File.basename(pdf, File.extname(pdf))
         page_path = File.join(@output, "#{pdf_name}_%d.pdf")
         FileUtils.mkdir_p @output unless File.exists?(@output)
-        cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
+        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
+          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
+        else
+          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
+        end
         result = `#{cmd}`.chomp
         FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
         raise ExtractionFailed, result if $? != 0

metadata CHANGED Viewed

@@ -1,12 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
-  prerelease: false
+  hash: 15
+  prerelease:
   segments:
   - 0
   - 6
-  - 3
-  version: 0.6.3
+  - 4
+  version: 0.6.4
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -16,8 +17,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-11-23 00:00:00 -06:00
-default_executable:
+date: 2012-11-12 00:00:00 Z
 dependencies: []
 description: "    Docsplit is a command-line utility and Ruby library for splitting apart\n    documents into their component parts: searchable UTF-8 plain text, page\n    images or thumbnails in any format, PDFs, single pages, and document\n    metadata (title, author, number of pages...)\n"
@@ -51,7 +51,6 @@ files:
 - docsplit.gemspec
 - LICENSE
 - README
-has_rdoc: true
 homepage: http://documentcloud.github.com/docsplit/
 licenses: []
@@ -61,23 +60,27 @@ rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"
 requirements: []
 rubyforge_project: docsplit
-rubygems_version: 1.3.6
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: Break Apart Documents into Images, Text, Pages and PDFs