RubyGems - docsplit - Versions diffs - 0.5.1 → 0.5.2 - Mend

docsplit 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.5.1'         # Keep version in sync with docsplit.rb
-  s.date      = '2010-04-26'
+  s.version   = '0.5.2'         # Keep version in sync with docsplit.rb
+  s.date      = '2011-05-13'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -15,7 +15,6 @@ Gem::Specification.new do |s|
   s.authors           = ['Jeremy Ashkenas', 'Samuel Clay']
   s.email             = 'jeremy@documentcloud.org'
   s.rubyforge_project = 'docsplit'
-  s.has_rdoc          = false
   s.require_paths     = ['lib']
   s.executables       = ['docsplit']

data/lib/docsplit.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.5.1' # Keep in sync with gemspec.
+  VERSION       = '0.5.2' # Keep in sync with gemspec.
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
@@ -14,6 +14,8 @@ module Docsplit
   OFFICE        = RUBY_PLATFORM.match(/darwin/i) ? '' : '-Doffice.home=/usr/lib/openoffice'
   METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
+  GM_FORMATS    = [:png, :gif, :jpg, :jpeg, :tif, :tiff, :bmp, :pnm, :ppm, :svg, :eps]
   DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
@@ -53,11 +55,19 @@ module Docsplit
   end
   # Use JODCConverter to extract the documents as PDFs.
+  # If the document is in an image format, use GraphicsMagick to extract the PDF.
   def self.extract_pdf(docs, opts={})
+    out = opts[:output] || '.'
+    FileUtils.mkdir_p out unless File.exists?(out)
     [docs].flatten.each do |doc|
-      basename = File.basename(doc, File.extname(doc))
-      options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
-      run "#{options} \"#{doc}\" \"#{opts[:output] || '.'}/#{basename}.pdf\"", [], {}
+      ext = File.extname(doc)
+      basename = File.basename(doc, ext)
+      if GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
+        `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
+      else
+        options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
+        run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
+      end
     end
   end

data/lib/docsplit/text_cleaner.rb CHANGED Viewed

@@ -1,4 +1,3 @@
-require 'iconv'
 require 'strscan'
 module Docsplit
@@ -36,6 +35,7 @@ module Docsplit
     # For the time being, `clean` uses the regular StringScanner, and not the
     # multibyte-aware version, coercing to ASCII first.
     def clean(text)
+      require 'iconv' unless defined?(Iconv)
       text    = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
       scanner = StringScanner.new(text)
       cleaned = []

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
-  hash: 9
+  hash: 15
   prerelease:
   segments:
   - 0
   - 5
-  - 1
-  version: 0.5.1
+  - 2
+  version: 0.5.2
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-04-26 00:00:00 Z
+date: 2011-05-13 00:00:00 Z
 dependencies: []
 description: "    Docsplit is a command-line utility and Ruby library for splitting apart\n    documents into their component parts: searchable UTF-8 plain text, page\n    images or thumbnails in any format, PDFs, single pages, and document\n    metadata (title, author, number of pages...)\n"