RubyGems - docsplit - Versions diffs - 0.3.1 → 0.3.3 - Mend

docsplit 0.3.1 → 0.3.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

data/docsplit.gemspec +2 -2
data/lib/docsplit.rb +1 -1
data/lib/docsplit/image_extractor.rb +4 -2
data/lib/docsplit/text_extractor.rb +6 -6
data/lib/docsplit/transparent_pdfs.rb +3 -3
metadata +9 -4

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.3.1'         # Keep version in sync with docsplit.rb
-  s.date      = '2010-8-10'
+  s.version   = '0.3.3'         # Keep version in sync with docsplit.rb
+  s.date      = '2010-8-17'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"

data/lib/docsplit.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.3.1' # Keep in sync with gemspec.
+  VERSION       = '0.3.3' # Keep in sync with gemspec.
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')

data/lib/docsplit/image_extractor.rb CHANGED Viewed

@@ -24,6 +24,7 @@ module Docsplit
     # Convert a single PDF into page images at the specified size and format.
     def convert(pdf, size, format, previous=nil)
+      tempdir   = Dir.mktmpdir
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
@@ -31,13 +32,14 @@ module Docsplit
       common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
+        cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
       else
-        cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
+        cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
       end
       result = `#{cmd}`.chomp
       raise ExtractionFailed, result if $? != 0
       renumber_images(out_file, format)
+      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end

data/lib/docsplit/text_extractor.rb CHANGED Viewed

@@ -42,7 +42,6 @@ module Docsplit
           end
         end
       end
-      FileUtils.remove_entry_secure @tempdir if @tempdir
     end
     # Does a PDF have any text embedded?
@@ -59,19 +58,20 @@ module Docsplit
     # Extract a page range worth of text from a PDF via OCR.
     def extract_from_ocr(pdf, pages)
-      @tempdir  ||= Dir.mktmpdir
+      tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       if pages
-        run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
+        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
         @tiffs_generated = true
         pages.each do |page|
-          run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
+          run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
         end
       else
-        tiff = "#{@tempdir}/#{@pdf_name}.tif"
-        run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
+        tiff = "#{tempdir}/#{@pdf_name}.tif"
+        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
         run "tesseract #{tiff} #{base_path} -l eng 2>&1"
       end
+      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end

data/lib/docsplit/transparent_pdfs.rb CHANGED Viewed

@@ -12,9 +12,9 @@ module Docsplit
         if ext.downcase == '.pdf'
           doc
         else
-          @tempdir ||= File.join(Dir.tmpdir, 'docsplit')
-          extract_pdf([doc], {:output => @tempdir})
-          File.join(@tempdir, File.basename(doc, ext) + '.pdf')
+          tempdir = File.join(Dir.tmpdir, 'docsplit')
+          extract_pdf([doc], {:output => tempdir})
+          File.join(tempdir, File.basename(doc, ext) + '.pdf')
         end
       end
     end

metadata CHANGED Viewed

@@ -1,12 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
+  hash: 21
   prerelease: false
   segments:
   - 0
   - 3
-  - 1
-  version: 0.3.1
+  - 3
+  version: 0.3.3
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -15,7 +16,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-08-10 00:00:00 -04:00
+date: 2010-08-17 00:00:00 -04:00
 default_executable:
 dependencies: []
@@ -60,23 +61,27 @@ rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"
 requirements: []
 rubyforge_project: docsplit
-rubygems_version: 1.3.6
+rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
 summary: Break Apart Documents into Images, Text, Pages and PDFs