RubyGems - docsplit - Versions diffs - 0.3.3 → 0.3.4 - Mend

docsplit 0.3.3 → 0.3.4

Files changed (5) hide show

data/docsplit.gemspec +2 -2
data/lib/docsplit.rb +1 -1
data/lib/docsplit/image_extractor.rb +17 -33
data/lib/docsplit/text_extractor.rb +6 -6
metadata +4 -4

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.3.3'         # Keep version in sync with docsplit.rb
-  s.date      = '2010-8-17'
+  s.version   = '0.3.4'         # Keep version in sync with docsplit.rb
+  s.date      = '2010-8-20'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"

data/lib/docsplit.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.3.3' # Keep in sync with gemspec.
+  VERSION       = '0.3.4' # Keep in sync with gemspec.
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')

data/lib/docsplit/image_extractor.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module Docsplit
   class ImageExtractor
     DENSITY_ARG     = "-density 150"
-    MEMORY_ARGS     = "-limit memory 128MiB -limit map 256MiB"
+    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
     DEFAULT_FORMAT  = :png
     # Extract a list of PDFs as rasterized page images, according to the
@@ -23,22 +23,29 @@ module Docsplit
     end
     # Convert a single PDF into page images at the specified size and format.
+    # If `--rolling`, and we have a previous image at a larger size to work with,
+    # we simply downsample that image, instead of re-rendering the entire PDF.
+    # Now we generate one page at a time, a counterintuitive optimization
+    # suggested by the GraphicsMagick list, that seems to work quite well.
     def convert(pdf, size, format, previous=nil)
       tempdir   = Dir.mktmpdir
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
+      pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
-      out_file  = File.join(directory, "#{basename}_%05d.#{format}")
       common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
+        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
+        raise ExtractionFailed, result if $? != 0
       else
-        cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
+        page_list(pages).each do |page|
+          out_file  = File.join(directory, "#{basename}_#{page}.#{format}")
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
+          result = `#{cmd}`.chomp
+          raise ExtractionFailed, result if $? != 0
+        end
       end
-      result = `#{cmd}`.chomp
-      raise ExtractionFailed, result if $? != 0
-      renumber_images(out_file, format)
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
@@ -76,39 +83,16 @@ module Docsplit
       end
     end
-    # Generate the requested page index into the document.
-    def pages_arg
-      return '' if @pages.nil?
-      pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
-      "[#{pages}]"
-    end
     # Generate the expanded list of requested page numbers.
-    def page_list
-      @pages.split(',').map { |range|
+    def page_list(pages)
+      pages.split(',').map { |range|
         if range.include?('-')
           range = range.split('-')
           Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
         else
           range.to_i
         end
-      }.flatten.sort
-    end
-    # When GraphicsMagick is through, it will have generated a number of
-    # incrementing page images, starting at 0. Renumber them with their correct
-    # page numbers.
-    def renumber_images(template, format)
-      suffixer = /_0+(\d+)\.#{format}\Z/
-      images = Dir[template.sub('%05d', '0*')].map do |path|
-        index = path[suffixer, 1].to_i
-        {:path => path, :index => index, :page_number => index + 1}
-      end
-      numbers = @pages ? page_list.reverse : nil
-      images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
-        number = numbers ? numbers[i] : image[:page_number]
-        FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
-      end
+      }.flatten.uniq.sort
     end
   end

data/lib/docsplit/text_extractor.rb CHANGED Viewed

@@ -17,13 +17,12 @@ module Docsplit
     NO_TEXT_DETECTED = /---------\n\Z/
     OCR_FLAGS   = '-density 200x200 -colorspace GRAY'
-    MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
+    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
     MIN_TEXT_PER_PAGE = 100 # in bytes
     def initialize
-      @tiffs_generated = false
-      @pages_to_ocr    = []
+      @pages_to_ocr = []
     end
     # Extract text from a list of PDFs.
@@ -61,10 +60,11 @@ module Docsplit
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       if pages
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
-        @tiffs_generated = true
         pages.each do |page|
-          run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
+          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
+          run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
+          FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
-  hash: 21
+  hash: 27
   prerelease: false
   segments:
   - 0
   - 3
-  - 3
-  version: 0.3.3
+  - 4
+  version: 0.3.4
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-08-17 00:00:00 -04:00
+date: 2010-08-20 00:00:00 -04:00
 default_executable:
 dependencies: []