RubyGems - docsplit - Versions diffs - 0.3.3 → 0.3.4 - Mend

docsplit 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

data/docsplit.gemspec +2 -2
data/lib/docsplit.rb +1 -1
data/lib/docsplit/image_extractor.rb +17 -33
data/lib/docsplit/text_extractor.rb +6 -6
metadata +4 -4

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.3.3'         # Keep version in sync with docsplit.rb
-  s.date      = '2010-8-17'
+  s.version   = '0.3.4'         # Keep version in sync with docsplit.rb
+  s.date      = '2010-8-20'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"

data/lib/docsplit.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.3.3' # Keep in sync with gemspec.
+  VERSION       = '0.3.4' # Keep in sync with gemspec.
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')

data/lib/docsplit/image_extractor.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module Docsplit
   class ImageExtractor
     DENSITY_ARG     = "-density 150"
-    MEMORY_ARGS     = "-limit memory 128MiB -limit map 256MiB"
+    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
     DEFAULT_FORMAT  = :png
     # Extract a list of PDFs as rasterized page images, according to the
@@ -23,22 +23,29 @@ module Docsplit
     end
     # Convert a single PDF into page images at the specified size and format.
+    # If `--rolling`, and we have a previous image at a larger size to work with,
+    # we simply downsample that image, instead of re-rendering the entire PDF.
+    # Now we generate one page at a time, a counterintuitive optimization
+    # suggested by the GraphicsMagick list, that seems to work quite well.
     def convert(pdf, size, format, previous=nil)
       tempdir   = Dir.mktmpdir
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
+      pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
-      out_file  = File.join(directory, "#{basename}_%05d.#{format}")
       common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
+        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
+        raise ExtractionFailed, result if $? != 0
       else
-        cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
+        page_list(pages).each do |page|
+          out_file  = File.join(directory, "#{basename}_#{page}.#{format}")
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
+          result = `#{cmd}`.chomp
+          raise ExtractionFailed, result if $? != 0
+        end
       end
-      result = `#{cmd}`.chomp
-      raise ExtractionFailed, result if $? != 0
-      renumber_images(out_file, format)
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
@@ -76,39 +83,16 @@ module Docsplit
       end
     end
-    # Generate the requested page index into the document.
-    def pages_arg
-      return '' if @pages.nil?
-      pages = @pages.gsub(/\d+/) {|digits| (digits.to_i - 1).to_s }
-      "[#{pages}]"
-    end
     # Generate the expanded list of requested page numbers.
-    def page_list
-      @pages.split(',').map { |range|
+    def page_list(pages)
+      pages.split(',').map { |range|
         if range.include?('-')
           range = range.split('-')
           Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
         else
           range.to_i
         end
-      }.flatten.sort
-    end
-    # When GraphicsMagick is through, it will have generated a number of
-    # incrementing page images, starting at 0. Renumber them with their correct
-    # page numbers.
-    def renumber_images(template, format)
-      suffixer = /_0+(\d+)\.#{format}\Z/
-      images = Dir[template.sub('%05d', '0*')].map do |path|
-        index = path[suffixer, 1].to_i
-        {:path => path, :index => index, :page_number => index + 1}
-      end
-      numbers = @pages ? page_list.reverse : nil
-      images.sort_by {|i| -i[:page_number] }.each_with_index do |image, i|
-        number = numbers ? numbers[i] : image[:page_number]
-        FileUtils.mv(image[:path], image[:path].sub(suffixer, "_#{number}.#{format}"))
-      end
+      }.flatten.uniq.sort
     end
   end

data/lib/docsplit/text_extractor.rb CHANGED Viewed

@@ -17,13 +17,12 @@ module Docsplit
     NO_TEXT_DETECTED = /---------\n\Z/
     OCR_FLAGS   = '-density 200x200 -colorspace GRAY'
-    MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
+    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
     MIN_TEXT_PER_PAGE = 100 # in bytes
     def initialize
-      @tiffs_generated = false
-      @pages_to_ocr    = []
+      @pages_to_ocr = []
     end
     # Extract text from a list of PDFs.
@@ -61,10 +60,11 @@ module Docsplit
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       if pages
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
-        @tiffs_generated = true
         pages.each do |page|
-          run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
+          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
+          run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
+          FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
-  hash: 21
+  hash: 27
   prerelease: false
   segments:
   - 0
   - 3
-  - 3
-  version: 0.3.3
+  - 4
+  version: 0.3.4
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-08-17 00:00:00 -04:00
+date: 2010-08-20 00:00:00 -04:00
 default_executable:
 dependencies: []