RubyGems - docsplit - Versions diffs - 0.7.5 → 0.7.6 - Mend

docsplit 0.7.5 → 0.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/docsplit.gemspec +2 -2
data/lib/docsplit.rb +10 -3
data/lib/docsplit/command_line.rb +3 -1
data/lib/docsplit/page_extractor.rb +4 -4
data/lib/docsplit/pdf_extractor.rb +5 -1
data/lib/docsplit/text_extractor.rb +12 -9
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ebdecba4d9b5b3a19244e08a2f3bcbaff8d8fab1
-  data.tar.gz: adb719d204a184313c1d282c837a7d1c929977ff
+  metadata.gz: 6c4106dcd5d8d9f8f6a1915a99a438b293154e1e
+  data.tar.gz: 90450ce6412bbedb022f4bc68ec7171f47b5d829
 SHA512:
-  metadata.gz: 8e58b660472bbff77ce500e50007c1dcdeec9adce59d527d9e331b42bde76d4ff9f2df9aece56e3793eef9d49795aafeddc8a7dfc9bd56a4fa07ca69fa080ca2
-  data.tar.gz: 1ca0706ec0b4eca050c9a1d0a45858365f5bde18b639c3ff641278dc777a1edf17128585f264b2a128e36a5388d4b2c7b9386e8f0b1993e850bc995d625c731b
+  metadata.gz: 1f6ccf476687ce1bf3a5559f07d0f7d8ebd2a80034b102b3058f538fb962a3b537b8e3eaeb245df27f14a4dc70716b69e34599bb50edf3e99e7b8a7b3f38d98d
+  data.tar.gz: 912d974bc4ed17942d32a932232439cd2df6903d6d20e72af31e0e80a1c70fc5e58d4be63bd00f245c53be90dc93a815ffd41a25268072367a1a244a5cb59ec4

data/docsplit.gemspec CHANGED

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.7.5'         # Keep version in sync with docsplit.rb
-  s.date      = '2014-05-28'
+  s.version   = '0.7.6'         # Keep version in sync with docsplit.rb
+  s.date      = '2014-11-17'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"

data/lib/docsplit.rb CHANGED

@@ -5,7 +5,7 @@ require 'shellwords'
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.7.5' # Keep in sync with gemspec.
+  VERSION       = '0.7.6' # Keep in sync with gemspec.
   ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
@@ -16,7 +16,7 @@ module Docsplit
   GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -29,7 +29,14 @@ module Docsplit
     end
   end
-  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
+  # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
+  if DEPENDENCIES[:tesseract]
+    # osd will be listed in tesseract --listlangs
+    val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
+    DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
+  end
+    # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
   # broke.
   class ExtractionFailed < StandardError; end

data/lib/docsplit/command_line.rb CHANGED

@@ -96,7 +96,9 @@ Options:
         end
         opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
           @options[:language] = l
-          @options[:clean] = false
+        end
+        opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
+          @options[:detect_orientation] = false
         end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true

data/lib/docsplit/page_extractor.rb CHANGED

@@ -9,13 +9,13 @@ module Docsplit
       extract_options opts
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
-        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
+        page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
         FileUtils.mkdir_p @output unless File.exists?(@output)
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
-          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
+          "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
         else
-          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
+          "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
         FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
@@ -33,4 +33,4 @@ module Docsplit
   end
-end
+end

data/lib/docsplit/pdf_extractor.rb CHANGED

@@ -23,7 +23,7 @@ module Docsplit
       unless @@version_string
         null = windows? ? "NUL" : "/dev/null"
         @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
-        if !!@@version_string.match(/[0-9]*/)
+        if !!@@version_string.to_s.match(/[0-9]*/)
           @@version_string = `#{office_executable} --version`.split("\n").first
         end
       end
@@ -61,6 +61,10 @@ module Docsplit
           /usr/lib64/openoffice
           /opt/openoffice.org3
           /app/vendor/libreoffice
+          /usr/bin/libreoffice
+          /usr/local/bin
+          /usr/lib64/libreoffice
+          /usr/lib64/openoffice.org3
         )
       end
       search_paths

data/lib/docsplit/text_extractor.rb CHANGED

@@ -60,13 +60,14 @@ module Docsplit
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
+      psm = @detect_orientation ? "-psm 1" : ""
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
+          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
@@ -74,7 +75,8 @@ module Docsplit
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
+        #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
+        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
@@ -117,14 +119,15 @@ module Docsplit
     end
     def extract_options(options)
-      @output     = options[:output] || '.'
-      @pages      = options[:pages]
-      @force_ocr  = options[:ocr] == true
-      @forbid_ocr = options[:ocr] == false
-      @clean_ocr  = !(options[:clean] == false)
-      @language   = options[:language] || 'eng'
+      @output             = options[:output] || '.'
+      @pages              = options[:pages]
+      @force_ocr          = options[:ocr] == true
+      @forbid_ocr         = options[:ocr] == false
+      @language           = options[:language] || 'eng'
+      @clean_ocr          = (!(options[:clean] == false) and @language == 'eng')
+      @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
     end
   end
-end
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
-  version: 0.7.5
+  version: 0.7.6
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-05-28 00:00:00.000000000 Z
+date: 2014-11-17 00:00:00.000000000 Z
 dependencies: []
 description: |2
       Docsplit is a command-line utility and Ruby library for splitting apart