RubyGems - burisu-docsplit - Versions diffs - 0.7.7 → 0.7.8 - Mend

burisu-docsplit 0.7.7 → 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +4 -4
data/docsplit.gemspec +1 -1
data/lib/docsplit.rb +10 -3
data/lib/docsplit/command_line.rb +3 -1
data/lib/docsplit/pdf_extractor.rb +4 -0
data/lib/docsplit/text_extractor.rb +12 -9
metadata +4 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 370a33126963926b13bef202fb15e05127a02db0
-  data.tar.gz: 76024f613e3ad9a339cc207ac428037c4ba6f7ef
+  metadata.gz: cc1638e9d3bdaf775ea840631c3e342a68e33059
+  data.tar.gz: 83834d054f1c95520aa375ebbd4d3f3ced1617d3
 SHA512:
-  metadata.gz: d3564ec6ea484e25fd09f8e3b135bdbfb31c02ed64e74f5f3f269c38fbd58ab9f2c0d63cf9387cedd7eb10549832d583819f2caf09e9d2c2b3316da1c31243e4
-  data.tar.gz: 6a88a1820ab2bf23a0dacab2d54d5949de9bee18d1ba2bda86ce67948dfec97f2dee8dd3195a01033199d2e0b5c28b8e530c7d493aaafa910d2781af82371ee8
+  metadata.gz: 93d0291009a6fb31e016f68862ee97a33cdd7f27f94c37a197078ccbe9c77f8ba5cbc5ccb19367991734e43065db7eb112510be9952c9fdbd16fb39fd5072f36
+  data.tar.gz: d0a3485206de1367d07b10ab800197396928b178c3516ae2796cda108ebb992cb19b2951579edaf7a6ec24e7ce7f0ad3a2f43ef20448664b84fe9285843eb709

data/docsplit.gemspec CHANGED

@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name      = 'burisu-docsplit'
-  s.version   = '0.7.7'         # Keep version in sync with docsplit.rb
+  s.version   = '0.7.8'         # Keep version in sync with docsplit.rb
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
   s.description = <<-EOS

data/lib/docsplit.rb CHANGED

@@ -5,7 +5,7 @@ require 'shellwords'
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.7.5' # Keep in sync with gemspec.
+  VERSION       = '0.7.6' # Keep in sync with gemspec.
   ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
@@ -16,7 +16,7 @@ module Docsplit
   GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -29,7 +29,14 @@ module Docsplit
     end
   end
-  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
+  # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
+  if DEPENDENCIES[:tesseract]
+    # osd will be listed in tesseract --listlangs
+    val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
+    DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
+  end
+    # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
   # broke.
   class ExtractionFailed < StandardError; end

data/lib/docsplit/command_line.rb CHANGED

@@ -96,7 +96,9 @@ Options:
         end
         opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
           @options[:language] = l
-          @options[:clean] = false
+        end
+        opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
+          @options[:detect_orientation] = false
         end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true

data/lib/docsplit/pdf_extractor.rb CHANGED

@@ -61,6 +61,10 @@ module Docsplit
           /usr/lib64/openoffice
           /opt/openoffice.org3
           /app/vendor/libreoffice
+          /usr/bin/libreoffice
+          /usr/local/bin
+          /usr/lib64/libreoffice
+          /usr/lib64/openoffice.org3
         )
       end
       search_paths

data/lib/docsplit/text_extractor.rb CHANGED

@@ -60,13 +60,14 @@ module Docsplit
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
+      psm = @detect_orientation ? "-psm 1" : ""
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
+          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
@@ -74,7 +75,8 @@ module Docsplit
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
+        #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
+        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
@@ -117,14 +119,15 @@ module Docsplit
     end
     def extract_options(options)
-      @output     = options[:output] || '.'
-      @pages      = options[:pages]
-      @force_ocr  = options[:ocr] == true
-      @forbid_ocr = options[:ocr] == false
-      @clean_ocr  = !(options[:clean] == false)
-      @language   = options[:language] || 'eng'
+      @output             = options[:output] || '.'
+      @pages              = options[:pages]
+      @force_ocr          = options[:ocr] == true
+      @forbid_ocr         = options[:ocr] == false
+      @language           = options[:language] || 'eng'
+      @clean_ocr          = (!(options[:clean] == false) and @language == 'eng')
+      @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
     end
   end
-end
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: burisu-docsplit
 version: !ruby/object:Gem::Version
-  version: 0.7.7
+  version: 0.7.8
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-18 00:00:00.000000000 Z
+date: 2015-06-26 00:00:00.000000000 Z
 dependencies: []
 description: |2
       Docsplit is a command-line utility and Ruby library for splitting apart
@@ -66,8 +66,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.2
+rubygems_version: 2.4.5
 signing_key:
 specification_version: 4
 summary: Break Apart Documents into Images, Text, Pages and PDFs
 test_files: []
+has_rdoc: