RubyGems - luccasmaso-docsplit - Versions diffs - 0.7.4.1 → 0.7.4.2 - Mend

luccasmaso-docsplit 0.7.4.1 → 0.7.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/docsplit.gemspec +3 -4
data/lib/docsplit.rb +10 -3
data/lib/docsplit/command_line.rb +3 -1
data/lib/docsplit/page_extractor.rb +4 -4
data/lib/docsplit/pdf_extractor.rb +5 -1
data/lib/docsplit/text_extractor.rb +12 -9
data/lib/docsplit/transparent_pdfs.rb +6 -3
metadata +3 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5c348a5547895d1dadbbd0330346bf1e295677a4
-  data.tar.gz: 5f90fad7ad4849b9b85854a4a50e8450b13b1b9e
+  metadata.gz: 503116ea38655488e9a9f29ca17b2be29a3585c0
+  data.tar.gz: 26b0bc22c27bda0ef2e711a7cb0ecd46852ee5f7
 SHA512:
-  metadata.gz: 90186b801914fe20ab18f1ab2c8bc5669a9933745e312f0a326d185c0060f4d7cc2d2b65194a8b9f1fe4db39495041a63222d6b2ead650ae991fd50124ea503c
-  data.tar.gz: eacf34acea2bfc17da2c0d84bdc2a696d9a32e22ca29326727a07cad422f82de6e11f98646d9b362d6cfb8f2dd248d96c091a965b048fd5e4f886ba5e3c7da4f
+  metadata.gz: 490f30ec2b1410e026c30a8fafc6676d5e117e33f4c326b11875a209d98df62964fca4ae1bf28397319408df425184c3b0d6b0fc9f8f98586545ae83df040cfa
+  data.tar.gz: 0dbac40654167ff244b0dea9552219537f4a0a8947d5587bd2fdb206d0d85916f0421ab19fc50d81ccaaeeb9c5b241dda3bee4bd7d74b52e8d15e0528a0222a6

data/docsplit.gemspec CHANGED

@@ -1,8 +1,7 @@
 Gem::Specification.new do |s|
-  s.name      = 'luccasmaso-docsplit'
-  s.version   = '0.7.4.1'         # Keep version in sync with docsplit.rb
-  s.date      = '2014-02-16'
+  s.name        = 'luccasmaso-docsplit'
+  s.version     = '0.7.4.2'         # Keep version in sync with docsplit.rb
+  s.date        = '2014-11-17'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
   s.description = <<-EOS

data/lib/docsplit.rb CHANGED

@@ -5,7 +5,7 @@ require 'shellwords'
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.7.4' # Keep in sync with gemspec.
+  VERSION       = '0.7.6' # Keep in sync with gemspec.
   ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
@@ -16,7 +16,7 @@ module Docsplit
   GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -29,7 +29,14 @@ module Docsplit
     end
   end
-  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
+  # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
+  if DEPENDENCIES[:tesseract]
+    # osd will be listed in tesseract --listlangs
+    val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
+    DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
+  end
+    # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
   # broke.
   class ExtractionFailed < StandardError; end

data/lib/docsplit/command_line.rb CHANGED

@@ -99,7 +99,9 @@ Options:
         end
         opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
           @options[:language] = l
-          @options[:clean] = false
+        end
+        opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
+          @options[:detect_orientation] = false
         end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true

data/lib/docsplit/page_extractor.rb CHANGED

@@ -9,13 +9,13 @@ module Docsplit
       extract_options opts
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
-        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
+        page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
         FileUtils.mkdir_p @output unless File.exists?(@output)
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
-          "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
+          "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
         else
-          "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
+          "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
         FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
@@ -33,4 +33,4 @@ module Docsplit
   end
-end
+end

data/lib/docsplit/pdf_extractor.rb CHANGED

@@ -23,7 +23,7 @@ module Docsplit
       unless @@version_string
         null = windows? ? "NUL" : "/dev/null"
         @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
-        if !!@@version_string.match(/[0-9]*/)
+        if !!@@version_string.to_s.match(/[0-9]*/)
           @@version_string = `#{office_executable} --version`.split("\n").first
         end
       end
@@ -61,6 +61,10 @@ module Docsplit
           /usr/lib64/openoffice
           /opt/openoffice.org3
           /app/vendor/libreoffice
+          /usr/bin/libreoffice
+          /usr/local/bin
+          /usr/lib64/libreoffice
+          /usr/lib64/openoffice.org3
         )
       end
       search_paths

data/lib/docsplit/text_extractor.rb CHANGED

@@ -60,13 +60,14 @@ module Docsplit
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
+      psm = @detect_orientation ? "-psm 1" : ""
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
+          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
@@ -74,7 +75,8 @@ module Docsplit
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
+        #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
+        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
@@ -117,14 +119,15 @@ module Docsplit
     end
     def extract_options(options)
-      @output     = options[:output] || '.'
-      @pages      = options[:pages]
-      @force_ocr  = options[:ocr] == true
-      @forbid_ocr = options[:ocr] == false
-      @clean_ocr  = !(options[:clean] == false)
-      @language   = options[:language] || 'eng'
+      @output             = options[:output] || '.'
+      @pages              = options[:pages]
+      @force_ocr          = options[:ocr] == true
+      @forbid_ocr         = options[:ocr] == false
+      @language           = options[:language] || 'eng'
+      @clean_ocr          = (!(options[:clean] == false) and @language == 'eng')
+      @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
     end
   end
-end
+end

data/lib/docsplit/transparent_pdfs.rb CHANGED

@@ -8,17 +8,20 @@ module Docsplit
     # through further extraction.
     def ensure_pdfs(docs)
       [docs].flatten.map do |doc|
-        ext = File.extname(doc)
-        if ext.downcase == '.pdf' || File.open(doc, &:readline).force_encoding("BINARY") =~ /\A\%PDF-\d+(\.\d+)?$/
+        if is_pdf?(doc)
           doc
         else
           tempdir = File.join(Dir.tmpdir, 'docsplit')
           extract_pdf([doc], {:output => tempdir})
-          File.join(tempdir, File.basename(doc, ext) + '.pdf')
+          File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
         end
       end
     end
+    def is_pdf?(doc)
+      File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline).force_encoding("BINARY") =~ /\A\%PDF-\d+(\.\d+)?/
+    end
   end
   extend TransparentPDFs

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: luccasmaso-docsplit
 version: !ruby/object:Gem::Version
-  version: 0.7.4.1
+  version: 0.7.4.2
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-02-16 00:00:00.000000000 Z
+date: 2014-11-17 00:00:00.000000000 Z
 dependencies: []
 description: |2
       Docsplit is a command-line utility and Ruby library for splitting apart
@@ -66,7 +66,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project: docsplit
-rubygems_version: 2.1.11
+rubygems_version: 2.0.14
 signing_key:
 specification_version: 4
 summary: Break Apart Documents into Images, Text, Pages and PDFs