RubyGems - burisu-docsplit - Versions diffs - 0.7.8 → 0.7.9 - Mend

burisu-docsplit 0.7.8 → 0.7.9

Files changed (14) hide show

checksums.yaml +4 -4
data/bin/docsplit +1 -1
data/docsplit.gemspec +7 -3
data/lib/docsplit.rb +15 -18
data/lib/docsplit/command_line.rb +20 -27
data/lib/docsplit/image_extractor.rb +18 -23
data/lib/docsplit/info_extractor.rb +14 -18
data/lib/docsplit/page_extractor.rb +8 -13
data/lib/docsplit/pdf_extractor.rb +38 -35
data/lib/docsplit/text_cleaner.rb +20 -24
data/lib/docsplit/text_extractor.rb +11 -16
data/lib/docsplit/transparent_pdfs.rb +2 -6
data/lib/docsplit/version.rb +3 -0
metadata +4 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: cc1638e9d3bdaf775ea840631c3e342a68e33059
-  data.tar.gz: 83834d054f1c95520aa375ebbd4d3f3ced1617d3
+  metadata.gz: 2ad2a468e06ca5502c5899d1b7a7b5ea3ed0c42d
+  data.tar.gz: 48fdaf6262a31252476bb55c2a54ef6697799079
 SHA512:
-  metadata.gz: 93d0291009a6fb31e016f68862ee97a33cdd7f27f94c37a197078ccbe9c77f8ba5cbc5ccb19367991734e43065db7eb112510be9952c9fdbd16fb39fd5072f36
-  data.tar.gz: d0a3485206de1367d07b10ab800197396928b178c3516ae2796cda108ebb992cb19b2951579edaf7a6ec24e7ce7f0ad3a2f43ef20448664b84fe9285843eb709
+  metadata.gz: c5b222b0b49176dd10c3bf7583c74ecede1b40f8f00dc0bbee5f056997f305b4e1ac80f69956365432c37ea05e3d4143c740cd62be589d4888d0d1d320a0fd08
+  data.tar.gz: 144e647ee2207fe57ae7a7302fa6eb626c7f7be3a25dbe29d89edae0ebe33da692fa79654a1a266d68c8cef5b4d663baf1b069d503c8a86bcad4225a86432644

data/bin/docsplit CHANGED

@@ -2,4 +2,4 @@
 require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"
-Docsplit::CommandLine.new
+Docsplit::CommandLine.new

data/docsplit.gemspec CHANGED

@@ -1,8 +1,12 @@
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'docsplit/version'
 Gem::Specification.new do |s|
   s.name      = 'burisu-docsplit'
-  s.version   = '0.7.8'         # Keep version in sync with docsplit.rb
-  s.homepage    = "http://documentcloud.github.com/docsplit/"
-  s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
+  s.version   = Docsplit::VERSION # Defined in lib/docsplit/version.rb
+  s.homepage    = 'http://documentcloud.github.com/docsplit/'
+  s.summary     = 'Break Apart Documents into Images, Text, Pages and PDFs'
   s.description = <<-EOS
     Docsplit is a command-line utility and Ruby library for splitting apart
     documents into their component parts: searchable UTF-8 plain text, page

data/lib/docsplit.rb CHANGED

@@ -1,22 +1,20 @@
 require 'tmpdir'
 require 'fileutils'
 require 'shellwords'
+require 'docsplit/version'
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.7.6' # Keep in sync with gemspec.
-  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
+  ESCAPE        = ->(x) { Shellwords.shellescape(x) }
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
   ESCAPED_ROOT  = ESCAPE[ROOT]
-  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
-  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
+  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length].freeze
+  GM_FORMATS    = ['image/gif', 'image/jpeg', 'image/png', 'image/x-ms-bmp', 'image/svg+xml', 'image/tiff', 'image/x-portable-bitmap', 'application/postscript', 'image/x-portable-pixmap'].freeze
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
+  DEPENDENCIES  = { java: false, gm: false, pdftotext: false, pdftk: false, pdftailor: false, tesseract: false, osd: false }
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -32,28 +30,28 @@ module Docsplit
   # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
   if DEPENDENCIES[:tesseract]
     # osd will be listed in the output of `tesseract --list-langs`
-    val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
+    val = `#{'tesseract --list-langs'} 2>&1 >/dev/null`
     DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
   end
-    # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
+  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
   # broken.
   class ExtractionFailed < StandardError; end
   # Use the ExtractPages Java class to burst a PDF into single pages.
-  def self.extract_pages(pdfs, opts={})
+  def self.extract_pages(pdfs, opts = {})
     pdfs = ensure_pdfs(pdfs)
     PageExtractor.new.extract(pdfs, opts)
   end
   # Use the ExtractText Java class to write out all embedded text.
-  def self.extract_text(pdfs, opts={})
+  def self.extract_text(pdfs, opts = {})
     pdfs = ensure_pdfs(pdfs)
     TextExtractor.new.extract(pdfs, opts)
   end
   # Use the ExtractImages Java class to rasterize a PDF into each page's image.
-  def self.extract_images(pdfs, opts={})
+  def self.extract_images(pdfs, opts = {})
     pdfs = ensure_pdfs(pdfs)
     opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
     ImageExtractor.new.extract(pdfs, opts)
@@ -61,7 +59,7 @@ module Docsplit
   # Use JODConverter to extract the documents as PDFs.
   # If the document is in an image format, use GraphicsMagick to extract the PDF.
-  def self.extract_pdf(docs, opts={})
+  def self.extract_pdf(docs, opts = {})
     PdfExtractor.new.extract(docs, opts)
   end
@@ -75,8 +73,8 @@ module Docsplit
       end
     EOS
   end
-  def self.extract_info(pdfs, opts={})
+  def self.extract_info(pdfs, opts = {})
     pdfs = ensure_pdfs(pdfs)
     InfoExtractor.new.extract_all(pdfs, opts)
   end
@@ -93,11 +91,10 @@ module Docsplit
   def self.normalize_value(value)
     case value
     when Range then value.to_a.join(',')
-    when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
+    when Array then value.map! { |v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
     else            value.to_s
     end
   end
 end
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"

data/lib/docsplit/command_line.rb CHANGED

@@ -2,11 +2,9 @@ require 'optparse'
 require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
 module Docsplit
   # A single command-line utility to separate a PDF into all its component parts.
   class CommandLine
-    BANNER = <<-EOS
+    BANNER = <<-EOS.freeze
 docsplit breaks apart documents into images, text, or individual pages.
 It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
@@ -39,24 +37,22 @@ Options:
     # Delegate to the Docsplit Ruby API to perform all extractions.
     def run
-      begin
-        case @command
-        when :images  then Docsplit.extract_images(ARGV, @options)
-        when :pages   then Docsplit.extract_pages(ARGV, @options)
-        when :text    then Docsplit.extract_text(ARGV, @options)
-        when :pdf     then Docsplit.extract_pdf(ARGV, @options)
+      case @command
+      when :images  then Docsplit.extract_images(ARGV, @options)
+      when :pages   then Docsplit.extract_pages(ARGV, @options)
+      when :text    then Docsplit.extract_text(ARGV, @options)
+      when :pdf     then Docsplit.extract_pdf(ARGV, @options)
+      else
+        if METADATA_KEYS.include?(@command)
+          value = Docsplit.send("extract_#{@command}", ARGV, @options)
+          puts value unless value.nil?
         else
-          if METADATA_KEYS.include?(@command)
-            value = Docsplit.send("extract_#{@command}", ARGV, @options)
-            puts value unless value.nil?
-          else
-            usage
-          end
+          usage
         end
-      rescue ExtractionFailed => e
-        puts e.message.chomp
-        exit(1)
       end
+    rescue ExtractionFailed => e
+      puts e.message.chomp
+      exit(1)
     end
     # Print out the usage help message.
@@ -65,18 +61,17 @@ Options:
       exit
     end
     private
     # Use the OptionParser library to parse out all supported options. Return
     # options formatted for the Ruby API.
     def parse_options
-      @options = {:ocr => :default, :clean => true}
+      @options = { ocr: :default, clean: true }
       @option_parser = OptionParser.new do |opts|
         opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
           @options[:output] = d
         end
-        opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
+        opts.on('-p', '--pages [PAGES]', 'extract specific pages (eg: 5-10)') do |p|
           @options[:pages] = p
         end
         opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
@@ -91,16 +86,16 @@ Options:
         opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
           @options[:ocr] = o
         end
-        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
+        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |_c|
           @options[:clean] = false
         end
         opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
           @options[:language] = l
         end
-        opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
+        opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |_n|
           @options[:detect_orientation] = false
         end
-        opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
+        opts.on('-r', '--rolling', 'generate images from each previous image') do |_r|
           @options[:rolling] = true
         end
         opts.on_tail('-v', '--version', 'display docsplit version') do
@@ -119,7 +114,5 @@ Options:
         exit(1)
       end
     end
   end
-end
+end

data/lib/docsplit/image_extractor.rb CHANGED

@@ -1,12 +1,10 @@
 module Docsplit
   # Delegates to GraphicsMagick in order to convert PDF documents into
   # nicely sized images.
   class ImageExtractor
-    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
+    MEMORY_ARGS     = '-limit memory 256MiB -limit map 512MiB'.freeze
     DEFAULT_FORMAT  = :png
-    DEFAULT_DENSITY = '150'
+    DEFAULT_DENSITY = '150'.freeze
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
@@ -15,8 +13,8 @@ module Docsplit
       extract_options(options)
       @pdfs.each do |pdf|
         previous = nil
-        @sizes.each_with_index do |size, i|
-          @formats.each {|format| convert(pdf, size, format, previous) }
+        @sizes.each_with_index do |size, _i|
+          @formats.each { |format| convert(pdf, size, format, previous) }
           previous = size if @rolling
         end
       end
@@ -27,36 +25,35 @@ module Docsplit
     # we simply downsample that image, instead of re-rendering the entire PDF.
     # Now we generate one page at a time, a counterintuitive optimization
     # suggested by the GraphicsMagick list, that seems to work quite well.
-    def convert(pdf, size, format, previous=nil)
+    def convert(pdf, size, format, previous = nil)
       tempdir   = Dir.mktmpdir
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
-      FileUtils.mkdir_p(directory) unless File.exists?(directory)
-      common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+      FileUtils.mkdir_p(directory) unless File.exist?(directory)
+      common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
         result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
-        raise ExtractionFailed, result if $? != 0
+        raise ExtractionFailed, result if $?.exitstatus.nonzero?
       else
         page_list(pages).each do |page|
-          out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
+          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
           cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
-          raise ExtractionFailed, result if $? != 0
+          raise ExtractionFailed, result if $?.exitstatus.nonzero?
         end
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
     private
     # Extract the relevant GraphicsMagick options from the options hash.
     def extract_options(options)
-      @output  = options[:output]  || '.'
+      @output  = options[:output] || '.'
       @pages   = options[:pages]
       @density = options[:density] || DEFAULT_DENSITY
       @formats = [options[:format] || DEFAULT_FORMAT].flatten
@@ -80,24 +77,22 @@ module Docsplit
     # Generate the appropriate quality argument for the image format.
     def quality_arg(format)
       case format.to_s
-      when /jpe?g/ then "-quality 85"
-      when /png/   then "-quality 100"
-      else ""
+      when /jpe?g/ then '-quality 85'
+      when /png/   then '-quality 100'
+      else ''
       end
     end
     # Generate the expanded list of requested page numbers.
     def page_list(pages)
-      pages.split(',').map { |range|
+      pages.split(',').map do |range|
         if range.include?('-')
           range = range.split('-')
-          Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
+          Range.new(range.first.to_i, range.last.to_i).to_a.map(&:to_i)
         else
           range.to_i
         end
-      }.flatten.uniq.sort
+      end.flatten.uniq.sort
     end
   end
 end

data/lib/docsplit/info_extractor.rb CHANGED

@@ -1,36 +1,34 @@
 module Docsplit
   # Delegates to **pdfinfo** in order to extract information about a PDF file.
   class InfoExtractor
     # Regex matchers for different bits of information.
     MATCHERS = {
-      :author   => /^Author:\s+([^\n]+)/,
-      :date     => /^CreationDate:\s+([^\n]+)/,
-      :creator  => /^Creator:\s+([^\n]+)/,
-      :keywords => /^Keywords:\s+([^\n]+)/,
-      :producer => /^Producer:\s+([^\n]+)/,
-      :subject  => /^Subject:\s+([^\n]+)/,
-      :title    => /^Title:\s+([^\n]+)/,
-      :length   => /^Pages:\s+([^\n]+)/,
-    }
+      author: /^Author:\s+([^\n]+)/,
+      date: /^CreationDate:\s+([^\n]+)/,
+      creator: /^Creator:\s+([^\n]+)/,
+      keywords: /^Keywords:\s+([^\n]+)/,
+      producer: /^Producer:\s+([^\n]+)/,
+      subject: /^Subject:\s+([^\n]+)/,
+      title: /^Title:\s+([^\n]+)/,
+      length: /^Pages:\s+([^\n]+)/
+    }.freeze
     # Pull out a single datum from a pdf.
     def extract(key, pdfs, opts)
       extract_all(pdfs, opts)[key]
     end
-    def extract_all(pdfs, opts)
+    def extract_all(pdfs, _opts)
       pdf = [pdfs].flatten.first
       cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
       result = `#{cmd}`.chomp
-      raise ExtractionFailed, result if $? != 0
+      raise ExtractionFailed, result if $?.exitstatus.nonzero?
       # ruby  1.8 (iconv) and 1.9 (String#encode) :
       if String.method_defined?(:encode)
-        result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
+        result.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '') unless result.valid_encoding?
       else
         require 'iconv' unless defined?(Iconv)
-        ic = Iconv.new('UTF-8//IGNORE','UTF-8')
+        ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
         result = ic.iconv(result)
       end
       info = {}
@@ -44,7 +42,5 @@ module Docsplit
       end
       info
     end
   end
 end

data/lib/docsplit/page_extractor.rb CHANGED

@@ -1,36 +1,31 @@
 module Docsplit
   # Delegates to **pdftk** in order to create bursted single pages from
   # a PDF document.
   class PageExtractor
     # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
     def extract(pdfs, opts)
       extract_options opts
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
-        page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
-        FileUtils.mkdir_p @output unless File.exists?(@output)
+        page_path = ESCAPE[File.join(@output, pdf_name.to_s)] + '_%d.pdf'
+        FileUtils.mkdir_p @output unless File.exist?(@output)
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
-          "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
-        else
-          "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
+                "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
+              else
+                "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
-        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
-        raise ExtractionFailed, result if $? != 0
+        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
+        raise ExtractionFailed, result if $?.exitstatus.nonzero?
         result
       end
     end
     private
     def extract_options(options)
       @output = options[:output] || '.'
     end
   end
 end

data/lib/docsplit/pdf_extractor.rb CHANGED

@@ -6,22 +6,24 @@ module Docsplit
     @@version_string = nil
     # Provide a set of helper functions to determine the OS.
-    HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
+    HOST_OS = (defined?('RbConfig') ? RbConfig : Config)::CONFIG['host_os']
     def windows?
       !!HOST_OS.match(/mswin|windows|cygwin/i)
     end
     def osx?
       !!HOST_OS.match(/darwin/i)
     end
     def linux?
       !!HOST_OS.match(/linux/i)
     end
     # The first line of the help output holds the name and version number
     # of the office software to be used for extraction.
     def version_string
       unless @@version_string
-        null = windows? ? "NUL" : "/dev/null"
+        null = windows? ? 'NUL' : '/dev/null'
         @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
         if !!@@version_string.to_s.match(/[0-9]*/)
           @@version_string = `#{office_executable} --version`.split("\n").first
@@ -29,23 +31,25 @@ module Docsplit
       end
       @@version_string
     end
     def libre_office?
       !!version_string.match(/^LibreOffice/)
     end
     def open_office?
       !!version_string.match(/^OpenOffice.org/)
     end
     # A set of default locations to search for office software
     # These have been extracted from JODConverter.  Each listed
-    # path should contain a directory "program" which in turn
+    # path should contain a directory "program" which in turn
     # contains the "soffice" executable.
     # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
     def office_search_paths
       if windows?
-        office_names       = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
-        program_files_path = ENV["CommonProgramFiles"]
-        search_paths       = office_names.map{ |program| File.join(program_files_path, program) }
+        office_names       = ['LibreOffice 3', 'LibreOffice 4', 'OpenOffice.org 3']
+        program_files_path = ENV['CommonProgramFiles']
+        search_paths       = office_names.map { |program| File.join(program_files_path, program) }
       elsif osx?
         search_paths = %w(
           /Applications/LibreOffice.app/Contents
@@ -69,7 +73,7 @@ module Docsplit
       end
       search_paths
     end
     # Identify the path to a working office executable.
     def office_executable
       paths = office_search_paths
@@ -78,45 +82,45 @@ module Docsplit
       # raise an error if that path isn't valid, otherwise, add
       # it to the front of our search paths.
       if ENV['OFFICE_PATH']
-        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
+        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
         paths.unshift(ENV['OFFICE_PATH'])
       end
       # The location of the office executable is OS dependent
-      path_pieces = ["soffice"]
+      path_pieces = ['soffice']
       if windows?
-        path_pieces += [["program", "soffice.bin"]]
+        path_pieces += [['program', 'soffice.bin']]
       elsif osx?
-        path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
+        path_pieces += [%w(MacOS soffice), %w(Contents MacOS soffice)]
       else
-        path_pieces += [["program", "soffice"]]
+        path_pieces += [%w(program soffice)]
       end
       # Search for the first suitable office executable
       # and short-circuit once an executable is found.
       paths.each do |path|
-        if File.exists? path
+        if File.exist? path
           @@executable ||= path unless File.directory? path
           path_pieces.each do |pieces|
             check_path = File.join(path, pieces)
-            @@executable ||= check_path if File.exists? check_path
+            @@executable ||= check_path if File.exist? check_path
           end
         end
         break if @@executable
       end
-      raise OfficeNotFound, "No office software found" unless @@executable
+      raise OfficeNotFound, 'No office software found' unless @@executable
       @@executable
     end
     # Used to specify the office location for JODConverter
     def office_path
       File.dirname(File.dirname(office_executable))
     end
     # Convert documents to PDF.
     def extract(docs, opts)
       out = opts[:output] || '.'
-      FileUtils.mkdir_p out unless File.exists?(out)
+      FileUtils.mkdir_p out unless File.exist?(out)
       [docs].flatten.each do |doc|
         ext = File.extname(doc)
         basename = File.basename(doc, ext)
@@ -127,12 +131,12 @@ module Docsplit
         else
           if libre_office?
             # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
-            ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
+            ENV['SYSUSERCONFIG'] = "file://#{File.expand_path(escaped_out)}"
             options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
             cmd = "#{office_executable} #{options} 2>&1"
             result = `#{cmd}`.chomp
-            raise ExtractionFailed, result if $? != 0
+            raise ExtractionFailed, result if $?.exitstatus.nonzero?
             true
           else # open office presumably, rely on JODConverter to figure it out.
             options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
@@ -142,23 +146,22 @@ module Docsplit
       end
     end
-    CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
+    CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'".freeze
-    LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
+    LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties".freeze
+    HEADLESS      = '-Djava.awt.headless=true'.freeze
-    HEADLESS      = "-Djava.awt.headless=true"
     private
-    # Runs a Java command, with quieted logging, and the classpath set properly.
-    def run_jod(command, pdfs, opts, return_output=false)
-      pdfs   = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
+    # Runs a Java command, with quieted logging, and the classpath set properly.
+    def run_jod(command, pdfs, _opts, return_output = false)
+      pdfs   = [pdfs].flatten.map { |pdf| "\"#{pdf}\"" }.join(' ')
       office = osx? ? "-Doffice.home=#{office_path}" : office_path
       cmd    = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
       result = `#{cmd}`.chomp
-      raise ExtractionFailed, result if $? != 0
-      return return_output ? (result.empty? ? nil : result) : true
+      raise ExtractionFailed, result if $?.exitstatus.nonzero?
+      return_output ? (result.empty? ? nil : result) : true
     end
     class OfficeNotFound < StandardError; end

data/lib/docsplit/text_cleaner.rb CHANGED

@@ -1,7 +1,6 @@
 require 'strscan'
 module Docsplit
   # Cleans up OCR'd text by using a series of heuristics to remove garbage
   # words. Algorithms taken from:
   #
@@ -13,7 +12,6 @@ module Docsplit
   #       -- Kulp
   #
   class TextCleaner
     # Cached regexes we plan on using.
     WORD        = /\S+/
     SPACE       = /\s+/
@@ -36,7 +34,7 @@ module Docsplit
     # multibyte-aware version, coercing to ASCII first.
     def clean(text)
       if String.method_defined?(:encode)
-        text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
+        text.encode!('ascii', invalid: :replace, undef: :replace, replace: '?')
       else
         require 'iconv' unless defined?(Iconv)
         text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
@@ -67,33 +65,31 @@ module Docsplit
       # More than 30 bytes in length.
       (w.length > 30) ||
-      # If there are three or more identical characters in a row in the string.
-      (w =~ REPEAT) ||
+        # If there are three or more identical characters in a row in the string.
+        (w =~ REPEAT) ||
-      # More punctuation than alpha numerics.
-      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
+        # More punctuation than alpha numerics.
+        (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
-      # Ignoring the first and last characters in the string, if there are three or
-      # more different punctuation characters in the string.
-      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
+        # Ignoring the first and last characters in the string, if there are three or
+        # more different punctuation characters in the string.
+        (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
-      # Four or more consecutive vowels, or five or more consecutive consonants.
-      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
+        # Four or more consecutive vowels, or five or more consecutive consonants.
+        ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
-      # Number of uppercase letters greater than lowercase letters, but the word is
-      # not all uppercase + punctuation.
-      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
+        # Number of uppercase letters greater than lowercase letters, but the word is
+        # not all uppercase + punctuation.
+        (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
-      # Single letters that are not A or I.
-      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
+        # Single letters that are not A or I.
+        (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
-      # All characters are alphabetic and there are 8 times more vowels than
-      # consonants, or 8 times more consonants than vowels.
-      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
-        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
-          (cons > vows * 8)))
+        # All characters are alphabetic and there are 8 times more vowels than
+        # consonants, or 8 times more consonants than vowels.
+        (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
+          (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
+            (cons > vows * 8)))
     end
   end
 end

data/lib/docsplit/text_extractor.rb CHANGED

@@ -1,5 +1,4 @@
 module Docsplit
   # Delegates to **pdftotext** and **tesseract** in order to extract text from
   # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
   # forbid OCR extraction, but by default the heuristic works like this:
@@ -13,11 +12,10 @@ module Docsplit
   #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
   #
   class TextExtractor
     NO_TEXT_DETECTED = /---------\n\Z/
-    OCR_FLAGS   = '-density 400x400 -colorspace GRAY'
-    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
+    OCR_FLAGS   = '-density 400x400 -colorspace GRAY'.freeze
+    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
     MIN_TEXT_PER_PAGE = 100 # in bytes
@@ -28,10 +26,10 @@ module Docsplit
     # Extract text from a list of PDFs.
     def extract(pdfs, opts)
       extract_options opts
-      FileUtils.mkdir_p @output unless File.exists?(@output)
+      FileUtils.mkdir_p @output unless File.exist?(@output)
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
-        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
+        pages = @pages == 'all' ? 1..Docsplit.extract_length(pdf) : @pages
         if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
           extract_from_ocr(pdf, pages)
         else
@@ -52,7 +50,7 @@ module Docsplit
     # Extract a page range worth of text from a PDF, directly.
     def extract_from_pdf(pdf, pages)
       return extract_full(pdf) unless pages
-      pages.each {|page| extract_page(pdf, page) }
+      pages.each { |page| extract_page(pdf, page) }
     end
     # Extract a page range worth of text from a PDF via OCR.
@@ -60,7 +58,7 @@ module Docsplit
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
-      psm = @detect_orientation ? "-psm 1" : ""
+      psm = @detect_orientation ? '-psm 1' : ''
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
@@ -75,15 +73,14 @@ module Docsplit
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
+        # if the user says don't do orientation detection or the plugin is not installed, set psm to 0
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
     private
     def clean_text(file)
@@ -98,7 +95,7 @@ module Docsplit
     # Run an external process and raise an exception if it fails.
     def run(command)
       result = `#{command}`
-      raise ExtractionFailed, result if $? != 0
+      raise ExtractionFailed, result if $?.exitstatus.nonzero?
       result
     end
@@ -124,10 +121,8 @@ module Docsplit
       @force_ocr          = options[:ocr] == true
       @forbid_ocr         = options[:ocr] == false
       @language           = options[:language] || 'eng'
-      @clean_ocr          = (!(options[:clean] == false) and @language == 'eng')
-      @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
+      @clean_ocr          = (!(options[:clean] == false) && @language == 'eng')
+      @detect_orientation = ((options[:detect_orientation] != false) && DEPENDENCIES[:osd])
     end
   end
 end

data/lib/docsplit/transparent_pdfs.rb CHANGED

@@ -1,9 +1,7 @@
 module Docsplit
   # Include a method to transparently convert non-PDF arguments to temporary
   # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
   module TransparentPDFs
     # Temporarily convert any non-PDF documents to PDFs before running them
     # through further extraction.
     def ensure_pdfs(docs)
@@ -12,18 +10,16 @@ module Docsplit
           doc
         else
           tempdir = File.join(Dir.tmpdir, 'docsplit')
-          extract_pdf([doc], {:output => tempdir})
+          extract_pdf([doc], output: tempdir)
           File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
         end
       end
     end
     def is_pdf?(doc)
-      File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
+      File.extname(doc).casecmp('.pdf').zero? || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
     end
   end
   extend TransparentPDFs
 end

data/lib/docsplit/version.rb ADDED

@@ -0,0 +1,3 @@
+module Docsplit
+  VERSION = '0.7.9'.freeze
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: burisu-docsplit
 version: !ruby/object:Gem::Version
-  version: 0.7.8
+  version: 0.7.9
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-06-26 00:00:00.000000000 Z
+date: 2016-09-06 00:00:00.000000000 Z
 dependencies: []
 description: |2
       Docsplit is a command-line utility and Ruby library for splitting apart
@@ -36,6 +36,7 @@ files:
 - lib/docsplit/text_cleaner.rb
 - lib/docsplit/text_extractor.rb
 - lib/docsplit/transparent_pdfs.rb
+- lib/docsplit/version.rb
 - vendor/conf/document-formats.js
 - vendor/jodconverter/commons-cli-1.1.jar
 - vendor/jodconverter/commons-io-1.4.jar
@@ -66,9 +67,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.5
+rubygems_version: 2.4.5.1
 signing_key:
 specification_version: 4
 summary: Break Apart Documents into Images, Text, Pages and PDFs
 test_files: []
-has_rdoc: