RubyGems - burisu-docsplit - Versions diffs - 0.7.8 → 0.7.9 - Mend

burisu-docsplit 0.7.8 → 0.7.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +4 -4
data/bin/docsplit +1 -1
data/docsplit.gemspec +7 -3
data/lib/docsplit.rb +15 -18
data/lib/docsplit/command_line.rb +20 -27
data/lib/docsplit/image_extractor.rb +18 -23
data/lib/docsplit/info_extractor.rb +14 -18
data/lib/docsplit/page_extractor.rb +8 -13
data/lib/docsplit/pdf_extractor.rb +38 -35
data/lib/docsplit/text_cleaner.rb +20 -24
data/lib/docsplit/text_extractor.rb +11 -16
data/lib/docsplit/transparent_pdfs.rb +2 -6
data/lib/docsplit/version.rb +3 -0
metadata +4 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: cc1638e9d3bdaf775ea840631c3e342a68e33059
-  data.tar.gz: 83834d054f1c95520aa375ebbd4d3f3ced1617d3
+  metadata.gz: 2ad2a468e06ca5502c5899d1b7a7b5ea3ed0c42d
+  data.tar.gz: 48fdaf6262a31252476bb55c2a54ef6697799079
 SHA512:
-  metadata.gz: 93d0291009a6fb31e016f68862ee97a33cdd7f27f94c37a197078ccbe9c77f8ba5cbc5ccb19367991734e43065db7eb112510be9952c9fdbd16fb39fd5072f36
-  data.tar.gz: d0a3485206de1367d07b10ab800197396928b178c3516ae2796cda108ebb992cb19b2951579edaf7a6ec24e7ce7f0ad3a2f43ef20448664b84fe9285843eb709
+  metadata.gz: c5b222b0b49176dd10c3bf7583c74ecede1b40f8f00dc0bbee5f056997f305b4e1ac80f69956365432c37ea05e3d4143c740cd62be589d4888d0d1d320a0fd08
+  data.tar.gz: 144e647ee2207fe57ae7a7302fa6eb626c7f7be3a25dbe29d89edae0ebe33da692fa79654a1a266d68c8cef5b4d663baf1b069d503c8a86bcad4225a86432644

data/bin/docsplit CHANGED

@@ -2,4 +2,4 @@
 require "#{File.dirname(__FILE__)}/../lib/docsplit/command_line.rb"
-Docsplit::CommandLine.new
+Docsplit::CommandLine.new

data/docsplit.gemspec CHANGED

@@ -1,8 +1,12 @@
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'docsplit/version'
 Gem::Specification.new do |s|
   s.name      = 'burisu-docsplit'
-  s.version   = '0.7.8'         # Keep version in sync with docsplit.rb
-  s.homepage    = "http://documentcloud.github.com/docsplit/"
-  s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"
+  s.version   = Docsplit::VERSION # Keep version in sync with docsplit.rb
+  s.homepage    = 'http://documentcloud.github.com/docsplit/'
+  s.summary     = 'Break Apart Documents into Images, Text, Pages and PDFs'
   s.description = <<-EOS
     Docsplit is a command-line utility and Ruby library for splitting apart
     documents into their component parts: searchable UTF-8 plain text, page

data/lib/docsplit.rb CHANGED

@@ -1,22 +1,20 @@
 require 'tmpdir'
 require 'fileutils'
 require 'shellwords'
+require 'docsplit/version'
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.7.6' # Keep in sync with gemspec.
-  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
+  ESCAPE        = ->(x) { Shellwords.shellescape(x) }
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
   ESCAPED_ROOT  = ESCAPE[ROOT]
-  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
-  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
+  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length].freeze
+  GM_FORMATS    = ['image/gif', 'image/jpeg', 'image/png', 'image/x-ms-bmp', 'image/svg+xml', 'image/tiff', 'image/x-portable-bitmap', 'application/postscript', 'image/x-portable-pixmap'].freeze
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
+  DEPENDENCIES  = { java: false, gm: false, pdftotext: false, pdftk: false, pdftailor: false, tesseract: false, osd: false }
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -32,28 +30,28 @@ module Docsplit
   # if tesseract is found check for the osd plugin so that we can do orientation independent OCR.
   if DEPENDENCIES[:tesseract]
     # osd will be listed in the output of `tesseract --list-langs`
-    val = %x[ #{'tesseract --list-langs'} 2>&1 >/dev/null ]
+    val = `#{'tesseract --list-langs'} 2>&1 >/dev/null`
     DEPENDENCIES[:osd] = true if val =~ /\bosd\b/
   end
-    # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
+  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
   # broke.
   class ExtractionFailed < StandardError; end
   # Use the ExtractPages Java class to burst a PDF into single pages.
-  def self.extract_pages(pdfs, opts={})
+  def self.extract_pages(pdfs, opts = {})
     pdfs = ensure_pdfs(pdfs)
     PageExtractor.new.extract(pdfs, opts)
   end
   # Use the ExtractText Java class to write out all embedded text.
-  def self.extract_text(pdfs, opts={})
+  def self.extract_text(pdfs, opts = {})
     pdfs = ensure_pdfs(pdfs)
     TextExtractor.new.extract(pdfs, opts)
   end
   # Use the ExtractImages Java class to rasterize a PDF into each page's image.
-  def self.extract_images(pdfs, opts={})
+  def self.extract_images(pdfs, opts = {})
     pdfs = ensure_pdfs(pdfs)
     opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
     ImageExtractor.new.extract(pdfs, opts)
@@ -61,7 +59,7 @@ module Docsplit
   # Use JODConverter to extract the documents as PDFs.
   # If the document is in an image format, use GraphicsMagick to extract the PDF.
-  def self.extract_pdf(docs, opts={})
+  def self.extract_pdf(docs, opts = {})
     PdfExtractor.new.extract(docs, opts)
   end
@@ -75,8 +73,8 @@ module Docsplit
       end
     EOS
   end
-  def self.extract_info(pdfs, opts={})
+  def self.extract_info(pdfs, opts = {})
     pdfs = ensure_pdfs(pdfs)
     InfoExtractor.new.extract_all(pdfs, opts)
   end
@@ -93,11 +91,10 @@ module Docsplit
   def self.normalize_value(value)
     case value
     when Range then value.to_a.join(',')
-    when Array then value.map! {|v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
+    when Array then value.map! { |v| v.is_a?(Range) ? normalize_value(v) : v }.join(',')
     else            value.to_s
     end
   end
 end
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"

data/lib/docsplit/command_line.rb CHANGED

@@ -2,11 +2,9 @@ require 'optparse'
 require File.expand_path(File.dirname(__FILE__) + '/../docsplit')
 module Docsplit
   # A single command-line utility to separate a PDF into all its component parts.
   class CommandLine
-    BANNER = <<-EOS
+    BANNER = <<-EOS.freeze
 docsplit breaks apart documents into images, text, or individual pages.
 It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
@@ -39,24 +37,22 @@ Options:
     # Delegate to the Docsplit Ruby API to perform all extractions.
     def run
-      begin
-        case @command
-        when :images  then Docsplit.extract_images(ARGV, @options)
-        when :pages   then Docsplit.extract_pages(ARGV, @options)
-        when :text    then Docsplit.extract_text(ARGV, @options)
-        when :pdf     then Docsplit.extract_pdf(ARGV, @options)
+      case @command
+      when :images  then Docsplit.extract_images(ARGV, @options)
+      when :pages   then Docsplit.extract_pages(ARGV, @options)
+      when :text    then Docsplit.extract_text(ARGV, @options)
+      when :pdf     then Docsplit.extract_pdf(ARGV, @options)
+      else
+        if METADATA_KEYS.include?(@command)
+          value = Docsplit.send("extract_#{@command}", ARGV, @options)
+          puts value unless value.nil?
         else
-          if METADATA_KEYS.include?(@command)
-            value = Docsplit.send("extract_#{@command}", ARGV, @options)
-            puts value unless value.nil?
-          else
-            usage
-          end
+          usage
         end
-      rescue ExtractionFailed => e
-        puts e.message.chomp
-        exit(1)
       end
+    rescue ExtractionFailed => e
+      puts e.message.chomp
+      exit(1)
     end
     # Print out the usage help message.
@@ -65,18 +61,17 @@ Options:
       exit
     end
     private
     # Use the OptionParser library to parse out all supported options. Return
     # options formatted for the Ruby API.
     def parse_options
-      @options = {:ocr => :default, :clean => true}
+      @options = { ocr: :default, clean: true }
       @option_parser = OptionParser.new do |opts|
         opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
           @options[:output] = d
         end
-        opts.on('-p', '--pages [PAGES]', "extract specific pages (eg: 5-10)") do |p|
+        opts.on('-p', '--pages [PAGES]', 'extract specific pages (eg: 5-10)') do |p|
           @options[:pages] = p
         end
         opts.on('-s', '--size [SIZE]', 'set a fixed size (eg: 50x75)') do |s|
@@ -91,16 +86,16 @@ Options:
         opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
           @options[:ocr] = o
         end
-        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
+        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |_c|
           @options[:clean] = false
         end
         opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
           @options[:language] = l
         end
-        opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |n|
+        opts.on('--no-orientation-detection', 'turn off automatic orientation detection in tesseract') do |_n|
           @options[:detect_orientation] = false
         end
-        opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
+        opts.on('-r', '--rolling', 'generate images from each previous image') do |_r|
           @options[:rolling] = true
         end
         opts.on_tail('-v', '--version', 'display docsplit version') do
@@ -119,7 +114,5 @@ Options:
         exit(1)
       end
     end
   end
-end
+end

data/lib/docsplit/image_extractor.rb CHANGED

@@ -1,12 +1,10 @@
 module Docsplit
   # Delegates to GraphicsMagick in order to convert PDF documents into
   # nicely sized images.
   class ImageExtractor
-    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
+    MEMORY_ARGS     = '-limit memory 256MiB -limit map 512MiB'.freeze
     DEFAULT_FORMAT  = :png
-    DEFAULT_DENSITY = '150'
+    DEFAULT_DENSITY = '150'.freeze
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
@@ -15,8 +13,8 @@ module Docsplit
       extract_options(options)
       @pdfs.each do |pdf|
         previous = nil
-        @sizes.each_with_index do |size, i|
-          @formats.each {|format| convert(pdf, size, format, previous) }
+        @sizes.each_with_index do |size, _i|
+          @formats.each { |format| convert(pdf, size, format, previous) }
           previous = size if @rolling
         end
       end
@@ -27,36 +25,35 @@ module Docsplit
     # we simply downsample that image, instead of re-rendering the entire PDF.
     # Now we generate one page at a time, a counterintuitive optimization
     # suggested by the GraphicsMagick list, that seems to work quite well.
-    def convert(pdf, size, format, previous=nil)
+    def convert(pdf, size, format, previous = nil)
       tempdir   = Dir.mktmpdir
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
-      FileUtils.mkdir_p(directory) unless File.exists?(directory)
-      common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+      FileUtils.mkdir_p(directory) unless File.exist?(directory)
+      common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
         result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
-        raise ExtractionFailed, result if $? != 0
+        raise ExtractionFailed, result if $?.exitstatus.nonzero?
       else
         page_list(pages).each do |page|
-          out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
+          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
           cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
-          raise ExtractionFailed, result if $? != 0
+          raise ExtractionFailed, result if $?.exitstatus.nonzero?
         end
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
     private
     # Extract the relevant GraphicsMagick options from the options hash.
     def extract_options(options)
-      @output  = options[:output]  || '.'
+      @output  = options[:output] || '.'
       @pages   = options[:pages]
       @density = options[:density] || DEFAULT_DENSITY
       @formats = [options[:format] || DEFAULT_FORMAT].flatten
@@ -80,24 +77,22 @@ module Docsplit
     # Generate the appropriate quality argument for the image format.
     def quality_arg(format)
       case format.to_s
-      when /jpe?g/ then "-quality 85"
-      when /png/   then "-quality 100"
-      else ""
+      when /jpe?g/ then '-quality 85'
+      when /png/   then '-quality 100'
+      else ''
       end
     end
     # Generate the expanded list of requested page numbers.
     def page_list(pages)
-      pages.split(',').map { |range|
+      pages.split(',').map do |range|
         if range.include?('-')
           range = range.split('-')
-          Range.new(range.first.to_i, range.last.to_i).to_a.map {|n| n.to_i }
+          Range.new(range.first.to_i, range.last.to_i).to_a.map(&:to_i)
         else
           range.to_i
         end
-      }.flatten.uniq.sort
+      end.flatten.uniq.sort
     end
   end
 end

data/lib/docsplit/info_extractor.rb CHANGED

@@ -1,36 +1,34 @@
 module Docsplit
   # Delegates to **pdfinfo** in order to extract information about a PDF file.
   class InfoExtractor
     # Regex matchers for different bits of information.
     MATCHERS = {
-      :author   => /^Author:\s+([^\n]+)/,
-      :date     => /^CreationDate:\s+([^\n]+)/,
-      :creator  => /^Creator:\s+([^\n]+)/,
-      :keywords => /^Keywords:\s+([^\n]+)/,
-      :producer => /^Producer:\s+([^\n]+)/,
-      :subject  => /^Subject:\s+([^\n]+)/,
-      :title    => /^Title:\s+([^\n]+)/,
-      :length   => /^Pages:\s+([^\n]+)/,
-    }
+      author: /^Author:\s+([^\n]+)/,
+      date: /^CreationDate:\s+([^\n]+)/,
+      creator: /^Creator:\s+([^\n]+)/,
+      keywords: /^Keywords:\s+([^\n]+)/,
+      producer: /^Producer:\s+([^\n]+)/,
+      subject: /^Subject:\s+([^\n]+)/,
+      title: /^Title:\s+([^\n]+)/,
+      length: /^Pages:\s+([^\n]+)/
+    }.freeze
     # Pull out a single datum from a pdf.
     def extract(key, pdfs, opts)
       extract_all(pdfs, opts)[key]
     end
-    def extract_all(pdfs, opts)
+    def extract_all(pdfs, _opts)
       pdf = [pdfs].flatten.first
       cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
       result = `#{cmd}`.chomp
-      raise ExtractionFailed, result if $? != 0
+      raise ExtractionFailed, result if $?.exitstatus.nonzero?
       # ruby  1.8 (iconv) and 1.9 (String#encode) :
       if String.method_defined?(:encode)
-        result.encode!('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => "") unless result.valid_encoding?
+        result.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '') unless result.valid_encoding?
       else
         require 'iconv' unless defined?(Iconv)
-        ic = Iconv.new('UTF-8//IGNORE','UTF-8')
+        ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
         result = ic.iconv(result)
       end
       info = {}
@@ -44,7 +42,5 @@ module Docsplit
       end
       info
     end
   end
 end

data/lib/docsplit/page_extractor.rb CHANGED

@@ -1,36 +1,31 @@
 module Docsplit
   # Delegates to **pdftk** in order to create bursted single pages from
   # a PDF document.
   class PageExtractor
     # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
     def extract(pdfs, opts)
       extract_options opts
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
-        page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
-        FileUtils.mkdir_p @output unless File.exists?(@output)
+        page_path = ESCAPE[File.join(@output, pdf_name.to_s)] + '_%d.pdf'
+        FileUtils.mkdir_p @output unless File.exist?(@output)
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
-          "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
-        else
-          "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
+                "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
+              else
+                "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
-        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
-        raise ExtractionFailed, result if $? != 0
+        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
+        raise ExtractionFailed, result if $?.exitstatus.nonzero?
         result
       end
     end
     private
     def extract_options(options)
       @output = options[:output] || '.'
     end
   end
 end

data/lib/docsplit/pdf_extractor.rb CHANGED

@@ -6,22 +6,24 @@ module Docsplit
     @@version_string = nil
     # Provide a set of helper functions to determine the OS.
-    HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os']
+    HOST_OS = (defined?('RbConfig') ? RbConfig : Config)::CONFIG['host_os']
     def windows?
       !!HOST_OS.match(/mswin|windows|cygwin/i)
     end
     def osx?
       !!HOST_OS.match(/darwin/i)
     end
     def linux?
       !!HOST_OS.match(/linux/i)
     end
     # The first line of the help output holds the name and version number
     # of the office software to be used for extraction.
     def version_string
       unless @@version_string
-        null = windows? ? "NUL" : "/dev/null"
+        null = windows? ? 'NUL' : '/dev/null'
         @@version_string = `#{office_executable} -h 2>#{null}`.split("\n").first
         if !!@@version_string.to_s.match(/[0-9]*/)
           @@version_string = `#{office_executable} --version`.split("\n").first
@@ -29,23 +31,25 @@ module Docsplit
       end
       @@version_string
     end
     def libre_office?
       !!version_string.match(/^LibreOffice/)
     end
     def open_office?
       !!version_string.match(/^OpenOffice.org/)
     end
     # A set of default locations to search for office software
     # These have been extracted from JODConverter.  Each listed
-    # path should contain a directory "program" which in turn
+    # path should contain a directory "program" which in turn
     # contains the "soffice" executable.
     # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
     def office_search_paths
       if windows?
-        office_names       = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
-        program_files_path = ENV["CommonProgramFiles"]
-        search_paths       = office_names.map{ |program| File.join(program_files_path, program) }
+        office_names       = ['LibreOffice 3', 'LibreOffice 4', 'OpenOffice.org 3']
+        program_files_path = ENV['CommonProgramFiles']
+        search_paths       = office_names.map { |program| File.join(program_files_path, program) }
       elsif osx?
         search_paths = %w(
           /Applications/LibreOffice.app/Contents
@@ -69,7 +73,7 @@ module Docsplit
       end
       search_paths
     end
     # Identify the path to a working office executable.
     def office_executable
       paths = office_search_paths
@@ -78,45 +82,45 @@ module Docsplit
       # raise an error if that path isn't valid, otherwise, add
       # it to the front of our search paths.
       if ENV['OFFICE_PATH']
-        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
+        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
         paths.unshift(ENV['OFFICE_PATH'])
       end
       # The location of the office executable is OS dependent
-      path_pieces = ["soffice"]
+      path_pieces = ['soffice']
       if windows?
-        path_pieces += [["program", "soffice.bin"]]
+        path_pieces += [['program', 'soffice.bin']]
       elsif osx?
-        path_pieces += [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
+        path_pieces += [%w(MacOS soffice), %w(Contents MacOS soffice)]
       else
-        path_pieces += [["program", "soffice"]]
+        path_pieces += [%w(program soffice)]
       end
       # Search for the first suitable office executable
       # and short-circuit once an executable is found.
       paths.each do |path|
-        if File.exists? path
+        if File.exist? path
           @@executable ||= path unless File.directory? path
           path_pieces.each do |pieces|
             check_path = File.join(path, pieces)
-            @@executable ||= check_path if File.exists? check_path
+            @@executable ||= check_path if File.exist? check_path
           end
         end
         break if @@executable
       end
-      raise OfficeNotFound, "No office software found" unless @@executable
+      raise OfficeNotFound, 'No office software found' unless @@executable
       @@executable
     end
     # Used to specify the office location for JODConverter
     def office_path
       File.dirname(File.dirname(office_executable))
     end
     # Convert documents to PDF.
     def extract(docs, opts)
       out = opts[:output] || '.'
-      FileUtils.mkdir_p out unless File.exists?(out)
+      FileUtils.mkdir_p out unless File.exist?(out)
       [docs].flatten.each do |doc|
         ext = File.extname(doc)
         basename = File.basename(doc, ext)
@@ -127,12 +131,12 @@ module Docsplit
         else
           if libre_office?
             # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
-            ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
+            ENV['SYSUSERCONFIG'] = "file://#{File.expand_path(escaped_out)}"
             options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
             cmd = "#{office_executable} #{options} 2>&1"
             result = `#{cmd}`.chomp
-            raise ExtractionFailed, result if $? != 0
+            raise ExtractionFailed, result if $?.exitstatus.nonzero?
             true
           else # open office presumably, rely on JODConverter to figure it out.
             options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
@@ -142,23 +146,22 @@ module Docsplit
       end
     end
-    CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
+    CLASSPATH     = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'".freeze
-    LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
+    LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties".freeze
+    HEADLESS      = '-Djava.awt.headless=true'.freeze
-    HEADLESS      = "-Djava.awt.headless=true"
     private
-    # Runs a Java command, with quieted logging, and the classpath set properly.
-    def run_jod(command, pdfs, opts, return_output=false)
-      pdfs   = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
+    # Runs a Java command, with quieted logging, and the classpath set properly.
+    def run_jod(command, pdfs, _opts, return_output = false)
+      pdfs   = [pdfs].flatten.map { |pdf| "\"#{pdf}\"" }.join(' ')
       office = osx? ? "-Doffice.home=#{office_path}" : office_path
       cmd    = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
       result = `#{cmd}`.chomp
-      raise ExtractionFailed, result if $? != 0
-      return return_output ? (result.empty? ? nil : result) : true
+      raise ExtractionFailed, result if $?.exitstatus.nonzero?
+      return_output ? (result.empty? ? nil : result) : true
     end
     class OfficeNotFound < StandardError; end

data/lib/docsplit/text_cleaner.rb CHANGED

@@ -1,7 +1,6 @@
 require 'strscan'
 module Docsplit
   # Cleans up OCR'd text by using a series of heuristics to remove garbage
   # words. Algorithms taken from:
   #
@@ -13,7 +12,6 @@ module Docsplit
   #       -- Kulp
   #
   class TextCleaner
     # Cached regexes we plan on using.
     WORD        = /\S+/
     SPACE       = /\s+/
@@ -36,7 +34,7 @@ module Docsplit
     # multibyte-aware version, coercing to ASCII first.
     def clean(text)
       if String.method_defined?(:encode)
-        text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
+        text.encode!('ascii', invalid: :replace, undef: :replace, replace: '?')
       else
         require 'iconv' unless defined?(Iconv)
         text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
@@ -67,33 +65,31 @@ module Docsplit
       # More than 30 bytes in length.
       (w.length > 30) ||
-      # If there are three or more identical characters in a row in the string.
-      (w =~ REPEAT) ||
+        # If there are three or more identical characters in a row in the string.
+        (w =~ REPEAT) ||
-      # More punctuation than alpha numerics.
-      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
+        # More punctuation than alpha numerics.
+        (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
-      # Ignoring the first and last characters in the string, if there are three or
-      # more different punctuation characters in the string.
-      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
+        # Ignoring the first and last characters in the string, if there are three or
+        # more different punctuation characters in the string.
+        (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
-      # Four or more consecutive vowels, or five or more consecutive consonants.
-      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
+        # Four or more consecutive vowels, or five or more consecutive consonants.
+        ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
-      # Number of uppercase letters greater than lowercase letters, but the word is
-      # not all uppercase + punctuation.
-      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
+        # Number of uppercase letters greater than lowercase letters, but the word is
+        # not all uppercase + punctuation.
+        (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
-      # Single letters that are not A or I.
-      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
+        # Single letters that are not A or I.
+        (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
-      # All characters are alphabetic and there are 8 times more vowels than
-      # consonants, or 8 times more consonants than vowels.
-      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
-        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
-          (cons > vows * 8)))
+        # All characters are alphabetic and there are 8 times more vowels than
+        # consonants, or 8 times more consonants than vowels.
+        (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
+          (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
+            (cons > vows * 8)))
     end
   end
 end

data/lib/docsplit/text_extractor.rb CHANGED

@@ -1,5 +1,4 @@
 module Docsplit
   # Delegates to **pdftotext** and **tesseract** in order to extract text from
   # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
   # forbid OCR extraction, but by default the heuristic works like this:
@@ -13,11 +12,10 @@ module Docsplit
   #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
   #
   class TextExtractor
     NO_TEXT_DETECTED = /---------\n\Z/
-    OCR_FLAGS   = '-density 400x400 -colorspace GRAY'
-    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
+    OCR_FLAGS   = '-density 400x400 -colorspace GRAY'.freeze
+    MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'.freeze
     MIN_TEXT_PER_PAGE = 100 # in bytes
@@ -28,10 +26,10 @@ module Docsplit
     # Extract text from a list of PDFs.
     def extract(pdfs, opts)
       extract_options opts
-      FileUtils.mkdir_p @output unless File.exists?(@output)
+      FileUtils.mkdir_p @output unless File.exist?(@output)
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
-        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
+        pages = @pages == 'all' ? 1..Docsplit.extract_length(pdf) : @pages
         if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
           extract_from_ocr(pdf, pages)
         else
@@ -52,7 +50,7 @@ module Docsplit
     # Extract a page range worth of text from a PDF, directly.
     def extract_from_pdf(pdf, pages)
       return extract_full(pdf) unless pages
-      pages.each {|page| extract_page(pdf, page) }
+      pages.each { |page| extract_page(pdf, page) }
     end
     # Extract a page range worth of text from a PDF via OCR.
@@ -60,7 +58,7 @@ module Docsplit
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
-      psm = @detect_orientation ? "-psm 1" : ""
+      psm = @detect_orientation ? '-psm 1' : ''
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
@@ -75,15 +73,14 @@ module Docsplit
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
+        # if the user says don't do orientation detection or the plugin is not installed, set psm to 0
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
     private
     def clean_text(file)
@@ -98,7 +95,7 @@ module Docsplit
     # Run an external process and raise an exception if it fails.
     def run(command)
       result = `#{command}`
-      raise ExtractionFailed, result if $? != 0
+      raise ExtractionFailed, result if $?.exitstatus.nonzero?
       result
     end
@@ -124,10 +121,8 @@ module Docsplit
       @force_ocr          = options[:ocr] == true
       @forbid_ocr         = options[:ocr] == false
       @language           = options[:language] || 'eng'
-      @clean_ocr          = (!(options[:clean] == false) and @language == 'eng')
-      @detect_orientation = ((options[:detect_orientation] != false) and DEPENDENCIES[:osd])
+      @clean_ocr          = (!(options[:clean] == false) && @language == 'eng')
+      @detect_orientation = ((options[:detect_orientation] != false) && DEPENDENCIES[:osd])
     end
   end
 end

data/lib/docsplit/transparent_pdfs.rb CHANGED

@@ -1,9 +1,7 @@
 module Docsplit
   # Include a method to transparently convert non-PDF arguments to temporary
   # PDFs. Allows us to pretend to natively support docs, rtf, ppt, and so on.
   module TransparentPDFs
     # Temporarily convert any non-PDF documents to PDFs before running them
     # through further extraction.
     def ensure_pdfs(docs)
@@ -12,18 +10,16 @@ module Docsplit
           doc
         else
           tempdir = File.join(Dir.tmpdir, 'docsplit')
-          extract_pdf([doc], {:output => tempdir})
+          extract_pdf([doc], output: tempdir)
           File.join(tempdir, File.basename(doc, File.extname(doc)) + '.pdf')
         end
       end
     end
     def is_pdf?(doc)
-      File.extname(doc).downcase == '.pdf' || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
+      File.extname(doc).casecmp('.pdf').zero? || File.open(doc, 'rb', &:readline) =~ /\A\%PDF-\d+(\.\d+)?/
     end
   end
   extend TransparentPDFs
 end

data/lib/docsplit/version.rb ADDED

@@ -0,0 +1,3 @@
+module Docsplit
+  VERSION = '0.7.9'.freeze
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: burisu-docsplit
 version: !ruby/object:Gem::Version
-  version: 0.7.8
+  version: 0.7.9
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-06-26 00:00:00.000000000 Z
+date: 2016-09-06 00:00:00.000000000 Z
 dependencies: []
 description: |2
       Docsplit is a command-line utility and Ruby library for splitting apart
@@ -36,6 +36,7 @@ files:
 - lib/docsplit/text_cleaner.rb
 - lib/docsplit/text_extractor.rb
 - lib/docsplit/transparent_pdfs.rb
+- lib/docsplit/version.rb
 - vendor/conf/document-formats.js
 - vendor/jodconverter/commons-cli-1.1.jar
 - vendor/jodconverter/commons-io-1.4.jar
@@ -66,9 +67,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.5
+rubygems_version: 2.4.5.1
 signing_key:
 specification_version: 4
 summary: Break Apart Documents into Images, Text, Pages and PDFs
 test_files: []
-has_rdoc: