RubyGems - docsplit - Versions diffs - 0.4.1 → 0.5.0 - Mend

docsplit 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

data/docsplit.gemspec +2 -2
data/lib/docsplit.rb +7 -1
data/lib/docsplit/command_line.rb +4 -1
data/lib/docsplit/text_cleaner.rb +94 -0
data/lib/docsplit/text_extractor.rb +14 -1
metadata +6 -5

data/docsplit.gemspec CHANGED

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.4.1'         # Keep version in sync with docsplit.rb
-  s.date      = '2010-8-23'
+  s.version   = '0.5.0'         # Keep version in sync with docsplit.rb
+  s.date      = '2010-10-18'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"

data/lib/docsplit.rb CHANGED

@@ -1,7 +1,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.4.1' # Keep in sync with gemspec.
+  VERSION       = '0.5.0' # Keep in sync with gemspec.
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')
@@ -72,6 +72,11 @@ module Docsplit
     EOS
   end
+  # Utility method to clean OCR'd text with garbage characters.
+  def self.clean_text(text)
+    TextCleaner.new.clean(text)
+  end
   private
@@ -103,3 +108,4 @@ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"

data/lib/docsplit/command_line.rb CHANGED

@@ -71,7 +71,7 @@ Options:
     # Use the OptionParser library to parse out all supported options. Return
     # options formatted for the Ruby API.
     def parse_options
-      @options = {:ocr => :default}
+      @options = {:ocr => :default, :clean => true}
       @option_parser = OptionParser.new do |opts|
         opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
           @options[:output] = d
@@ -88,6 +88,9 @@ Options:
         opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
           @options[:ocr] = o
         end
+        opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
+          @options[:clean] = false
+        end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true
         end

data/lib/docsplit/text_cleaner.rb ADDED

@@ -0,0 +1,94 @@
+require 'iconv'
+require 'strscan'
+module Docsplit
+  # Cleans up OCR'd text by using a series of heuristics to remove garbage
+  # words. Algorithms taken from:
+  #
+  #     Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
+  #       -- Taghva, Nartker, Condit, and Borsack
+  #
+  #     Improving Search and Retrieval Performance through Shortening Documents,
+  #     Detecting Garbage, and Throwing out Jargon
+  #       -- Kulp
+  #
+  class TextCleaner
+    # Cached regexes we plan on using.
+    WORD        = /\S+/
+    SPACE       = /\s+/
+    NEWLINE     = /[\r\n]/
+    ALNUM       = /[a-z0-9]/i
+    PUNCT       = /[[:punct:]]/i
+    REPEAT      = /([^0-9])\1{2,}/
+    UPPER       = /[A-Z]/
+    LOWER       = /[a-z]/
+    ACRONYM     = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
+    ALL_ALPHA   = /^[a-z]+$/i
+    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
+    VOWEL       = /([aeiou]|y$)/i
+    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
+    VOWEL_4     = /[aeiou]{4}/i
+    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
+    SINGLETONS  = /^[AaIi]$/
+    # For the time being, `clean` uses the regular StringScanner, and not the
+    # multibyte-aware version, coercing to ASCII first.
+    def clean(text)
+      text    = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
+      scanner = StringScanner.new(text)
+      cleaned = []
+      spaced  = false
+      loop do
+        if space = scanner.scan(SPACE)
+          cleaned.push(space) unless spaced && (space !~ NEWLINE)
+          spaced = true
+        elsif word = scanner.scan(WORD)
+          unless garbage(word)
+            cleaned.push(word)
+            spaced = false
+          end
+        elsif scanner.eos?
+          return cleaned.join('').gsub(REPEATED, '')
+        end
+      end
+    end
+    # Is a given word OCR garbage?
+    def garbage(w)
+      acronym = w =~ ACRONYM
+      # More than 30 bytes in length.
+      (w.length > 30) ||
+      # If there are three or more identical characters in a row in the string.
+      (w =~ REPEAT) ||
+      # More punctuation than alpha numerics.
+      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
+      # Ignoring the first and last characters in the string, if there are three or
+      # more different punctuation characters in the string.
+      (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
+      # Four or more consecutive vowels, or five or more consecutive consonants.
+      ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||
+      # Number of uppercase letters greater than lowercase letters, but the word is
+      # not all uppercase + punctuation.
+      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
+      # Single letters that are not A or I.
+      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
+      # All characters are alphabetic and there are 8 times more vowels than
+      # consonants, or 8 times more consonants than vowels.
+      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
+        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
+          (cons > vows * 8)))
+    end
+  end
+end

data/lib/docsplit/text_extractor.rb CHANGED

@@ -62,14 +62,17 @@ module Docsplit
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+          file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
-          run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
+          run "tesseract #{tiff} #{file} 2>&1"
+          clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
         run "tesseract #{tiff} #{base_path} -l eng 2>&1"
+        clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
@@ -78,6 +81,15 @@ module Docsplit
     private
+    def clean_text(file)
+      File.open(file, 'r+') do |f|
+        text = f.read
+        f.truncate(0)
+        f.rewind
+        f.write(Docsplit.clean_text(text))
+      end
+    end
     # Run an external process and raise an exception if it fails.
     def run(command)
       result = `#{command}`
@@ -106,6 +118,7 @@ module Docsplit
       @pages      = options[:pages]
       @force_ocr  = options[:ocr] == true
       @forbid_ocr = options[:ocr] == false
+      @clean_ocr  = !(options[:clean] == false)
     end
   end

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
-  hash: 13
+  hash: 11
   prerelease: false
   segments:
   - 0
-  - 4
-  - 1
-  version: 0.4.1
+  - 5
+  - 0
+  version: 0.5.0
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-08-23 00:00:00 -04:00
+date: 2010-10-18 00:00:00 -04:00
 default_executable:
 dependencies: []
@@ -33,6 +33,7 @@ files:
 - lib/docsplit/image_extractor.rb
 - lib/docsplit/info_extractor.rb
 - lib/docsplit/page_extractor.rb
+- lib/docsplit/text_cleaner.rb
 - lib/docsplit/text_extractor.rb
 - lib/docsplit/transparent_pdfs.rb
 - lib/docsplit.rb