RubyGems - docsplit - Versions diffs - 0.5.0 → 0.5.1 - Mend

docsplit 0.5.0 → 0.5.1

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (5)

data/docsplit.gemspec +2 -2
data/lib/docsplit.rb +1 -1
data/lib/docsplit/text_cleaner.rb +3 -3
data/lib/docsplit/text_extractor.rb +1 -1
metadata +6 -8

data/docsplit.gemspec CHANGED Viewed

@@ -1,7 +1,7 @@
 Gem::Specification.new do |s|
   s.name      = 'docsplit'
-  s.version   = '0.5.0'         # Keep version in sync with docsplit.rb
-  s.date      = '2010-10-18'
+  s.version   = '0.5.1'         # Keep version in sync with docsplit.rb
+  s.date      = '2010-04-26'
   s.homepage    = "http://documentcloud.github.com/docsplit/"
   s.summary     = "Break Apart Documents into Images, Text, Pages and PDFs"

data/lib/docsplit.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # The Docsplit module delegates to the Java PDF extractors.
 module Docsplit
-  VERSION       = '0.5.0' # Keep in sync with gemspec.
+  VERSION       = '0.5.1' # Keep in sync with gemspec.
   ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')

data/lib/docsplit/text_cleaner.rb CHANGED Viewed

@@ -24,12 +24,12 @@ module Docsplit
     REPEAT      = /([^0-9])\1{2,}/
     UPPER       = /[A-Z]/
     LOWER       = /[a-z]/
-    ACRONYM     = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
+    ACRONYM     = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
     ALL_ALPHA   = /^[a-z]+$/i
     CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
     VOWEL       = /([aeiou]|y$)/i
     CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
-    VOWEL_4     = /[aeiou]{4}/i
+    VOWEL_5     = /[aeiou]{5}/i
     REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
     SINGLETONS  = /^[AaIi]$/
@@ -73,7 +73,7 @@ module Docsplit
       (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
       # Four or more consecutive vowels, or five or more consecutive consonants.
-      ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||
+      ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
       # Number of uppercase letters greater than lowercase letters, but the word is
       # not all uppercase + punctuation.

data/lib/docsplit/text_extractor.rb CHANGED Viewed

@@ -64,7 +64,7 @@ module Docsplit
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
-          run "tesseract #{tiff} #{file} 2>&1"
+          run "tesseract #{tiff} #{file} -l eng 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: docsplit
 version: !ruby/object:Gem::Version
-  hash: 11
-  prerelease: false
+  hash: 9
+  prerelease:
   segments:
   - 0
   - 5
-  - 0
-  version: 0.5.0
+  - 1
+  version: 0.5.1
 platform: ruby
 authors:
 - Jeremy Ashkenas
@@ -16,8 +16,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-10-18 00:00:00 -04:00
-default_executable:
+date: 2010-04-26 00:00:00 Z
 dependencies: []
 description: "    Docsplit is a command-line utility and Ruby library for splitting apart\n    documents into their component parts: searchable UTF-8 plain text, page\n    images or thumbnails in any format, PDFs, single pages, and document\n    metadata (title, author, number of pages...)\n"
@@ -51,7 +50,6 @@ files:
 - docsplit.gemspec
 - LICENSE
 - README
-has_rdoc: false
 homepage: http://documentcloud.github.com/docsplit/
 licenses: []
@@ -81,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: docsplit
-rubygems_version: 1.3.7
+rubygems_version: 1.7.2
 signing_key:
 specification_version: 3
 summary: Break Apart Documents into Images, Text, Pages and PDFs