docsplit 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.5.0' # Keep version in sync with docsplit.rb
4
- s.date = '2010-10-18'
3
+ s.version = '0.5.1' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-04-26'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.5.0' # Keep in sync with gemspec.
4
+ VERSION = '0.5.1' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -24,12 +24,12 @@ module Docsplit
24
24
  REPEAT = /([^0-9])\1{2,}/
25
25
  UPPER = /[A-Z]/
26
26
  LOWER = /[a-z]/
27
- ACRONYM = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
27
+ ACRONYM = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
28
28
  ALL_ALPHA = /^[a-z]+$/i
29
29
  CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i
30
30
  VOWEL = /([aeiou]|y$)/i
31
31
  CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
32
- VOWEL_4 = /[aeiou]{4}/i
32
+ VOWEL_5 = /[aeiou]{5}/i
33
33
  REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
34
34
  SINGLETONS = /^[AaIi]$/
35
35
 
@@ -73,7 +73,7 @@ module Docsplit
73
73
  (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
74
74
 
75
75
  # Five or more consecutive vowels, or five or more consecutive consonants.
76
- ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||
76
+ ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
77
77
 
78
78
  # Number of uppercase letters greater than lowercase letters, but the word is
79
79
  # not all uppercase + punctuation.
@@ -64,7 +64,7 @@ module Docsplit
64
64
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
65
65
  file = "#{base_path}_#{page}"
66
66
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
67
- run "tesseract #{tiff} #{file} 2>&1"
67
+ run "tesseract #{tiff} #{file} -l eng 2>&1"
68
68
  clean_text(file + '.txt') if @clean_ocr
69
69
  FileUtils.remove_entry_secure tiff
70
70
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
5
- prerelease: false
4
+ hash: 9
5
+ prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 5
9
- - 0
10
- version: 0.5.0
9
+ - 1
10
+ version: 0.5.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,8 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-10-18 00:00:00 -04:00
20
- default_executable:
19
+ date: 2010-04-26 00:00:00 Z
21
20
  dependencies: []
22
21
 
23
22
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
@@ -51,7 +50,6 @@ files:
51
50
  - docsplit.gemspec
52
51
  - LICENSE
53
52
  - README
54
- has_rdoc: false
55
53
  homepage: http://documentcloud.github.com/docsplit/
56
54
  licenses: []
57
55
 
@@ -81,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
81
79
  requirements: []
82
80
 
83
81
  rubyforge_project: docsplit
84
- rubygems_version: 1.3.7
82
+ rubygems_version: 1.7.2
85
83
  signing_key:
86
84
  specification_version: 3
87
85
  summary: Break Apart Documents into Images, Text, Pages and PDFs