docsplit 0.5.0 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.5.0' # Keep version in sync with docsplit.rb
4
- s.date = '2010-10-18'
3
+ s.version = '0.5.1' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-04-26'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.5.0' # Keep in sync with gemspec.
4
+ VERSION = '0.5.1' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -24,12 +24,12 @@ module Docsplit
24
24
  REPEAT = /([^0-9])\1{2,}/
25
25
  UPPER = /[A-Z]/
26
26
  LOWER = /[a-z]/
27
- ACRONYM = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
27
+ ACRONYM = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
28
28
  ALL_ALPHA = /^[a-z]+$/i
29
29
  CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i
30
30
  VOWEL = /([aeiou]|y$)/i
31
31
  CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
32
- VOWEL_4 = /[aeiou]{4}/i
32
+ VOWEL_5 = /[aeiou]{5}/i
33
33
  REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
34
34
  SINGLETONS = /^[AaIi]$/
35
35
 
@@ -73,7 +73,7 @@ module Docsplit
73
73
  (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
74
74
 
75
75
  # Five or more consecutive vowels, or five or more consecutive consonants.
76
- ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||
76
+ ((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
77
77
 
78
78
  # Number of uppercase letters greater than lowercase letters, but the word is
79
79
  # not all uppercase + punctuation.
@@ -64,7 +64,7 @@ module Docsplit
64
64
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
65
65
  file = "#{base_path}_#{page}"
66
66
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
67
- run "tesseract #{tiff} #{file} 2>&1"
67
+ run "tesseract #{tiff} #{file} -l eng 2>&1"
68
68
  clean_text(file + '.txt') if @clean_ocr
69
69
  FileUtils.remove_entry_secure tiff
70
70
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
5
- prerelease: false
4
+ hash: 9
5
+ prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 5
9
- - 0
10
- version: 0.5.0
9
+ - 1
10
+ version: 0.5.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,8 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-10-18 00:00:00 -04:00
20
- default_executable:
19
+ date: 2010-04-26 00:00:00 Z
21
20
  dependencies: []
22
21
 
23
22
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
@@ -51,7 +50,6 @@ files:
51
50
  - docsplit.gemspec
52
51
  - LICENSE
53
52
  - README
54
- has_rdoc: false
55
53
  homepage: http://documentcloud.github.com/docsplit/
56
54
  licenses: []
57
55
 
@@ -81,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
81
79
  requirements: []
82
80
 
83
81
  rubyforge_project: docsplit
84
- rubygems_version: 1.3.7
82
+ rubygems_version: 1.7.2
85
83
  signing_key:
86
84
  specification_version: 3
87
85
  summary: Break Apart Documents into Images, Text, Pages and PDFs