docsplit 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +1 -1
- data/lib/docsplit/text_cleaner.rb +3 -3
- data/lib/docsplit/text_extractor.rb +1 -1
- metadata +6 -8
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.5.0' # Keep version in sync with docsplit.rb
|
4
|
-
s.date = '2010-
|
3
|
+
s.version = '0.5.1' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-04-26'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -24,12 +24,12 @@ module Docsplit
|
|
24
24
|
REPEAT = /([^0-9])\1{2,}/
|
25
25
|
UPPER = /[A-Z]/
|
26
26
|
LOWER = /[a-z]/
|
27
|
-
ACRONYM = /^\(?[A-Z0-9
|
27
|
+
ACRONYM = /^\(?[A-Z0-9\.-]+('?s)?\)?[.,:]?$/
|
28
28
|
ALL_ALPHA = /^[a-z]+$/i
|
29
29
|
CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i
|
30
30
|
VOWEL = /([aeiou]|y$)/i
|
31
31
|
CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
|
32
|
-
|
32
|
+
VOWEL_5 = /[aeiou]{5}/i
|
33
33
|
REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
|
34
34
|
SINGLETONS = /^[AaIi]$/
|
35
35
|
|
@@ -73,7 +73,7 @@ module Docsplit
|
|
73
73
|
(w[1...-1].scan(PUNCT).uniq.length >= 3) ||
|
74
74
|
|
75
75
|
# Five or more consecutive vowels, or five or more consecutive consonants.
|
76
|
-
((w =~
|
76
|
+
((w =~ VOWEL_5) || (w =~ CONSONANT_5)) ||
|
77
77
|
|
78
78
|
# Number of uppercase letters greater than lowercase letters, but the word is
|
79
79
|
# not all uppercase + punctuation.
|
@@ -64,7 +64,7 @@ module Docsplit
|
|
64
64
|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
65
65
|
file = "#{base_path}_#{page}"
|
66
66
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
|
67
|
-
run "tesseract #{tiff} #{file} 2>&1"
|
67
|
+
run "tesseract #{tiff} #{file} -l eng 2>&1"
|
68
68
|
clean_text(file + '.txt') if @clean_ocr
|
69
69
|
FileUtils.remove_entry_secure tiff
|
70
70
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 9
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 5
|
9
|
-
- 0
|
10
|
-
version: 0.5.0
|
9
|
+
- 1
|
10
|
+
version: 0.5.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jeremy Ashkenas
|
@@ -16,8 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2010-
|
20
|
-
default_executable:
|
19
|
+
date: 2010-04-26 00:00:00 Z
|
21
20
|
dependencies: []
|
22
21
|
|
23
22
|
description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
|
@@ -51,7 +50,6 @@ files:
|
|
51
50
|
- docsplit.gemspec
|
52
51
|
- LICENSE
|
53
52
|
- README
|
54
|
-
has_rdoc: false
|
55
53
|
homepage: http://documentcloud.github.com/docsplit/
|
56
54
|
licenses: []
|
57
55
|
|
@@ -81,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
81
79
|
requirements: []
|
82
80
|
|
83
81
|
rubyforge_project: docsplit
|
84
|
-
rubygems_version: 1.
|
82
|
+
rubygems_version: 1.7.2
|
85
83
|
signing_key:
|
86
84
|
specification_version: 3
|
87
85
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|