docsplit 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +7 -1
- data/lib/docsplit/command_line.rb +4 -1
- data/lib/docsplit/text_cleaner.rb +94 -0
- data/lib/docsplit/text_extractor.rb +14 -1
- metadata +6 -5
data/docsplit.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'docsplit'
|
3
|
-
s.version = '0.4.1'
|
4
|
-
s.date = '2010-
|
3
|
+
s.version = '0.5.0' # Keep version in sync with docsplit.rb
|
4
|
+
s.date = '2010-10-18'
|
5
5
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
2
2
|
module Docsplit
|
3
3
|
|
4
|
-
VERSION = '0.4.1'
|
4
|
+
VERSION = '0.5.0' # Keep in sync with gemspec.
|
5
5
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
7
7
|
|
@@ -72,6 +72,11 @@ module Docsplit
|
|
72
72
|
EOS
|
73
73
|
end
|
74
74
|
|
75
|
+
# Utility method to clean OCR'd text with garbage characters.
|
76
|
+
# Hands `text` to a freshly built TextCleaner and returns the result of
# TextCleaner#clean.
def self.clean_text(text)
  cleaner = TextCleaner.new
  cleaner.clean(text)
end
|
79
|
+
|
75
80
|
|
76
81
|
private
|
77
82
|
|
@@ -103,3 +108,4 @@ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
|
103
108
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
104
109
|
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
105
110
|
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
|
111
|
+
require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
|
@@ -71,7 +71,7 @@ Options:
|
|
71
71
|
# Use the OptionParser library to parse out all supported options. Return
|
72
72
|
# options formatted for the Ruby API.
|
73
73
|
def parse_options
|
74
|
-
@options = {:ocr => :default}
|
74
|
+
@options = {:ocr => :default, :clean => true}
|
75
75
|
@option_parser = OptionParser.new do |opts|
|
76
76
|
opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
|
77
77
|
@options[:output] = d
|
@@ -88,6 +88,9 @@ Options:
|
|
88
88
|
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
|
89
89
|
@options[:ocr] = o
|
90
90
|
end
|
91
|
+
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
|
92
|
+
@options[:clean] = false
|
93
|
+
end
|
91
94
|
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
92
95
|
@options[:rolling] = true
|
93
96
|
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
begin
  require 'iconv'
rescue LoadError
  # Iconv is no longer part of the standard library on Ruby 2.0 and later.
  # When it cannot be loaded, TextCleaner#clean coerces text to ASCII with
  # String#encode instead.
end
require 'strscan'

module Docsplit

  # Cleans up OCR'd text by using a series of heuristics to remove garbage
  # words. Algorithms taken from:
  #
  #   Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
  #     -- Taghva, Nartker, Condit, and Borsack
  #
  #   Improving Search and Retrieval Performance through Shortening Documents,
  #   Detecting Garbage, and Throwing out Jargon
  #     -- Kulp
  #
  class TextCleaner

    # Cached regexes we plan on using.
    WORD        = /\S+/
    SPACE       = /\s+/
    NEWLINE     = /[\r\n]/
    ALNUM       = /[a-z0-9]/i
    PUNCT       = /[[:punct:]]/i
    REPEAT      = /([^0-9])\1{2,}/
    UPPER       = /[A-Z]/
    LOWER       = /[a-z]/
    ACRONYM     = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
    ALL_ALPHA   = /^[a-z]+$/i
    CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
    VOWEL       = /([aeiou]|y$)/i
    CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
    VOWEL_4     = /[aeiou]{4}/i
    REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
    SINGLETONS  = /^[AaIi]$/

    # Remove garbage words from `text` and return the cleaned string.
    #
    # The text is coerced to ASCII first, because the scan uses the regular
    # (not multibyte-aware) StringScanner. Whitespace runs are copied through,
    # except that a run directly following another whitespace run (i.e. the
    # word between them was dropped) is only kept when it contains a newline.
    # Finally, long runs of very short tokens (REPEATED) are stripped.
    def clean(text)
      scanner = StringScanner.new(to_ascii(text))
      cleaned = []
      spaced  = false
      # SPACE and WORD between them match every character, so each pass
      # consumes input until the scanner reaches the end of the string.
      until scanner.eos?
        if space = scanner.scan(SPACE)
          cleaned.push(space) unless spaced && (space !~ NEWLINE)
          spaced = true
        elsif word = scanner.scan(WORD)
          unless garbage(word)
            cleaned.push(word)
            spaced = false
          end
        end
      end
      cleaned.join('').gsub(REPEATED, '')
    end

    # Is a given word OCR garbage? Returns a truthy value when any of the
    # heuristics below fires, and a falsy value otherwise.
    def garbage(w)
      acronym = w =~ ACRONYM

      # The word without its first and last characters. Slicing an empty
      # string yields nil, so fall back to '' to keep the check below safe.
      inner = w[1...-1] || ''

      # More than 30 bytes in length.
      (w.length > 30) ||

      # If there are three or more identical characters in a row in the string.
      (w =~ REPEAT) ||

      # More punctuation than alpha numerics.
      (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

      # Ignoring the first and last characters in the string, if there are three or
      # more different punctuation characters in the string.
      (inner.scan(PUNCT).uniq.length >= 3) ||

      # Four or more consecutive vowels, or five or more consecutive consonants.
      ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||

      # Number of uppercase letters greater than lowercase letters, but the word is
      # not all uppercase + punctuation.
      (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

      # Single letters that are not A or I.
      (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

      # All characters are alphabetic and there are 8 times more vowels than
      # consonants, or 8 times more consonants than vowels.
      (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
        (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
          (cons > vows * 8)))
    end

    private

    # Coerce UTF-8 `text` to plain ASCII. Iconv is used whenever it is loaded,
    # which keeps the original transliteration behaviour. Otherwise
    # String#encode is used, and invalid or unmappable characters become '?'.
    def to_ascii(text)
      if defined?(Iconv)
        Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
      else
        text.encode('ascii', 'utf-8', :invalid => :replace, :undef => :replace, :replace => '?')
      end
    end

  end

end
|
@@ -62,14 +62,17 @@ module Docsplit
|
|
62
62
|
if pages
|
63
63
|
pages.each do |page|
|
64
64
|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
65
|
+
file = "#{base_path}_#{page}"
|
65
66
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
|
66
|
-
run "tesseract #{tiff} #{
|
67
|
+
run "tesseract #{tiff} #{file} 2>&1"
|
68
|
+
clean_text(file + '.txt') if @clean_ocr
|
67
69
|
FileUtils.remove_entry_secure tiff
|
68
70
|
end
|
69
71
|
else
|
70
72
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
71
73
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
72
74
|
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
|
75
|
+
clean_text(base_path + '.txt') if @clean_ocr
|
73
76
|
end
|
74
77
|
ensure
|
75
78
|
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
@@ -78,6 +81,15 @@ module Docsplit
|
|
78
81
|
|
79
82
|
private
|
80
83
|
|
84
|
+
# Clean the OCR'd text stored at the path `file`, rewriting the file in place.
#
# The cleaned text is computed before the file is truncated, so that if
# Docsplit.clean_text raises, the exception propagates with the original
# contents still on disk.
def clean_text(file)
  File.open(file, 'r+') do |f|
    cleaned = Docsplit.clean_text(f.read)
    # Reading left the position at end-of-file; go back to the start and
    # drop the old contents before writing the replacement.
    f.rewind
    f.truncate(0)
    f.write(cleaned)
  end
end
|
92
|
+
|
81
93
|
# Run an external process and raise an exception if it fails.
|
82
94
|
def run(command)
|
83
95
|
result = `#{command}`
|
@@ -106,6 +118,7 @@ module Docsplit
|
|
106
118
|
@pages = options[:pages]
|
107
119
|
@force_ocr = options[:ocr] == true
|
108
120
|
@forbid_ocr = options[:ocr] == false
|
121
|
+
@clean_ocr = !(options[:clean] == false)
|
109
122
|
end
|
110
123
|
|
111
124
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 11
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 4
|
9
|
-
- 1
|
10
|
-
version: 0.4.1
|
8
|
+
- 5
|
9
|
+
- 0
|
10
|
+
version: 0.5.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jeremy Ashkenas
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2010-
|
19
|
+
date: 2010-10-18 00:00:00 -04:00
|
20
20
|
default_executable:
|
21
21
|
dependencies: []
|
22
22
|
|
@@ -33,6 +33,7 @@ files:
|
|
33
33
|
- lib/docsplit/image_extractor.rb
|
34
34
|
- lib/docsplit/info_extractor.rb
|
35
35
|
- lib/docsplit/page_extractor.rb
|
36
|
+
- lib/docsplit/text_cleaner.rb
|
36
37
|
- lib/docsplit/text_extractor.rb
|
37
38
|
- lib/docsplit/transparent_pdfs.rb
|
38
39
|
- lib/docsplit.rb
|