docsplit 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/docsplit.gemspec +2 -2
- data/lib/docsplit.rb +7 -1
- data/lib/docsplit/command_line.rb +4 -1
- data/lib/docsplit/text_cleaner.rb +94 -0
- data/lib/docsplit/text_extractor.rb +14 -1
- metadata +6 -5
data/docsplit.gemspec
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Gem::Specification.new do |s|
|
|
2
2
|
s.name = 'docsplit'
|
|
3
|
-
s.version = '0.4.1'
|
|
4
|
-
s.date = '2010-
|
|
3
|
+
s.version = '0.5.0' # Keep version in sync with docsplit.rb
|
|
4
|
+
s.date = '2010-10-18'
|
|
5
5
|
|
|
6
6
|
s.homepage = "http://documentcloud.github.com/docsplit/"
|
|
7
7
|
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
|
data/lib/docsplit.rb
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# The Docsplit module delegates to the Java PDF extractors.
|
|
2
2
|
module Docsplit
|
|
3
3
|
|
|
4
|
-
VERSION = '0.4.1'
|
|
4
|
+
VERSION = '0.5.0' # Keep in sync with gemspec.
|
|
5
5
|
|
|
6
6
|
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
|
|
7
7
|
|
|
@@ -72,6 +72,11 @@ module Docsplit
|
|
|
72
72
|
EOS
|
|
73
73
|
end
|
|
74
74
|
|
|
75
|
+
# Utility method to clean OCR'd text with garbage characters.
|
|
76
|
+
def self.clean_text(text)
  # Hand the text to a new TextCleaner instance and return whatever it produces.
  cleaner = TextCleaner.new
  cleaner.clean(text)
end
|
|
79
|
+
|
|
75
80
|
|
|
76
81
|
private
|
|
77
82
|
|
|
@@ -103,3 +108,4 @@ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
|
|
103
108
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
|
104
109
|
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
|
105
110
|
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
|
|
111
|
+
require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
|
|
@@ -71,7 +71,7 @@ Options:
|
|
|
71
71
|
# Use the OptionParser library to parse out all supported options. Return
|
|
72
72
|
# options formatted for the Ruby API.
|
|
73
73
|
def parse_options
|
|
74
|
-
@options = {:ocr => :default}
|
|
74
|
+
@options = {:ocr => :default, :clean => true}
|
|
75
75
|
@option_parser = OptionParser.new do |opts|
|
|
76
76
|
opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
|
|
77
77
|
@options[:output] = d
|
|
@@ -88,6 +88,9 @@ Options:
|
|
|
88
88
|
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
|
|
89
89
|
@options[:ocr] = o
|
|
90
90
|
end
|
|
91
|
+
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
|
|
92
|
+
@options[:clean] = false
|
|
93
|
+
end
|
|
91
94
|
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
|
|
92
95
|
@options[:rolling] = true
|
|
93
96
|
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
require 'iconv'
|
|
2
|
+
require 'strscan'
|
|
3
|
+
|
|
4
|
+
module Docsplit
|
|
5
|
+
|
|
6
|
+
# Cleans up OCR'd text by using a series of heuristics to remove garbage
|
|
7
|
+
# words. Algorithms taken from:
|
|
8
|
+
#
|
|
9
|
+
# Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
|
|
10
|
+
# -- Taghva, Nartker, Condit, and Borsack
|
|
11
|
+
#
|
|
12
|
+
# Improving Search and Retrieval Performance through Shortening Documents,
|
|
13
|
+
# Detecting Garbage, and Throwing out Jargon
|
|
14
|
+
# -- Kulp
|
|
15
|
+
#
|
|
16
|
+
class TextCleaner

  # Cached regexes we plan on using.
  WORD        = /\S+/
  SPACE       = /\s+/
  NEWLINE     = /[\r\n]/
  ALNUM       = /[a-z0-9]/i
  PUNCT       = /[[:punct:]]/i
  REPEAT      = /([^0-9])\1{2,}/
  UPPER       = /[A-Z]/
  LOWER       = /[a-z]/
  ACRONYM     = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
  ALL_ALPHA   = /^[a-z]+$/i
  CONSONANT   = /(^y|[bcdfghjklmnpqrstvwxz])/i
  VOWEL       = /([aeiou]|y$)/i
  CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
  VOWEL_4     = /[aeiou]{4}/i
  REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
  SINGLETONS  = /^[AaIi]$/

  # Remove garbage words from a string of OCR'd text and return the result.
  #
  # For the time being, `clean` uses the regular StringScanner, and not the
  # multibyte-aware version, coercing to ASCII first. The text is then walked
  # as alternating runs of whitespace and non-whitespace "words"; words that
  # `garbage` flags are dropped, and finally any stretch matching REPEATED
  # (a long run of very short tokens) is stripped from the joined output.
  #
  # NOTE(review): Iconv is not bundled with Ruby 2.0 and later -- confirm the
  # iconv library is available on the runtime this is deployed to.
  def clean(text)
    text    = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
    scanner = StringScanner.new(text)
    cleaned = []
    spaced  = false # true while the last thing emitted was whitespace
    loop do
      if space = scanner.scan(SPACE)
        # Collapse back-to-back whitespace runs (left behind when a garbage
        # word is dropped) unless the new run carries a line break.
        cleaned.push(space) unless spaced && (space !~ NEWLINE)
        spaced = true
      elsif word = scanner.scan(WORD)
        unless garbage(word)
          cleaned.push(word)
          spaced = false
        end
      elsif scanner.eos?
        return cleaned.join('').gsub(REPEATED, '')
      end
    end
  end

  # Is a given word OCR garbage?
  #
  # Returns a truthy value (not necessarily `true`) when any heuristic below
  # fires, and a falsy value otherwise. An empty string is never garbage.
  def garbage(w)
    acronym = w =~ ACRONYM

    # Everything but the first and last characters. Slicing a string shorter
    # than one character yields nil, so fall back to an empty interior rather
    # than raising NoMethodError on `nil.scan`.
    interior = w[1...-1] || ''

    # More than 30 bytes in length.
    (w.length > 30) ||

    # If there are three or more identical characters in a row in the string.
    (w =~ REPEAT) ||

    # More punctuation than alpha numerics.
    (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||

    # Ignoring the first and last characters in the string, if there are three or
    # more different punctuation characters in the string.
    (interior.scan(PUNCT).uniq.length >= 3) ||

    # Four or more consecutive vowels, or five or more consecutive consonants.
    ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||

    # Number of uppercase letters greater than lowercase letters, but the word is
    # not all uppercase + punctuation.
    (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||

    # Single letters that are not A or I.
    (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||

    # All characters are alphabetic and there are 8 times more vowels than
    # consonants, or 8 times more consonants than vowels.
    (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
      (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
        (cons > vows * 8)))
  end

end
|
|
93
|
+
|
|
94
|
+
end
|
|
@@ -62,14 +62,17 @@ module Docsplit
|
|
|
62
62
|
if pages
|
|
63
63
|
pages.each do |page|
|
|
64
64
|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
|
|
65
|
+
file = "#{base_path}_#{page}"
|
|
65
66
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
|
|
66
|
-
run "tesseract #{tiff} #{
|
|
67
|
+
run "tesseract #{tiff} #{file} 2>&1"
|
|
68
|
+
clean_text(file + '.txt') if @clean_ocr
|
|
67
69
|
FileUtils.remove_entry_secure tiff
|
|
68
70
|
end
|
|
69
71
|
else
|
|
70
72
|
tiff = "#{tempdir}/#{@pdf_name}.tif"
|
|
71
73
|
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
|
|
72
74
|
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
|
|
75
|
+
clean_text(base_path + '.txt') if @clean_ocr
|
|
73
76
|
end
|
|
74
77
|
ensure
|
|
75
78
|
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
|
|
@@ -78,6 +81,15 @@ module Docsplit
|
|
|
78
81
|
|
|
79
82
|
private
|
|
80
83
|
|
|
84
|
+
# Rewrite the text file at path `file` in place with the result of passing
# its contents through Docsplit.clean_text.
#
# The cleaned text is computed *before* the file is truncated, so that if
# cleaning raises, the original contents are still on disk instead of an
# empty file.
def clean_text(file)
  File.open(file, 'r+') do |f|
    cleaned = Docsplit.clean_text(f.read)
    f.rewind
    f.truncate(0)
    f.write(cleaned)
  end
end
|
|
92
|
+
|
|
81
93
|
# Run an external process and raise an exception if it fails.
|
|
82
94
|
def run(command)
|
|
83
95
|
result = `#{command}`
|
|
@@ -106,6 +118,7 @@ module Docsplit
|
|
|
106
118
|
@pages = options[:pages]
|
|
107
119
|
@force_ocr = options[:ocr] == true
|
|
108
120
|
@forbid_ocr = options[:ocr] == false
|
|
121
|
+
@clean_ocr = !(options[:clean] == false)
|
|
109
122
|
end
|
|
110
123
|
|
|
111
124
|
end
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: docsplit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
hash:
|
|
4
|
+
hash: 11
|
|
5
5
|
prerelease: false
|
|
6
6
|
segments:
|
|
7
7
|
- 0
|
|
8
|
-
-
|
|
9
|
-
-
|
|
10
|
-
version: 0.4.1
|
|
8
|
+
- 5
|
|
9
|
+
- 0
|
|
10
|
+
version: 0.5.0
|
|
11
11
|
platform: ruby
|
|
12
12
|
authors:
|
|
13
13
|
- Jeremy Ashkenas
|
|
@@ -16,7 +16,7 @@ autorequire:
|
|
|
16
16
|
bindir: bin
|
|
17
17
|
cert_chain: []
|
|
18
18
|
|
|
19
|
-
date: 2010-
|
|
19
|
+
date: 2010-10-18 00:00:00 -04:00
|
|
20
20
|
default_executable:
|
|
21
21
|
dependencies: []
|
|
22
22
|
|
|
@@ -33,6 +33,7 @@ files:
|
|
|
33
33
|
- lib/docsplit/image_extractor.rb
|
|
34
34
|
- lib/docsplit/info_extractor.rb
|
|
35
35
|
- lib/docsplit/page_extractor.rb
|
|
36
|
+
- lib/docsplit/text_cleaner.rb
|
|
36
37
|
- lib/docsplit/text_extractor.rb
|
|
37
38
|
- lib/docsplit/transparent_pdfs.rb
|
|
38
39
|
- lib/docsplit.rb
|