docsplit 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.4.1' # Keep version in sync with docsplit.rb
4
- s.date = '2010-8-23'
3
+ s.version = '0.5.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-10-18'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.4.1' # Keep in sync with gemspec.
4
+ VERSION = '0.5.0' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -72,6 +72,11 @@ module Docsplit
72
72
  EOS
73
73
  end
74
74
 
75
+ # Utility method to clean OCR'd text with garbage characters.
76
+ def self.clean_text(text)
77
+ TextCleaner.new.clean(text)
78
+ end
79
+
75
80
 
76
81
  private
77
82
 
@@ -103,3 +108,4 @@ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
103
108
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
104
109
  require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
105
110
  require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
111
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
@@ -71,7 +71,7 @@ Options:
71
71
  # Use the OptionParser library to parse out all supported options. Return
72
72
  # options formatted for the Ruby API.
73
73
  def parse_options
74
- @options = {:ocr => :default}
74
+ @options = {:ocr => :default, :clean => true}
75
75
  @option_parser = OptionParser.new do |opts|
76
76
  opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
77
77
  @options[:output] = d
@@ -88,6 +88,9 @@ Options:
88
88
  opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
89
89
  @options[:ocr] = o
90
90
  end
91
+ opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
92
+ @options[:clean] = false
93
+ end
91
94
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
92
95
  @options[:rolling] = true
93
96
  end
@@ -0,0 +1,94 @@
1
+ require 'iconv'
2
+ require 'strscan'
3
+
4
+ module Docsplit
5
+
6
+ # Cleans up OCR'd text by using a series of heuristics to remove garbage
7
+ # words. Algorithms taken from:
8
+ #
9
+ # Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
10
+ # -- Taghva, Nartker, Condit, and Borsack
11
+ #
12
+ # Improving Search and Retrieval Performance through Shortening Documents,
13
+ # Detecting Garbage, and Throwing out Jargon
14
+ # -- Kulp
15
+ #
16
+ class TextCleaner
17
+
18
+ # Cached regexes we plan on using.
19
+ WORD = /\S+/
20
+ SPACE = /\s+/
21
+ NEWLINE = /[\r\n]/
22
+ ALNUM = /[a-z0-9]/i
23
+ PUNCT = /[[:punct:]]/i
24
+ REPEAT = /([^0-9])\1{2,}/
25
+ UPPER = /[A-Z]/
26
+ LOWER = /[a-z]/
27
+ ACRONYM = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
28
+ ALL_ALPHA = /^[a-z]+$/i
29
+ CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i
30
+ VOWEL = /([aeiou]|y$)/i
31
+ CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
32
+ VOWEL_4 = /[aeiou]{4}/i
33
+ REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
34
+ SINGLETONS = /^[AaIi]$/
35
+
36
+ # For the time being, `clean` uses the regular StringScanner, and not the
37
+ # multibyte-aware version, coercing to ASCII first.
38
+ def clean(text)
39
+ text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
40
+ scanner = StringScanner.new(text)
41
+ cleaned = []
42
+ spaced = false
43
+ loop do
44
+ if space = scanner.scan(SPACE)
45
+ cleaned.push(space) unless spaced && (space !~ NEWLINE)
46
+ spaced = true
47
+ elsif word = scanner.scan(WORD)
48
+ unless garbage(word)
49
+ cleaned.push(word)
50
+ spaced = false
51
+ end
52
+ elsif scanner.eos?
53
+ return cleaned.join('').gsub(REPEATED, '')
54
+ end
55
+ end
56
+ end
57
+
58
+ # Is a given word OCR garbage?
59
+ def garbage(w)
60
+ acronym = w =~ ACRONYM
61
+
62
+ # More than 30 bytes in length.
63
+ (w.length > 30) ||
64
+
65
+ # If there are three or more identical characters in a row in the string.
66
+ (w =~ REPEAT) ||
67
+
68
+ # More punctuation than alpha numerics.
69
+ (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
70
+
71
+ # Ignoring the first and last characters in the string, if there are three or
72
+ # more different punctuation characters in the string.
73
+ (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
74
+
75
+ # Four or more consecutive vowels, or five or more consecutive consonants.
76
+ ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||
77
+
78
+ # Number of uppercase letters greater than lowercase letters, but the word is
79
+ # not all uppercase + punctuation.
80
+ (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
81
+
82
+ # Single letters that are not A or I.
83
+ (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
84
+
85
+ # All characters are alphabetic and there are 8 times more vowels than
86
+ # consonants, or 8 times more consonants than vowels.
87
+ (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
88
+ (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
89
+ (cons > vows * 8)))
90
+ end
91
+
92
+ end
93
+
94
+ end
@@ -62,14 +62,17 @@ module Docsplit
62
62
  if pages
63
63
  pages.each do |page|
64
64
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
65
+ file = "#{base_path}_#{page}"
65
66
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
66
- run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
67
+ run "tesseract #{tiff} #{file} 2>&1"
68
+ clean_text(file + '.txt') if @clean_ocr
67
69
  FileUtils.remove_entry_secure tiff
68
70
  end
69
71
  else
70
72
  tiff = "#{tempdir}/#{@pdf_name}.tif"
71
73
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
72
74
  run "tesseract #{tiff} #{base_path} -l eng 2>&1"
75
+ clean_text(base_path + '.txt') if @clean_ocr
73
76
  end
74
77
  ensure
75
78
  FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
@@ -78,6 +81,15 @@ module Docsplit
78
81
 
79
82
  private
80
83
 
84
+ def clean_text(file)
85
+ File.open(file, 'r+') do |f|
86
+ text = f.read
87
+ f.truncate(0)
88
+ f.rewind
89
+ f.write(Docsplit.clean_text(text))
90
+ end
91
+ end
92
+
81
93
  # Run an external process and raise an exception if it fails.
82
94
  def run(command)
83
95
  result = `#{command}`
@@ -106,6 +118,7 @@ module Docsplit
106
118
  @pages = options[:pages]
107
119
  @force_ocr = options[:ocr] == true
108
120
  @forbid_ocr = options[:ocr] == false
121
+ @clean_ocr = !(options[:clean] == false)
109
122
  end
110
123
 
111
124
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 11
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 4
9
- - 1
10
- version: 0.4.1
8
+ - 5
9
+ - 0
10
+ version: 0.5.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-08-23 00:00:00 -04:00
19
+ date: 2010-10-18 00:00:00 -04:00
20
20
  default_executable:
21
21
  dependencies: []
22
22
 
@@ -33,6 +33,7 @@ files:
33
33
  - lib/docsplit/image_extractor.rb
34
34
  - lib/docsplit/info_extractor.rb
35
35
  - lib/docsplit/page_extractor.rb
36
+ - lib/docsplit/text_cleaner.rb
36
37
  - lib/docsplit/text_extractor.rb
37
38
  - lib/docsplit/transparent_pdfs.rb
38
39
  - lib/docsplit.rb