docsplit 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.4.1' # Keep version in sync with docsplit.rb
4
- s.date = '2010-8-23'
3
+ s.version = '0.5.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2010-10-18'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.4.1' # Keep in sync with gemspec.
4
+ VERSION = '0.5.0' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -72,6 +72,11 @@ module Docsplit
72
72
  EOS
73
73
  end
74
74
 
75
+ # Utility method to clean OCR'd text with garbage characters.
76
+ def self.clean_text(text)
77
+ TextCleaner.new.clean(text)
78
+ end
79
+
75
80
 
76
81
  private
77
82
 
@@ -103,3 +108,4 @@ require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
103
108
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
104
109
  require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
105
110
  require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
111
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"
@@ -71,7 +71,7 @@ Options:
71
71
  # Use the OptionParser library to parse out all supported options. Return
72
72
  # options formatted for the Ruby API.
73
73
  def parse_options
74
- @options = {:ocr => :default}
74
+ @options = {:ocr => :default, :clean => true}
75
75
  @option_parser = OptionParser.new do |opts|
76
76
  opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
77
77
  @options[:output] = d
@@ -88,6 +88,9 @@ Options:
88
88
  opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
89
89
  @options[:ocr] = o
90
90
  end
91
+ opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
92
+ @options[:clean] = false
93
+ end
91
94
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
92
95
  @options[:rolling] = true
93
96
  end
@@ -0,0 +1,94 @@
1
+ require 'iconv'
2
+ require 'strscan'
3
+
4
+ module Docsplit
5
+
6
+ # Cleans up OCR'd text by using a series of heuristics to remove garbage
7
+ # words. Algorithms taken from:
8
+ #
9
+ # Automatic Removal of "Garbage Strings" in OCR Text: An Implementation
10
+ # -- Taghva, Nartker, Condit, and Borsack
11
+ #
12
+ # Improving Search and Retrieval Performance through Shortening Documents,
13
+ # Detecting Garbage, and Throwing out Jargon
14
+ # -- Kulp
15
+ #
16
+ class TextCleaner
17
+
18
+ # Cached regexes we plan on using.
19
+ WORD = /\S+/
20
+ SPACE = /\s+/
21
+ NEWLINE = /[\r\n]/
22
+ ALNUM = /[a-z0-9]/i
23
+ PUNCT = /[[:punct:]]/i
24
+ REPEAT = /([^0-9])\1{2,}/
25
+ UPPER = /[A-Z]/
26
+ LOWER = /[a-z]/
27
+ ACRONYM = /^\(?[A-Z0-9\.]+('?s)?\)?[.,:]?$/
28
+ ALL_ALPHA = /^[a-z]+$/i
29
+ CONSONANT = /(^y|[bcdfghjklmnpqrstvwxz])/i
30
+ VOWEL = /([aeiou]|y$)/i
31
+ CONSONANT_5 = /[bcdfghjklmnpqrstvwxyz]{5}/i
32
+ VOWEL_4 = /[aeiou]{4}/i
33
+ REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
34
+ SINGLETONS = /^[AaIi]$/
35
+
36
+ # For the time being, `clean` uses the regular StringScanner, and not the
37
+ # multibyte-aware version, coercing to ASCII first.
38
+ def clean(text)
39
+ text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
40
+ scanner = StringScanner.new(text)
41
+ cleaned = []
42
+ spaced = false
43
+ loop do
44
+ if space = scanner.scan(SPACE)
45
+ cleaned.push(space) unless spaced && (space !~ NEWLINE)
46
+ spaced = true
47
+ elsif word = scanner.scan(WORD)
48
+ unless garbage(word)
49
+ cleaned.push(word)
50
+ spaced = false
51
+ end
52
+ elsif scanner.eos?
53
+ return cleaned.join('').gsub(REPEATED, '')
54
+ end
55
+ end
56
+ end
57
+
58
+ # Is a given word OCR garbage?
59
+ def garbage(w)
60
+ acronym = w =~ ACRONYM
61
+
62
+ # More than 30 characters in length (the same as bytes once `clean` has coerced the text to ASCII).
63
+ (w.length > 30) ||
64
+
65
+ # Three or more identical characters in a row, digits excepted (REPEAT ignores runs such as "000").
66
+ (w =~ REPEAT) ||
67
+
68
+ # More punctuation than alpha numerics.
69
+ (!acronym && (w.scan(ALNUM).length < w.scan(PUNCT).length)) ||
70
+
71
+ # Three or more distinct punctuation characters, not counting the first and
72
+ # last characters of the word.
73
+ (w[1...-1].scan(PUNCT).uniq.length >= 3) ||
74
+
75
+ # Four or more consecutive vowels, or five or more consecutive consonants.
76
+ ((w =~ VOWEL_4) || (w =~ CONSONANT_5)) ||
77
+
78
+ # More uppercase letters than lowercase letters, unless the whole word matches the
79
+ # ACRONYM pattern (capitals, digits and dots, optionally parenthesized, pluralized, or followed by . , or :).
80
+ (!acronym && (w.scan(UPPER).length > w.scan(LOWER).length)) ||
81
+
82
+ # Single letters that are not A or I.
83
+ (w.length == 1 && (w =~ ALL_ALPHA) && (w !~ SINGLETONS)) ||
84
+
85
+ # Not an acronym, longer than two characters, entirely alphabetic, and with more than
86
+ # 8 times as many vowels as consonants, or more than 8 times as many consonants as vowels.
87
+ (!acronym && (w.length > 2 && (w =~ ALL_ALPHA)) &&
88
+ (((vows = w.scan(VOWEL).length) > (cons = w.scan(CONSONANT).length) * 8) ||
89
+ (cons > vows * 8)))
90
+ end
91
+
92
+ end
93
+
94
+ end
@@ -62,14 +62,17 @@ module Docsplit
62
62
  if pages
63
63
  pages.each do |page|
64
64
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
65
+ file = "#{base_path}_#{page}"
65
66
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
66
- run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
67
+ run "tesseract #{tiff} #{file} 2>&1"
68
+ clean_text(file + '.txt') if @clean_ocr
67
69
  FileUtils.remove_entry_secure tiff
68
70
  end
69
71
  else
70
72
  tiff = "#{tempdir}/#{@pdf_name}.tif"
71
73
  run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
72
74
  run "tesseract #{tiff} #{base_path} -l eng 2>&1"
75
+ clean_text(base_path + '.txt') if @clean_ocr
73
76
  end
74
77
  ensure
75
78
  FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
@@ -78,6 +81,15 @@ module Docsplit
78
81
 
79
82
  private
80
83
 
84
+ def clean_text(file)
85
+ File.open(file, 'r+') do |f|
86
+ text = f.read
87
+ f.truncate(0)
88
+ f.rewind
89
+ f.write(Docsplit.clean_text(text))
90
+ end
91
+ end
92
+
81
93
  # Run an external process and raise an exception if it fails.
82
94
  def run(command)
83
95
  result = `#{command}`
@@ -106,6 +118,7 @@ module Docsplit
106
118
  @pages = options[:pages]
107
119
  @force_ocr = options[:ocr] == true
108
120
  @forbid_ocr = options[:ocr] == false
121
+ @clean_ocr = !(options[:clean] == false)
109
122
  end
110
123
 
111
124
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 11
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 4
9
- - 1
10
- version: 0.4.1
8
+ - 5
9
+ - 0
10
+ version: 0.5.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-08-23 00:00:00 -04:00
19
+ date: 2010-10-18 00:00:00 -04:00
20
20
  default_executable:
21
21
  dependencies: []
22
22
 
@@ -33,6 +33,7 @@ files:
33
33
  - lib/docsplit/image_extractor.rb
34
34
  - lib/docsplit/info_extractor.rb
35
35
  - lib/docsplit/page_extractor.rb
36
+ - lib/docsplit/text_cleaner.rb
36
37
  - lib/docsplit/text_extractor.rb
37
38
  - lib/docsplit/transparent_pdfs.rb
38
39
  - lib/docsplit.rb