tesseract-ocr 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/tesseract-train.rb +17 -13
- data/examples/nerdz-captcha-breaker/break.rb +14 -6
- data/lib/tesseract.rb +1 -2
- data/lib/tesseract/engine.rb +8 -8
- data/lib/tesseract/engine/iterator.rb +1 -1
- data/lib/tesseract/extensions.rb +2 -2
- data/lib/tesseract/version.rb +1 -1
- data/tesseract-ocr.gemspec +1 -2
- metadata +11 -22
data/bin/tesseract-train.rb
CHANGED
@@ -7,12 +7,12 @@ require 'shellwords'
|
|
7
7
|
options = {}
|
8
8
|
|
9
9
|
OptionParser.new do |o|
|
10
|
-
o.on '-b', '--box FILE', 'the box
|
11
|
-
options[:box] = File.realpath(
|
10
|
+
o.on '-b', '--box FILE...', Array, 'the box files to use' do |value|
|
11
|
+
options[:box] = value.map { |path| File.realpath(path) }
|
12
12
|
end
|
13
13
|
|
14
|
-
o.on '-i', '--image FILE', 'the image
|
15
|
-
options[:image] = File.realpath(
|
14
|
+
o.on '-i', '--image FILE...', Array, 'the image files to use' do |value|
|
15
|
+
options[:image] = value.map { |path| File.realpath(path) }
|
16
16
|
end
|
17
17
|
|
18
18
|
o.on '-o', '--output FILE', 'the path where to output the traineddata' do |value|
|
@@ -25,24 +25,28 @@ if language = ARGV.shift
|
|
25
25
|
options[:image] = File.realpath("#{language}.tif")
|
26
26
|
options[:output] = File.expand_path("#{language}.traineddata")
|
27
27
|
else
|
28
|
-
language = options[:
|
28
|
+
language = options[:output][/^(.*?)\./, 1]
|
29
29
|
end
|
30
30
|
|
31
31
|
Dir.chdir FileUtils.mkpath(File.join(Dir.tmpdir, rand.to_s)).first
|
32
32
|
|
33
33
|
language = language.shellescape
|
34
34
|
|
35
|
-
|
36
|
-
|
37
|
-
|
35
|
+
options[:box].each_with_index {|box, index|
|
36
|
+
%x{
|
37
|
+
cp #{box.shellescape} #{language}.#{index}.box
|
38
|
+
cp #{options[:image][index].shellescape} #{language}.#{index}#{File.extname(options[:image][index]}
|
39
|
+
|
40
|
+
tesseract #{language}#{File.extname(options[:image])} #{language} nobatch box.train.stderr
|
38
41
|
|
39
|
-
|
42
|
+
unicharset_extractor #{language}.box
|
40
43
|
|
41
|
-
|
44
|
+
echo #{language}.#{index} 0 0 0 0 0 >> font_properties
|
45
|
+
mftraining -F font_properties -U unicharset -O #{language}.unicharset #{language}.tr
|
46
|
+
}
|
47
|
+
}
|
42
48
|
|
43
|
-
|
44
|
-
mftraining -F font_properties -U unicharset #{language}.tr
|
45
|
-
mftraining -F font_properties -U unicharset -O #{language}.unicharset #{language}.tr
|
49
|
+
%x{
|
46
50
|
cntraining #{language}.tr
|
47
51
|
|
48
52
|
mv Microfeat #{language}.Microfeat
|
@@ -18,6 +18,14 @@ def near (x, y)
|
|
18
18
|
]
|
19
19
|
end
|
20
20
|
|
21
|
+
class Magick::Pixel
|
22
|
+
def =~ (other)
|
23
|
+
other = Magick::Pixel.from_color(other) if other.is_a?(String)
|
24
|
+
|
25
|
+
red == other.red && green == other.green && blue == other.blue
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
21
29
|
ENV['TESSDATA_PREFIX'] = './'
|
22
30
|
|
23
31
|
Tesseract::Engine.new {|engine|
|
@@ -33,20 +41,20 @@ Tesseract::Engine.new {|engine|
|
|
33
41
|
pixels[p] += 1
|
34
42
|
}
|
35
43
|
|
36
|
-
pixels.
|
44
|
+
pixels.reject! { |p| p =~ 'black' }
|
37
45
|
|
38
46
|
text_color, count = pixels.max { |a, b| a.last <=> b.last }
|
39
47
|
|
40
48
|
image.each_pixel {|p, x, y|
|
41
|
-
next unless p
|
49
|
+
next unless p =~ text_color or p =~ 'black'
|
42
50
|
|
43
|
-
image.pixel_color x, y, p
|
51
|
+
image.pixel_color x, y, p =~ text_color ? 'black' : 'white'
|
44
52
|
}
|
45
53
|
|
46
54
|
image.each_pixel {|p, x, y|
|
47
|
-
next if p
|
55
|
+
next if p =~ 'black' || p =~ 'white'
|
48
56
|
|
49
|
-
if near(x, y).map { |(x, y)| image.pixel_color x, y }.any? { |p| p
|
57
|
+
if near(x, y).map { |(x, y)| image.pixel_color x, y }.any? { |p| p =~ 'black' }
|
50
58
|
image.pixel_color x, y, 'gray'
|
51
59
|
else
|
52
60
|
image.pixel_color x, y, 'white'
|
@@ -54,7 +62,7 @@ Tesseract::Engine.new {|engine|
|
|
54
62
|
}
|
55
63
|
|
56
64
|
image.each_pixel {|p, x, y|
|
57
|
-
next unless p
|
65
|
+
next unless p =~ 'gray'
|
58
66
|
|
59
67
|
image.pixel_color x, y, 'black'
|
60
68
|
}
|
data/lib/tesseract.rb
CHANGED
data/lib/tesseract/engine.rb
CHANGED
@@ -31,7 +31,7 @@ module Tesseract
|
|
31
31
|
class Engine
|
32
32
|
attr_reader :config
|
33
33
|
|
34
|
-
|
34
|
+
named :path, :language, :mode, :variables,
|
35
35
|
:optional => { :path => '.', :language => :eng, :mode => :DEFAULT, :variables => {}, :config => [] },
|
36
36
|
:alias => { :data => :path, :lang => :language }
|
37
37
|
def initialize (path = '.', language = :eng, mode = :DEFAULT, variables = {}, config = [], &block) # :yields: self
|
@@ -135,14 +135,14 @@ class Engine
|
|
135
135
|
@image = image
|
136
136
|
end
|
137
137
|
|
138
|
-
|
138
|
+
named :x, :y, :width, :height,
|
139
139
|
:optional => 0 .. -1,
|
140
140
|
:alias => { :w => :width, :h => :height }
|
141
141
|
def select (x = nil, y = nil, width = nil, height = nil)
|
142
142
|
@rectangle = [x, y, width, height]
|
143
143
|
end
|
144
144
|
|
145
|
-
|
145
|
+
named :image, :x, :y, :width, :height,
|
146
146
|
:optional => 0 .. -1,
|
147
147
|
:alias => { :w => :width, :h => :height }
|
148
148
|
def text_for (image = nil, x = nil, y = nil, width = nil, height = nil)
|
@@ -160,7 +160,7 @@ class Engine
|
|
160
160
|
}
|
161
161
|
end
|
162
162
|
|
163
|
-
|
163
|
+
named :x, :y, :width, :height,
|
164
164
|
:optional => 0 .. -1,
|
165
165
|
:alias => { :w => :width, :h => :height }
|
166
166
|
def text_at (x = nil, y = nil, width = nil, height = nil)
|
@@ -178,7 +178,7 @@ class Engine
|
|
178
178
|
_iterator.__send__ "each_#{level}", &block
|
179
179
|
end
|
180
180
|
|
181
|
-
|
181
|
+
named :image, :x, :y, :width, :height,
|
182
182
|
:optional => 0 .. -1,
|
183
183
|
:alias => { :w => :width, :h => :height }
|
184
184
|
define_method "each_#{level}_for" do |image = nil, x = nil, y = nil, width = nil, height = nil, &block|
|
@@ -188,7 +188,7 @@ class Engine
|
|
188
188
|
__send__ "each_#{level}", &block
|
189
189
|
end
|
190
190
|
|
191
|
-
|
191
|
+
named :x, :y, :width, :height,
|
192
192
|
:optional => 0 .. -1,
|
193
193
|
:alias => { :w => :width, :h => :height }
|
194
194
|
define_method "each_#{level}_at" do |x = nil, y = nil, width = nil, height = nil, &block|
|
@@ -199,7 +199,7 @@ class Engine
|
|
199
199
|
_iterator.__send__ "#{level}s"
|
200
200
|
end
|
201
201
|
|
202
|
-
|
202
|
+
named :image, :x, :y, :width, :height,
|
203
203
|
:optional => 0 .. -1,
|
204
204
|
:alias => { :w => :width, :h => :height }
|
205
205
|
define_method "#{level}s_for" do |image = nil, x = nil, y = nil, width = nil, height = nil|
|
@@ -209,7 +209,7 @@ class Engine
|
|
209
209
|
__send__ "#{level}s"
|
210
210
|
end
|
211
211
|
|
212
|
-
|
212
|
+
named :x, :y, :width, :height,
|
213
213
|
:optional => 0 .. -1,
|
214
214
|
:alias => { :w => :width, :h => :height }
|
215
215
|
define_method "#{level}s_at" do |x = nil, y = nil, width = nil, height = nil|
|
data/lib/tesseract/extensions.rb
CHANGED
data/lib/tesseract/version.rb
CHANGED
data/tesseract-ocr.gemspec
CHANGED
@@ -14,8 +14,7 @@ Gem::Specification.new {|s|
|
|
14
14
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
15
15
|
s.require_paths = ['lib']
|
16
16
|
|
17
|
-
s.add_dependency '
|
18
|
-
s.add_dependency 'memoized'
|
17
|
+
s.add_dependency 'call-me'
|
19
18
|
s.add_dependency 'iso-639'
|
20
19
|
|
21
20
|
s.add_dependency 'ffi-extra'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tesseract-ocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-12-
|
12
|
+
date: 2011-12-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
16
|
-
requirement: &
|
15
|
+
name: call-me
|
16
|
+
requirement: &15076460 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,21 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: memoized
|
27
|
-
requirement: &20730960 !ruby/object:Gem::Requirement
|
28
|
-
none: false
|
29
|
-
requirements:
|
30
|
-
- - ! '>='
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: '0'
|
33
|
-
type: :runtime
|
34
|
-
prerelease: false
|
35
|
-
version_requirements: *20730960
|
24
|
+
version_requirements: *15076460
|
36
25
|
- !ruby/object:Gem::Dependency
|
37
26
|
name: iso-639
|
38
|
-
requirement: &
|
27
|
+
requirement: &15075360 !ruby/object:Gem::Requirement
|
39
28
|
none: false
|
40
29
|
requirements:
|
41
30
|
- - ! '>='
|
@@ -43,10 +32,10 @@ dependencies:
|
|
43
32
|
version: '0'
|
44
33
|
type: :runtime
|
45
34
|
prerelease: false
|
46
|
-
version_requirements: *
|
35
|
+
version_requirements: *15075360
|
47
36
|
- !ruby/object:Gem::Dependency
|
48
37
|
name: ffi-extra
|
49
|
-
requirement: &
|
38
|
+
requirement: &15074140 !ruby/object:Gem::Requirement
|
50
39
|
none: false
|
51
40
|
requirements:
|
52
41
|
- - ! '>='
|
@@ -54,10 +43,10 @@ dependencies:
|
|
54
43
|
version: '0'
|
55
44
|
type: :runtime
|
56
45
|
prerelease: false
|
57
|
-
version_requirements: *
|
46
|
+
version_requirements: *15074140
|
58
47
|
- !ruby/object:Gem::Dependency
|
59
48
|
name: ffi-inliner
|
60
|
-
requirement: &
|
49
|
+
requirement: &15072700 !ruby/object:Gem::Requirement
|
61
50
|
none: false
|
62
51
|
requirements:
|
63
52
|
- - ! '>='
|
@@ -65,7 +54,7 @@ dependencies:
|
|
65
54
|
version: '0'
|
66
55
|
type: :runtime
|
67
56
|
prerelease: false
|
68
|
-
version_requirements: *
|
57
|
+
version_requirements: *15072700
|
69
58
|
description:
|
70
59
|
email: meh@paranoici.org
|
71
60
|
executables:
|