tesseract-ocr 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/tesseract-train.rb +17 -13
- data/examples/nerdz-captcha-breaker/break.rb +14 -6
- data/lib/tesseract.rb +1 -2
- data/lib/tesseract/engine.rb +8 -8
- data/lib/tesseract/engine/iterator.rb +1 -1
- data/lib/tesseract/extensions.rb +2 -2
- data/lib/tesseract/version.rb +1 -1
- data/tesseract-ocr.gemspec +1 -2
- metadata +11 -22
data/bin/tesseract-train.rb
CHANGED
@@ -7,12 +7,12 @@ require 'shellwords'
|
|
7
7
|
options = {}
|
8
8
|
|
9
9
|
OptionParser.new do |o|
|
10
|
-
o.on '-b', '--box FILE', 'the box
|
11
|
-
options[:box] = File.realpath(value)
|
10
|
+
o.on '-b', '--box FILE...', Array, 'the box files to use' do |value|
|
11
|
+
options[:box] = value.map { |path| File.realpath(path) }
|
12
12
|
end
|
13
13
|
|
14
|
-
o.on '-i', '--image FILE', 'the image
|
15
|
-
options[:image] = File.realpath(value)
|
14
|
+
o.on '-i', '--image FILE...', Array, 'the image files to use' do |value|
|
15
|
+
options[:image] = value.map { |path| File.realpath(path) }
|
16
16
|
end
|
17
17
|
|
18
18
|
o.on '-o', '--output FILE', 'the path where to output the traineddata' do |value|
|
@@ -25,24 +25,28 @@ if language = ARGV.shift
|
|
25
25
|
options[:image] = File.realpath("#{language}.tif")
|
26
26
|
options[:output] = File.expand_path("#{language}.traineddata")
|
27
27
|
else
|
28
|
-
language = options[:
|
28
|
+
language = options[:output][/^(.*?)\./, 1]
|
29
29
|
end
|
30
30
|
|
31
31
|
Dir.chdir FileUtils.mkpath(File.join(Dir.tmpdir, rand.to_s)).first
|
32
32
|
|
33
33
|
language = language.shellescape
|
34
34
|
|
35
|
-
|
36
|
-
|
37
|
-
|
35
|
+
options[:box].each_with_index {|box, index|
|
36
|
+
%x{
|
37
|
+
cp #{box.shellescape} #{language}.#{index}.box
|
38
|
+
cp #{options[:image][index].shellescape} #{language}.#{index}#{File.extname(options[:image][index]}
|
39
|
+
|
40
|
+
tesseract #{language}#{File.extname(options[:image])} #{language} nobatch box.train.stderr
|
38
41
|
|
39
|
-
|
42
|
+
unicharset_extractor #{language}.box
|
40
43
|
|
41
|
-
|
44
|
+
echo #{language}.#{index} 0 0 0 0 0 >> font_properties
|
45
|
+
mftraining -F font_properties -U unicharset -O #{language}.unicharset #{language}.tr
|
46
|
+
}
|
47
|
+
}
|
42
48
|
|
43
|
-
|
44
|
-
mftraining -F font_properties -U unicharset #{language}.tr
|
45
|
-
mftraining -F font_properties -U unicharset -O #{language}.unicharset #{language}.tr
|
49
|
+
%x{
|
46
50
|
cntraining #{language}.tr
|
47
51
|
|
48
52
|
mv Microfeat #{language}.Microfeat
|
@@ -18,6 +18,14 @@ def near (x, y)
|
|
18
18
|
]
|
19
19
|
end
|
20
20
|
|
21
|
+
class Magick::Pixel
|
22
|
+
def =~ (other)
|
23
|
+
other = Magick::Pixel.from_color(other) if other.is_a?(String)
|
24
|
+
|
25
|
+
red == other.red && green == other.green && blue == other.blue
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
21
29
|
ENV['TESSDATA_PREFIX'] = './'
|
22
30
|
|
23
31
|
Tesseract::Engine.new {|engine|
|
@@ -33,20 +41,20 @@ Tesseract::Engine.new {|engine|
|
|
33
41
|
pixels[p] += 1
|
34
42
|
}
|
35
43
|
|
36
|
-
pixels.
|
44
|
+
pixels.reject! { |p| p =~ 'black' }
|
37
45
|
|
38
46
|
text_color, count = pixels.max { |a, b| a.last <=> b.last }
|
39
47
|
|
40
48
|
image.each_pixel {|p, x, y|
|
41
|
-
next unless p
|
49
|
+
next unless p =~ text_color or p =~ 'black'
|
42
50
|
|
43
|
-
image.pixel_color x, y, p
|
51
|
+
image.pixel_color x, y, p =~ text_color ? 'black' : 'white'
|
44
52
|
}
|
45
53
|
|
46
54
|
image.each_pixel {|p, x, y|
|
47
|
-
next if p
|
55
|
+
next if p =~ 'black' || p =~ 'white'
|
48
56
|
|
49
|
-
if near(x, y).map { |(x, y)| image.pixel_color x, y }.any? { |p| p
|
57
|
+
if near(x, y).map { |(x, y)| image.pixel_color x, y }.any? { |p| p =~ 'black' }
|
50
58
|
image.pixel_color x, y, 'gray'
|
51
59
|
else
|
52
60
|
image.pixel_color x, y, 'white'
|
@@ -54,7 +62,7 @@ Tesseract::Engine.new {|engine|
|
|
54
62
|
}
|
55
63
|
|
56
64
|
image.each_pixel {|p, x, y|
|
57
|
-
next unless p
|
65
|
+
next unless p =~ 'gray'
|
58
66
|
|
59
67
|
image.pixel_color x, y, 'black'
|
60
68
|
}
|
data/lib/tesseract.rb
CHANGED
data/lib/tesseract/engine.rb
CHANGED
@@ -31,7 +31,7 @@ module Tesseract
|
|
31
31
|
class Engine
|
32
32
|
attr_reader :config
|
33
33
|
|
34
|
-
|
34
|
+
named :path, :language, :mode, :variables,
|
35
35
|
:optional => { :path => '.', :language => :eng, :mode => :DEFAULT, :variables => {}, :config => [] },
|
36
36
|
:alias => { :data => :path, :lang => :language }
|
37
37
|
def initialize (path = '.', language = :eng, mode = :DEFAULT, variables = {}, config = [], &block) # :yields: self
|
@@ -135,14 +135,14 @@ class Engine
|
|
135
135
|
@image = image
|
136
136
|
end
|
137
137
|
|
138
|
-
|
138
|
+
named :x, :y, :width, :height,
|
139
139
|
:optional => 0 .. -1,
|
140
140
|
:alias => { :w => :width, :h => :height }
|
141
141
|
def select (x = nil, y = nil, width = nil, height = nil)
|
142
142
|
@rectangle = [x, y, width, height]
|
143
143
|
end
|
144
144
|
|
145
|
-
|
145
|
+
named :image, :x, :y, :width, :height,
|
146
146
|
:optional => 0 .. -1,
|
147
147
|
:alias => { :w => :width, :h => :height }
|
148
148
|
def text_for (image = nil, x = nil, y = nil, width = nil, height = nil)
|
@@ -160,7 +160,7 @@ class Engine
|
|
160
160
|
}
|
161
161
|
end
|
162
162
|
|
163
|
-
|
163
|
+
named :x, :y, :width, :height,
|
164
164
|
:optional => 0 .. -1,
|
165
165
|
:alias => { :w => :width, :h => :height }
|
166
166
|
def text_at (x = nil, y = nil, width = nil, height = nil)
|
@@ -178,7 +178,7 @@ class Engine
|
|
178
178
|
_iterator.__send__ "each_#{level}", &block
|
179
179
|
end
|
180
180
|
|
181
|
-
|
181
|
+
named :image, :x, :y, :width, :height,
|
182
182
|
:optional => 0 .. -1,
|
183
183
|
:alias => { :w => :width, :h => :height }
|
184
184
|
define_method "each_#{level}_for" do |image = nil, x = nil, y = nil, width = nil, height = nil, &block|
|
@@ -188,7 +188,7 @@ class Engine
|
|
188
188
|
__send__ "each_#{level}", &block
|
189
189
|
end
|
190
190
|
|
191
|
-
|
191
|
+
named :x, :y, :width, :height,
|
192
192
|
:optional => 0 .. -1,
|
193
193
|
:alias => { :w => :width, :h => :height }
|
194
194
|
define_method "each_#{level}_at" do |x = nil, y = nil, width = nil, height = nil, &block|
|
@@ -199,7 +199,7 @@ class Engine
|
|
199
199
|
_iterator.__send__ "#{level}s"
|
200
200
|
end
|
201
201
|
|
202
|
-
|
202
|
+
named :image, :x, :y, :width, :height,
|
203
203
|
:optional => 0 .. -1,
|
204
204
|
:alias => { :w => :width, :h => :height }
|
205
205
|
define_method "#{level}s_for" do |image = nil, x = nil, y = nil, width = nil, height = nil|
|
@@ -209,7 +209,7 @@ class Engine
|
|
209
209
|
__send__ "#{level}s"
|
210
210
|
end
|
211
211
|
|
212
|
-
|
212
|
+
named :x, :y, :width, :height,
|
213
213
|
:optional => 0 .. -1,
|
214
214
|
:alias => { :w => :width, :h => :height }
|
215
215
|
define_method "#{level}s_at" do |x = nil, y = nil, width = nil, height = nil|
|
data/lib/tesseract/extensions.rb
CHANGED
data/lib/tesseract/version.rb
CHANGED
data/tesseract-ocr.gemspec
CHANGED
@@ -14,8 +14,7 @@ Gem::Specification.new {|s|
|
|
14
14
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
15
15
|
s.require_paths = ['lib']
|
16
16
|
|
17
|
-
s.add_dependency '
|
18
|
-
s.add_dependency 'memoized'
|
17
|
+
s.add_dependency 'call-me'
|
19
18
|
s.add_dependency 'iso-639'
|
20
19
|
|
21
20
|
s.add_dependency 'ffi-extra'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tesseract-ocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-12-
|
12
|
+
date: 2011-12-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
16
|
-
requirement: &
|
15
|
+
name: call-me
|
16
|
+
requirement: &15076460 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,21 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: memoized
|
27
|
-
requirement: &20730960 !ruby/object:Gem::Requirement
|
28
|
-
none: false
|
29
|
-
requirements:
|
30
|
-
- - ! '>='
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: '0'
|
33
|
-
type: :runtime
|
34
|
-
prerelease: false
|
35
|
-
version_requirements: *20730960
|
24
|
+
version_requirements: *15076460
|
36
25
|
- !ruby/object:Gem::Dependency
|
37
26
|
name: iso-639
|
38
|
-
requirement: &
|
27
|
+
requirement: &15075360 !ruby/object:Gem::Requirement
|
39
28
|
none: false
|
40
29
|
requirements:
|
41
30
|
- - ! '>='
|
@@ -43,10 +32,10 @@ dependencies:
|
|
43
32
|
version: '0'
|
44
33
|
type: :runtime
|
45
34
|
prerelease: false
|
46
|
-
version_requirements: *
|
35
|
+
version_requirements: *15075360
|
47
36
|
- !ruby/object:Gem::Dependency
|
48
37
|
name: ffi-extra
|
49
|
-
requirement: &
|
38
|
+
requirement: &15074140 !ruby/object:Gem::Requirement
|
50
39
|
none: false
|
51
40
|
requirements:
|
52
41
|
- - ! '>='
|
@@ -54,10 +43,10 @@ dependencies:
|
|
54
43
|
version: '0'
|
55
44
|
type: :runtime
|
56
45
|
prerelease: false
|
57
|
-
version_requirements: *
|
46
|
+
version_requirements: *15074140
|
58
47
|
- !ruby/object:Gem::Dependency
|
59
48
|
name: ffi-inliner
|
60
|
-
requirement: &
|
49
|
+
requirement: &15072700 !ruby/object:Gem::Requirement
|
61
50
|
none: false
|
62
51
|
requirements:
|
63
52
|
- - ! '>='
|
@@ -65,7 +54,7 @@ dependencies:
|
|
65
54
|
version: '0'
|
66
55
|
type: :runtime
|
67
56
|
prerelease: false
|
68
|
-
version_requirements: *
|
57
|
+
version_requirements: *15072700
|
69
58
|
description:
|
70
59
|
email: meh@paranoici.org
|
71
60
|
executables:
|