tesseract-ocr 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +54 -16
- data/bin/tesseract-train.rb +63 -0
- data/examples/nerdz-captcha-breaker/break.rb +18 -10
- data/examples/nerdz-captcha-breaker/captchas/001.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/002.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/003.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/004.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/005.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/006.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/007.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/008.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/009.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/010.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/011.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/012.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/013.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/014.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/015.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/016.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/017.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/018.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/019.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/020.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/021.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/022.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/023.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/024.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/025.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/026.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/027.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/028.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/029.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/030.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/031.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/032.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/033.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/034.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/035.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/036.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/037.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/038.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/039.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/040.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/041.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/042.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/043.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/044.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/045.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/046.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/047.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/048.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/049.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/050.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/051.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/052.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/053.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/054.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/055.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/056.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/057.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/058.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/059.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/060.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/061.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/062.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/063.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/064.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/065.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/066.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/067.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/068.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/069.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/070.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/071.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/072.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/073.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/074.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/075.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/076.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/077.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/078.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/079.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/080.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/081.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/082.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/083.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/084.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/085.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/086.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/087.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/088.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/089.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/090.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/091.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/092.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/093.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/094.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/095.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/096.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/097.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/098.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/099.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/100.png +0 -0
- data/examples/nerdz-captcha-breaker/captchas/captchas.txt +100 -0
- data/examples/nerdz-captcha-breaker/tessdata/generate.rb +21 -0
- data/examples/nerdz-captcha-breaker/tessdata/lol.box +600 -0
- data/examples/nerdz-captcha-breaker/tessdata/lol.tif +0 -0
- data/examples/nerdz-captcha-breaker/tessdata/lol.traineddata +0 -0
- data/examples/nerdz-captcha-breaker/test.rb +11 -0
- data/lib/tesseract/engine.rb +19 -2
- data/lib/tesseract/extensions.rb +0 -1
- data/lib/tesseract/version.rb +1 -1
- data/test/jsmj.png +0 -0
- data/test/tesseract_spec.rb +9 -0
- metadata +122 -14
- data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.box +0 -112
- data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.tif +0 -0
data/README.md
CHANGED
@@ -12,8 +12,14 @@ because it's still under review for upstream merging.
|
|
12
12
|
|
13
13
|
The gem is called `tesseract-ocr`.
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
If you're having any problem requiring tesseract or any dependencies, check the permissions of the installed
|
16
|
+
gems.
|
17
|
+
|
18
|
+
Examples
|
19
|
+
--------
|
20
|
+
Following are some examples that show the functionalities provided by tesseract-ocr.
|
21
|
+
|
22
|
+
### Basic functionality of tesseract
|
17
23
|
|
18
24
|
```ruby
|
19
25
|
require 'tesseract'
|
@@ -24,26 +30,58 @@ e = Tesseract::Engine.new {|e|
|
|
24
30
|
}
|
25
31
|
|
26
32
|
e.text_for('test/first.png').strip # => 'ABC'
|
27
|
-
|
28
|
-
e.words_for('test/second.png') # [
|
29
|
-
# [ 0] #<Tesseract(93.41653442382812): "|'m">,
|
30
|
-
# [ 1] #<Tesseract(91.11811828613281): "12">,
|
31
|
-
# [ 2] #<Tesseract(85.71760559082031): "and">,
|
32
|
-
# [ 3] #<Tesseract(83.4853515625): "what">,
|
33
|
-
# [ 4] #<Tesseract(86.71072387695312): "is">,
|
34
|
-
# [ 5] #<Tesseract(83.2227783203125): "this.">,
|
35
|
-
# [ 6] #<Tesseract(82.81439208984375): "INSTALL">,
|
36
|
-
# [ 7] #<Tesseract(86.46566772460938): "GENTOO">,
|
37
|
-
# [ 8] #<Tesseract(93.19613647460938): "OH">,
|
38
|
-
# [ 9] #<Tesseract(82.81439208984375): "HAI">,
|
39
|
-
# [10] #<Tesseract(85.9158935546875): "1234">
|
40
|
-
# ]
|
41
33
|
```
|
42
34
|
|
43
35
|
You can pass to `#text_for` either a path, an IO object, a string containing the image or
|
44
36
|
an object that responds to `#to_blob` (for example Magick::Image), keep in mind that
|
45
37
|
the format has to be supported by leptonica.
|
46
38
|
|
39
|
+
### Accessing advanced features
|
40
|
+
|
41
|
+
With advanced features you get access to blocks, paragraphs, lines, words and symbols.
|
42
|
+
|
43
|
+
There are a lot of ways to access those levels; the methods are the following (replace level
|
44
|
+
with one of the accessible features, so `each_level` can be `each_block` or `each_paragraph`
|
45
|
+
etc.)
|
46
|
+
|
47
|
+
The following kind of accessors need a block to be passed and they pass to the block each
|
48
|
+
`Element` object. The Element object has various getters to access certain features, I'll
|
49
|
+
talk about them later.
|
50
|
+
|
51
|
+
The methods are:
|
52
|
+
|
53
|
+
* `each_level`
|
54
|
+
* `each_level_for`
|
55
|
+
* `each_level_at`
|
56
|
+
|
57
|
+
The following accessors instead return an `Array` of `Element`s with cached getters, the getters
|
58
|
+
are cached because the values accessible in the `Element` are linked to the state of the internal
|
59
|
+
API, and that state changes if you access something else.
|
60
|
+
|
61
|
+
The methods are:
|
62
|
+
|
63
|
+
* `levels`
|
64
|
+
* `levels_for`
|
65
|
+
* `levels_at`
|
66
|
+
|
67
|
+
Again, to the `*_for` methods you can pass anything you can pass to `#text_for`.
|
68
|
+
|
69
|
+
Each `Element` object has the following getters:
|
70
|
+
|
71
|
+
* `bounding_box`, this will return the box in which the element is confined
|
72
|
+
* `binary_image`, this will return the bichromatic image of the element
|
73
|
+
* `image`, this will return the image of the element
|
74
|
+
* `baseline`, this will return the line where the text is with a pair of coordinates
|
75
|
+
* `orientation`, this will return the orientation of the element
|
76
|
+
* `text`, this will return the text of the element
|
77
|
+
* `confidence`, this will return the confidence of correctness for the element
|
78
|
+
|
79
|
+
`Block` elements also have `type` accessors that specify the type of the block.
|
80
|
+
|
81
|
+
`Word` elements also have `font_attributes`, `from_dictionary?` and `numeric?` getters.
|
82
|
+
|
83
|
+
`Symbol` elements also have `superscript?`, `subscript?` and `dropcap?` getters.
|
84
|
+
|
47
85
|
Using the binary
|
48
86
|
----------------
|
49
87
|
You can also use the shipped executable in the following way:
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'shellwords'
|
6
|
+
|
7
|
+
options = {}
|
8
|
+
|
9
|
+
OptionParser.new do |o|
|
10
|
+
o.on '-b', '--box FILE', 'the box file to use' do |value|
|
11
|
+
options[:box] = File.realpath(value)
|
12
|
+
end
|
13
|
+
|
14
|
+
o.on '-i', '--image FILE', 'the image file to use' do |value|
|
15
|
+
options[:image] = File.realpath(value)
|
16
|
+
end
|
17
|
+
|
18
|
+
o.on '-o', '--output FILE', 'the path where to output the traineddata' do |value|
|
19
|
+
options[:output] = File.expand_path(value)
|
20
|
+
end
|
21
|
+
end.parse!
|
22
|
+
|
23
|
+
if language = ARGV.shift
|
24
|
+
options[:box] = File.realpath("#{language}.box")
|
25
|
+
options[:image] = File.realpath("#{language}.tif")
|
26
|
+
options[:output] = File.expand_path("#{language}.traineddata")
|
27
|
+
else
|
28
|
+
language = options[:box][/^(.*?)\./, 1]
|
29
|
+
end
|
30
|
+
|
31
|
+
Dir.chdir FileUtils.mkpath(File.join(Dir.tmpdir, rand.to_s)).first
|
32
|
+
|
33
|
+
language = language.shellescape
|
34
|
+
|
35
|
+
%x{
|
36
|
+
cp #{options[:box].shellescape} #{language}.box
|
37
|
+
cp #{options[:image].shellescape} #{language}#{File.extname(options[:image])}
|
38
|
+
|
39
|
+
tesseract #{language}#{File.extname(options[:image])} #{language} nobatch box.train.stderr
|
40
|
+
|
41
|
+
unicharset_extractor #{language}.box
|
42
|
+
|
43
|
+
echo #{language} 0 0 0 0 0 > font_properties
|
44
|
+
mftraining -F font_properties -U unicharset #{language}.tr
|
45
|
+
mftraining -F font_properties -U unicharset -O #{language}.unicharset #{language}.tr
|
46
|
+
cntraining #{language}.tr
|
47
|
+
|
48
|
+
mv Microfeat #{language}.Microfeat
|
49
|
+
mv normproto #{language}.normproto
|
50
|
+
mv pffmtable #{language}.pffmtable
|
51
|
+
mv mfunicharset #{language}.mfunicharset
|
52
|
+
mv inttemp #{language}.inttemp
|
53
|
+
|
54
|
+
combine_tessdata #{language}.
|
55
|
+
|
56
|
+
mv #{language}.traineddata #{options[:output].shellescape}
|
57
|
+
}
|
58
|
+
|
59
|
+
path = File.realpath(Dir.pwd)
|
60
|
+
|
61
|
+
Dir.chdir '/'
|
62
|
+
|
63
|
+
FileUtils.rm_rf path
|
@@ -2,21 +2,29 @@
|
|
2
2
|
require 'tesseract'
|
3
3
|
require 'RMagick'
|
4
4
|
|
5
|
+
# this function is used to get points near the current pixel to
|
6
|
+
# cleanup the oblique lines mess, horizontal points seem to output
|
7
|
+
# better cleanup
|
5
8
|
def near (x, y)
|
6
9
|
[
|
7
|
-
[x - 1, y - 1],
|
8
|
-
[x, y - 1],
|
9
|
-
[x + 1, y - 1],
|
10
|
+
# [x - 1, y - 1],
|
11
|
+
# [x, y - 1],
|
12
|
+
# [x + 1, y - 1],
|
10
13
|
[x - 1, y ],
|
11
|
-
# FIRE IN THE HOLE
|
12
14
|
[x + 1, y ],
|
13
|
-
[x - 1, y + 1],
|
14
|
-
[x, y + 1],
|
15
|
-
[x + 1, y + 1]
|
15
|
+
# [x - 1, y + 1],
|
16
|
+
# [x, y + 1],
|
17
|
+
# [x + 1, y + 1]
|
16
18
|
]
|
17
19
|
end
|
18
20
|
|
19
|
-
|
21
|
+
ENV['TESSDATA_PREFIX'] = './'
|
22
|
+
|
23
|
+
Tesseract::Engine.new {|engine|
|
24
|
+
engine.language = :lol
|
25
|
+
engine.page_segmentation_mode = 8
|
26
|
+
engine.whitelist = [*'a'..'z', *'A'..'Z', *0..9].join
|
27
|
+
}.tap {|engine|
|
20
28
|
ARGV.each {|path|
|
21
29
|
image = Magick::Image.read(path).first
|
22
30
|
pixels = Hash.new { |h, k| h[k] = 0 }
|
@@ -51,8 +59,8 @@ Tesseract::Engine.new.tap {|engine|
|
|
51
59
|
image.pixel_color x, y, 'black'
|
52
60
|
}
|
53
61
|
|
54
|
-
|
62
|
+
image.scale(4).display if ENV['DEBUG']
|
55
63
|
|
56
|
-
puts engine.text_for(image.
|
64
|
+
puts "#{path}: #{engine.text_for(image.scale(4)).strip}"
|
57
65
|
}
|
58
66
|
}
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,100 @@
|
|
1
|
+
enKzaV
|
2
|
+
CZU6tf
|
3
|
+
ZO5mGY
|
4
|
+
FNNhZv
|
5
|
+
Dwp1Vy
|
6
|
+
JYsDAi
|
7
|
+
dld510
|
8
|
+
yG615j
|
9
|
+
WHxAUZ
|
10
|
+
k9IhZu
|
11
|
+
qWIPSr
|
12
|
+
nDSXc5
|
13
|
+
9iTYeZ
|
14
|
+
s44iQ9
|
15
|
+
VPNXWy
|
16
|
+
80zxvW
|
17
|
+
QA7IYj
|
18
|
+
D8Ro4U
|
19
|
+
OiEg1U
|
20
|
+
pJS7Z8
|
21
|
+
6w8eik
|
22
|
+
s5igED
|
23
|
+
7bJe8p
|
24
|
+
VtYdW3
|
25
|
+
jNNdcO
|
26
|
+
neLPNV
|
27
|
+
KONPnl
|
28
|
+
Q8aSXJ
|
29
|
+
kIwSqv
|
30
|
+
8LQExn
|
31
|
+
RwcDU2
|
32
|
+
LMLg5K
|
33
|
+
C0YmdD
|
34
|
+
mqAvES
|
35
|
+
Ai0Wxi
|
36
|
+
bopETp
|
37
|
+
L3yP5u
|
38
|
+
w4rw3b
|
39
|
+
oSEUMU
|
40
|
+
bftqDK
|
41
|
+
mM7cKE
|
42
|
+
rYl6x4
|
43
|
+
3hVI8X
|
44
|
+
Tm2PPp
|
45
|
+
VfmqQ6
|
46
|
+
0EZAgC
|
47
|
+
QW6gBS
|
48
|
+
UTS137
|
49
|
+
YXXTqk
|
50
|
+
a6LU3K
|
51
|
+
SVzguN
|
52
|
+
l9G8Y9
|
53
|
+
ZP9TDM
|
54
|
+
yj7zmS
|
55
|
+
sD0Ub9
|
56
|
+
XeWY3A
|
57
|
+
w8EKVl
|
58
|
+
3gO266
|
59
|
+
yYN3oJ
|
60
|
+
NumjLi
|
61
|
+
EEzwCz
|
62
|
+
8bUSjW
|
63
|
+
GAo6ap
|
64
|
+
AXcn6K
|
65
|
+
KquWmp
|
66
|
+
PM8LYt
|
67
|
+
uGS7GO
|
68
|
+
mucVzf
|
69
|
+
UIZ6cf
|
70
|
+
mXRdGq
|
71
|
+
lcn8OP
|
72
|
+
lKqkw7
|
73
|
+
CTHnR1
|
74
|
+
ShvmSD
|
75
|
+
klCoXI
|
76
|
+
8epftU
|
77
|
+
nA277p
|
78
|
+
uXfavQ
|
79
|
+
EhInXB
|
80
|
+
KJLUZf
|
81
|
+
nAWswu
|
82
|
+
savOX1
|
83
|
+
bJtoCK
|
84
|
+
cTnmnF
|
85
|
+
hoYGbS
|
86
|
+
d3J04A
|
87
|
+
c18WaN
|
88
|
+
aO5VVN
|
89
|
+
DvP3vf
|
90
|
+
7cuFOq
|
91
|
+
mPaSmA
|
92
|
+
k0rNqQ
|
93
|
+
tvG40k
|
94
|
+
154xFW
|
95
|
+
x9ioCI
|
96
|
+
p9rR4J
|
97
|
+
DNLPJA
|
98
|
+
wGR2O1
|
99
|
+
zVr96r
|
100
|
+
856sHR
|