tesseract-ocr 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/examples/{nerdz-captcha-breaker.rb → nerdz-captcha-breaker/break.rb} +2 -0
- data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.box +112 -0
- data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.tif +0 -0
- data/lib/tesseract/api/iterator.rb +2 -1
- data/lib/tesseract/api.rb +32 -5
- data/lib/tesseract/c/baseapi.rb +12 -0
- data/lib/tesseract/c.rb +28 -1
- data/lib/tesseract/engine.rb +23 -1
- data/lib/tesseract/version.rb +1 -1
- data/test/tesseract_spec.rb +25 -1
- metadata +15 -13
@@ -0,0 +1,112 @@
|
|
1
|
+
T 31 46 39 56 0
|
2
|
+
h 40 46 48 56 0
|
3
|
+
e 49 46 57 53 0
|
4
|
+
( 69 45 74 57 0
|
5
|
+
q 76 44 84 53 0
|
6
|
+
u 85 46 93 53 0
|
7
|
+
i 95 46 101 56 0
|
8
|
+
c 103 46 111 53 0
|
9
|
+
k 113 46 120 56 0
|
10
|
+
) 122 45 127 57 0
|
11
|
+
b 139 46 147 56 0
|
12
|
+
r 148 46 156 53 0
|
13
|
+
o 157 46 165 53 0
|
14
|
+
w 166 46 174 53 0
|
15
|
+
n 175 46 183 53 0
|
16
|
+
{ 195 46 200 56 0
|
17
|
+
f 202 46 210 56 0
|
18
|
+
o 211 46 219 53 0
|
19
|
+
x 220 46 228 53 0
|
20
|
+
} 230 46 235 56 0
|
21
|
+
j 247 44 254 56 0
|
22
|
+
u 256 46 264 53 0
|
23
|
+
m 265 46 273 53 0
|
24
|
+
p 274 44 282 53 0
|
25
|
+
s 283 46 291 53 0
|
26
|
+
! 295 45 297 56 0
|
27
|
+
o 310 46 318 53 0
|
28
|
+
v 319 46 327 53 0
|
29
|
+
e 328 46 336 53 0
|
30
|
+
r 337 46 345 53 0
|
31
|
+
t 355 46 363 55 0
|
32
|
+
h 364 46 372 56 0
|
33
|
+
e 373 46 381 53 0
|
34
|
+
$ 392 45 399 56 0
|
35
|
+
3 400 46 408 56 0
|
36
|
+
, 410 45 415 48 0
|
37
|
+
4 418 46 426 56 0
|
38
|
+
5 427 46 435 56 0
|
39
|
+
6 436 46 444 56 0
|
40
|
+
. 447 45 451 48 0
|
41
|
+
7 454 46 462 56 0
|
42
|
+
8 463 46 471 56 0
|
43
|
+
< 482 46 488 56 0
|
44
|
+
l 492 46 496 56 0
|
45
|
+
a 499 46 507 53 0
|
46
|
+
z 509 46 515 53 0
|
47
|
+
y 517 44 525 53 0
|
48
|
+
> 526 46 532 56 0
|
49
|
+
# 544 47 551 55 0
|
50
|
+
9 553 46 561 56 0
|
51
|
+
0 562 46 570 56 0
|
52
|
+
d 580 46 588 56 0
|
53
|
+
o 589 46 597 53 0
|
54
|
+
g 598 44 606 53 0
|
55
|
+
& 616 46 624 56 0
|
56
|
+
d 634 46 642 56 0
|
57
|
+
u 643 46 651 53 0
|
58
|
+
c 652 46 660 53 0
|
59
|
+
k 662 46 669 56 0
|
60
|
+
/ 670 46 678 56 0
|
61
|
+
g 679 44 687 53 0
|
62
|
+
o 688 46 696 53 0
|
63
|
+
o 697 46 705 53 0
|
64
|
+
s 706 46 714 53 0
|
65
|
+
e 715 46 723 53 0
|
66
|
+
, 725 45 730 48 0
|
67
|
+
a 742 46 750 53 0
|
68
|
+
s 751 46 759 53 0
|
69
|
+
1 770 46 776 56 0
|
70
|
+
2 778 46 786 56 0
|
71
|
+
. 789 45 793 48 0
|
72
|
+
5 796 46 804 56 0
|
73
|
+
% 805 46 813 56 0
|
74
|
+
o 823 46 831 53 0
|
75
|
+
f 832 46 840 56 0
|
76
|
+
E 850 46 857 56 0
|
77
|
+
- 859 50 867 51 0
|
78
|
+
m 868 46 876 53 0
|
79
|
+
a 877 46 885 53 0
|
80
|
+
i 887 46 893 56 0
|
81
|
+
l 897 46 901 56 0
|
82
|
+
f 913 46 921 56 0
|
83
|
+
r 922 46 930 53 0
|
84
|
+
o 931 46 939 53 0
|
85
|
+
m 940 46 948 53 0
|
86
|
+
a 958 46 966 53 0
|
87
|
+
s 967 46 975 53 0
|
88
|
+
p 976 44 984 53 0
|
89
|
+
a 985 46 993 53 0
|
90
|
+
m 994 46 1002 53 0
|
91
|
+
m 1003 46 1011 53 0
|
92
|
+
e 1012 46 1020 53 0
|
93
|
+
r 1021 46 1029 53 0
|
94
|
+
@ 1030 46 1038 56 0
|
95
|
+
w 1039 46 1047 53 0
|
96
|
+
e 1048 46 1056 53 0
|
97
|
+
b 1057 46 1065 56 0
|
98
|
+
s 1066 46 1074 53 0
|
99
|
+
i 1076 46 1082 56 0
|
100
|
+
t 1084 46 1092 55 0
|
101
|
+
e 1093 46 1101 53 0
|
102
|
+
. 1104 45 1108 48 0
|
103
|
+
c 1111 46 1119 53 0
|
104
|
+
o 1120 46 1128 53 0
|
105
|
+
m 1129 46 1137 53 0
|
106
|
+
i 1148 46 1154 56 0
|
107
|
+
s 1156 46 1164 53 0
|
108
|
+
s 1174 46 1182 53 0
|
109
|
+
p 1183 44 1191 53 0
|
110
|
+
a 1192 46 1200 53 0
|
111
|
+
m 1201 46 1209 53 0
|
112
|
+
? 1211 46 1218 56 0
|
Binary file
|
@@ -77,9 +77,10 @@ class Iterator
|
|
77
77
|
pointer = C::Iterator.get_utf8_text(to_ffi, C.for_enum(level))
|
78
78
|
result = pointer.read_string
|
79
79
|
result.force_encoding 'UTF-8'
|
80
|
-
C.free_string(pointer)
|
81
80
|
|
82
81
|
result
|
82
|
+
ensure
|
83
|
+
C.free_array_of_char(pointer)
|
83
84
|
end
|
84
85
|
|
85
86
|
def confidence (level = :word)
|
data/lib/tesseract/api.rb
CHANGED
@@ -65,11 +65,11 @@ class API
|
|
65
65
|
C::BaseAPI.version(to_ffi)
|
66
66
|
end
|
67
67
|
|
68
|
-
def
|
68
|
+
def set_input_name (name)
|
69
69
|
C::BaseAPI.set_input_name(to_ffi, name)
|
70
70
|
end
|
71
71
|
|
72
|
-
def
|
72
|
+
def set_output_name (name)
|
73
73
|
C::BaseAPI.set_output_name(to_ffi, name)
|
74
74
|
end
|
75
75
|
|
@@ -125,6 +125,30 @@ class API
|
|
125
125
|
C::BaseAPI.set_rectangle(to_ffi, left, top, width, height)
|
126
126
|
end
|
127
127
|
|
128
|
+
def process_pages (name)
|
129
|
+
result = C.create_string
|
130
|
+
|
131
|
+
unless C::BaseAPI.process_pages(to_ffi, name, result)
|
132
|
+
raise 'process_pages failed'
|
133
|
+
end
|
134
|
+
|
135
|
+
C.string_content(result).read_string(C.string_length(result))
|
136
|
+
ensure
|
137
|
+
C.destroy_string(result)
|
138
|
+
end
|
139
|
+
|
140
|
+
def process_page (pix, page = 0, name = "")
|
141
|
+
result = C.create_string
|
142
|
+
|
143
|
+
unless C::BaseAPI.process_page(to_ffi, pix.is_a?(Image) ? pix.to_ffi : pix, page, name, result)
|
144
|
+
raise 'process_page failed'
|
145
|
+
end
|
146
|
+
|
147
|
+
C.string_content(result).read_string(C.string_length(result))
|
148
|
+
ensure
|
149
|
+
C.destroy_string(result)
|
150
|
+
end
|
151
|
+
|
128
152
|
def get_iterator
|
129
153
|
Iterator.new(C::BaseAPI.get_iterator(to_ffi))
|
130
154
|
end
|
@@ -133,27 +157,30 @@ class API
|
|
133
157
|
pointer = C::BaseAPI.get_utf8_text(to_ffi)
|
134
158
|
result = pointer.read_string
|
135
159
|
result.force_encoding 'UTF-8'
|
136
|
-
C.free_string(pointer)
|
137
160
|
|
138
161
|
result
|
162
|
+
ensure
|
163
|
+
C.free_array_of_char(pointer)
|
139
164
|
end
|
140
165
|
|
141
166
|
def get_box (page = 0)
|
142
167
|
pointer = C::BaseAPI.get_box_text(to_ffi, page)
|
143
168
|
result = pointer.read_string
|
144
169
|
result.force_encoding 'UTF-8'
|
145
|
-
C.free_string(pointer)
|
146
170
|
|
147
171
|
result
|
172
|
+
ensure
|
173
|
+
C.free_array_of_char(pointer)
|
148
174
|
end
|
149
175
|
|
150
176
|
def get_unlv
|
151
177
|
pointer = C::BaseAPI.get_unlv_text(to_ffi)
|
152
178
|
result = pointer.read_string
|
153
179
|
result.force_encoding 'ISO8859-1'
|
154
|
-
C.free_string(pointer)
|
155
180
|
|
156
181
|
result
|
182
|
+
ensure
|
183
|
+
C.free_array_of_char(pointer)
|
157
184
|
end
|
158
185
|
|
159
186
|
def mean_text_confidence
|
data/lib/tesseract/c/baseapi.rb
CHANGED
@@ -184,6 +184,18 @@ module BaseAPI
|
|
184
184
|
}
|
185
185
|
}
|
186
186
|
|
187
|
+
cpp.function %{
|
188
|
+
bool process_pages (TessBaseAPI* api, const char* filename, STRING* output) {
|
189
|
+
return api->ProcessPages(filename, NULL, 0, output);
|
190
|
+
}
|
191
|
+
}
|
192
|
+
|
193
|
+
cpp.function %{
|
194
|
+
bool process_page (TessBaseAPI* api, Pix* pix, int page_index, const char* filename, STRING* output) {
|
195
|
+
return api->ProcessPage(pix, page_index, filename, NULL, 0, output);
|
196
|
+
}
|
197
|
+
}
|
198
|
+
|
187
199
|
cpp.function %{
|
188
200
|
ResultIterator* get_iterator (TessBaseAPI* api) {
|
189
201
|
return api->GetIterator();
|
data/lib/tesseract/c.rb
CHANGED
@@ -32,8 +32,11 @@ module C
|
|
32
32
|
extend FFI::Inliner
|
33
33
|
|
34
34
|
inline 'C++' do |cpp|
|
35
|
+
cpp.include 'tesseract/strngs.h'
|
36
|
+
cpp.libraries 'tesseract'
|
37
|
+
|
35
38
|
cpp.function %{
|
36
|
-
void
|
39
|
+
void free_array_of_char (char* pointer) {
|
37
40
|
delete [] pointer;
|
38
41
|
}
|
39
42
|
}
|
@@ -43,6 +46,30 @@ module C
|
|
43
46
|
delete [] pointer;
|
44
47
|
}
|
45
48
|
}
|
49
|
+
|
50
|
+
cpp.function %{
|
51
|
+
STRING* create_string (void) {
|
52
|
+
return new STRING();
|
53
|
+
}
|
54
|
+
}
|
55
|
+
|
56
|
+
cpp.function %{
|
57
|
+
void destroy_string (STRING* value) {
|
58
|
+
delete value;
|
59
|
+
}
|
60
|
+
}
|
61
|
+
|
62
|
+
cpp.function %{
|
63
|
+
int string_length (STRING* value) {
|
64
|
+
return value->length();
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
cpp.function %{
|
69
|
+
const char* string_content (STRING* value) {
|
70
|
+
return value->string();
|
71
|
+
}
|
72
|
+
}
|
46
73
|
end
|
47
74
|
|
48
75
|
def self.for_enum (what)
|
data/lib/tesseract/engine.rb
CHANGED
@@ -75,6 +75,14 @@ class Engine
|
|
75
75
|
}
|
76
76
|
end
|
77
77
|
|
78
|
+
def input= (name)
|
79
|
+
@api.set_input_name(name)
|
80
|
+
end
|
81
|
+
|
82
|
+
def output= (name)
|
83
|
+
@api.set_output_name(name)
|
84
|
+
end
|
85
|
+
|
78
86
|
def set (name, value)
|
79
87
|
@variables[name] = value
|
80
88
|
|
@@ -118,7 +126,9 @@ class Engine
|
|
118
126
|
end
|
119
127
|
|
120
128
|
def page_segmentation_mode= (value)
|
121
|
-
@
|
129
|
+
@psm = C.for_enum(value)
|
130
|
+
|
131
|
+
@api.set_page_seg_mode @psm
|
122
132
|
end
|
123
133
|
|
124
134
|
def image= (image)
|
@@ -190,6 +200,16 @@ class Engine
|
|
190
200
|
end
|
191
201
|
}
|
192
202
|
|
203
|
+
def process (image, page = nil)
|
204
|
+
if page
|
205
|
+
@api.process_page(API.image_for(image), page)
|
206
|
+
else
|
207
|
+
raise ArgumentError, 'the path does not exist' unless File.exists?(image)
|
208
|
+
|
209
|
+
@api.process_pages(image)
|
210
|
+
end
|
211
|
+
end
|
212
|
+
|
193
213
|
protected
|
194
214
|
def _init
|
195
215
|
@api.end
|
@@ -203,6 +223,8 @@ protected
|
|
203
223
|
@config.each {|conf|
|
204
224
|
@api.read_config_file(conf)
|
205
225
|
}
|
226
|
+
|
227
|
+
@api.set_page_seg_mode @psm if @psm
|
206
228
|
end
|
207
229
|
|
208
230
|
def _setup (image = nil, x = nil, y = nil, width = nil, height = nil)
|
data/lib/tesseract/version.rb
CHANGED
data/test/tesseract_spec.rb
CHANGED
@@ -78,9 +78,33 @@ describe Tesseract::Engine do
|
|
78
78
|
end
|
79
79
|
end
|
80
80
|
|
81
|
-
describe '#
|
81
|
+
describe '#blocks' do
|
82
82
|
it 'works properly with first image' do
|
83
|
+
engine.blocks_for('first.png').first.to_s.should == "ABC\n"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
describe '#paragraphs' do
|
88
|
+
it 'works properly with first image' do
|
89
|
+
engine.paragraphs_for('first.png').first.to_s.should == "ABC\n"
|
90
|
+
end
|
91
|
+
end
|
83
92
|
|
93
|
+
describe '#lines' do
|
94
|
+
it 'works properly with first image' do
|
95
|
+
engine.lines_for('first.png').first.to_s.should == "ABC\n"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
describe '#words' do
|
100
|
+
it 'works properly with first image' do
|
101
|
+
engine.words_for('first.png').first.to_s.should == 'ABC'
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
describe '#symbols' do
|
106
|
+
it 'works properly with first image' do
|
107
|
+
engine.symbols_for('first.png').first.to_s.should == 'A'
|
84
108
|
end
|
85
109
|
end
|
86
110
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tesseract-ocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-11-
|
12
|
+
date: 2011-11-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: namedic
|
16
|
-
requirement: &
|
16
|
+
requirement: &12863500 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *12863500
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: memoized
|
27
|
-
requirement: &
|
27
|
+
requirement: &12861460 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *12861460
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: iso-639
|
38
|
-
requirement: &
|
38
|
+
requirement: &12888100 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *12888100
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: ffi-extra
|
49
|
-
requirement: &
|
49
|
+
requirement: &12886020 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *12886020
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: ffi-inliner
|
60
|
-
requirement: &
|
60
|
+
requirement: &12882760 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *12882760
|
69
69
|
description:
|
70
70
|
email: meh@paranoici.org
|
71
71
|
executables:
|
@@ -76,7 +76,9 @@ files:
|
|
76
76
|
- README.md
|
77
77
|
- Rakefile
|
78
78
|
- bin/tesseract.rb
|
79
|
-
- examples/nerdz-captcha-breaker.rb
|
79
|
+
- examples/nerdz-captcha-breaker/break.rb
|
80
|
+
- examples/nerdz-captcha-breaker/lol.gd-giant.exp.box
|
81
|
+
- examples/nerdz-captcha-breaker/lol.gd-giant.exp.tif
|
80
82
|
- lib/tesseract-ocr.rb
|
81
83
|
- lib/tesseract.rb
|
82
84
|
- lib/tesseract/api.rb
|