tesseract-ocr 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/examples/{nerdz-captcha-breaker.rb → nerdz-captcha-breaker/break.rb} +2 -0
- data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.box +112 -0
- data/examples/nerdz-captcha-breaker/lol.gd-giant.exp.tif +0 -0
- data/lib/tesseract/api/iterator.rb +2 -1
- data/lib/tesseract/api.rb +32 -5
- data/lib/tesseract/c/baseapi.rb +12 -0
- data/lib/tesseract/c.rb +28 -1
- data/lib/tesseract/engine.rb +23 -1
- data/lib/tesseract/version.rb +1 -1
- data/test/tesseract_spec.rb +25 -1
- metadata +15 -13
@@ -0,0 +1,112 @@
|
|
1
|
+
T 31 46 39 56 0
|
2
|
+
h 40 46 48 56 0
|
3
|
+
e 49 46 57 53 0
|
4
|
+
( 69 45 74 57 0
|
5
|
+
q 76 44 84 53 0
|
6
|
+
u 85 46 93 53 0
|
7
|
+
i 95 46 101 56 0
|
8
|
+
c 103 46 111 53 0
|
9
|
+
k 113 46 120 56 0
|
10
|
+
) 122 45 127 57 0
|
11
|
+
b 139 46 147 56 0
|
12
|
+
r 148 46 156 53 0
|
13
|
+
o 157 46 165 53 0
|
14
|
+
w 166 46 174 53 0
|
15
|
+
n 175 46 183 53 0
|
16
|
+
{ 195 46 200 56 0
|
17
|
+
f 202 46 210 56 0
|
18
|
+
o 211 46 219 53 0
|
19
|
+
x 220 46 228 53 0
|
20
|
+
} 230 46 235 56 0
|
21
|
+
j 247 44 254 56 0
|
22
|
+
u 256 46 264 53 0
|
23
|
+
m 265 46 273 53 0
|
24
|
+
p 274 44 282 53 0
|
25
|
+
s 283 46 291 53 0
|
26
|
+
! 295 45 297 56 0
|
27
|
+
o 310 46 318 53 0
|
28
|
+
v 319 46 327 53 0
|
29
|
+
e 328 46 336 53 0
|
30
|
+
r 337 46 345 53 0
|
31
|
+
t 355 46 363 55 0
|
32
|
+
h 364 46 372 56 0
|
33
|
+
e 373 46 381 53 0
|
34
|
+
$ 392 45 399 56 0
|
35
|
+
3 400 46 408 56 0
|
36
|
+
, 410 45 415 48 0
|
37
|
+
4 418 46 426 56 0
|
38
|
+
5 427 46 435 56 0
|
39
|
+
6 436 46 444 56 0
|
40
|
+
. 447 45 451 48 0
|
41
|
+
7 454 46 462 56 0
|
42
|
+
8 463 46 471 56 0
|
43
|
+
< 482 46 488 56 0
|
44
|
+
l 492 46 496 56 0
|
45
|
+
a 499 46 507 53 0
|
46
|
+
z 509 46 515 53 0
|
47
|
+
y 517 44 525 53 0
|
48
|
+
> 526 46 532 56 0
|
49
|
+
# 544 47 551 55 0
|
50
|
+
9 553 46 561 56 0
|
51
|
+
0 562 46 570 56 0
|
52
|
+
d 580 46 588 56 0
|
53
|
+
o 589 46 597 53 0
|
54
|
+
g 598 44 606 53 0
|
55
|
+
& 616 46 624 56 0
|
56
|
+
d 634 46 642 56 0
|
57
|
+
u 643 46 651 53 0
|
58
|
+
c 652 46 660 53 0
|
59
|
+
k 662 46 669 56 0
|
60
|
+
/ 670 46 678 56 0
|
61
|
+
g 679 44 687 53 0
|
62
|
+
o 688 46 696 53 0
|
63
|
+
o 697 46 705 53 0
|
64
|
+
s 706 46 714 53 0
|
65
|
+
e 715 46 723 53 0
|
66
|
+
, 725 45 730 48 0
|
67
|
+
a 742 46 750 53 0
|
68
|
+
s 751 46 759 53 0
|
69
|
+
1 770 46 776 56 0
|
70
|
+
2 778 46 786 56 0
|
71
|
+
. 789 45 793 48 0
|
72
|
+
5 796 46 804 56 0
|
73
|
+
% 805 46 813 56 0
|
74
|
+
o 823 46 831 53 0
|
75
|
+
f 832 46 840 56 0
|
76
|
+
E 850 46 857 56 0
|
77
|
+
- 859 50 867 51 0
|
78
|
+
m 868 46 876 53 0
|
79
|
+
a 877 46 885 53 0
|
80
|
+
i 887 46 893 56 0
|
81
|
+
l 897 46 901 56 0
|
82
|
+
f 913 46 921 56 0
|
83
|
+
r 922 46 930 53 0
|
84
|
+
o 931 46 939 53 0
|
85
|
+
m 940 46 948 53 0
|
86
|
+
a 958 46 966 53 0
|
87
|
+
s 967 46 975 53 0
|
88
|
+
p 976 44 984 53 0
|
89
|
+
a 985 46 993 53 0
|
90
|
+
m 994 46 1002 53 0
|
91
|
+
m 1003 46 1011 53 0
|
92
|
+
e 1012 46 1020 53 0
|
93
|
+
r 1021 46 1029 53 0
|
94
|
+
@ 1030 46 1038 56 0
|
95
|
+
w 1039 46 1047 53 0
|
96
|
+
e 1048 46 1056 53 0
|
97
|
+
b 1057 46 1065 56 0
|
98
|
+
s 1066 46 1074 53 0
|
99
|
+
i 1076 46 1082 56 0
|
100
|
+
t 1084 46 1092 55 0
|
101
|
+
e 1093 46 1101 53 0
|
102
|
+
. 1104 45 1108 48 0
|
103
|
+
c 1111 46 1119 53 0
|
104
|
+
o 1120 46 1128 53 0
|
105
|
+
m 1129 46 1137 53 0
|
106
|
+
i 1148 46 1154 56 0
|
107
|
+
s 1156 46 1164 53 0
|
108
|
+
s 1174 46 1182 53 0
|
109
|
+
p 1183 44 1191 53 0
|
110
|
+
a 1192 46 1200 53 0
|
111
|
+
m 1201 46 1209 53 0
|
112
|
+
? 1211 46 1218 56 0
|
Binary file
|
@@ -77,9 +77,10 @@ class Iterator
|
|
77
77
|
pointer = C::Iterator.get_utf8_text(to_ffi, C.for_enum(level))
|
78
78
|
result = pointer.read_string
|
79
79
|
result.force_encoding 'UTF-8'
|
80
|
-
C.free_string(pointer)
|
81
80
|
|
82
81
|
result
|
82
|
+
ensure
|
83
|
+
C.free_array_of_char(pointer)
|
83
84
|
end
|
84
85
|
|
85
86
|
def confidence (level = :word)
|
data/lib/tesseract/api.rb
CHANGED
@@ -65,11 +65,11 @@ class API
|
|
65
65
|
C::BaseAPI.version(to_ffi)
|
66
66
|
end
|
67
67
|
|
68
|
-
def
|
68
|
+
def set_input_name (name)
|
69
69
|
C::BaseAPI.set_input_name(to_ffi, name)
|
70
70
|
end
|
71
71
|
|
72
|
-
def
|
72
|
+
def set_output_name (name)
|
73
73
|
C::BaseAPI.set_output_name(to_ffi, name)
|
74
74
|
end
|
75
75
|
|
@@ -125,6 +125,30 @@ class API
|
|
125
125
|
C::BaseAPI.set_rectangle(to_ffi, left, top, width, height)
|
126
126
|
end
|
127
127
|
|
128
|
+
def process_pages (name)
|
129
|
+
result = C.create_string
|
130
|
+
|
131
|
+
unless C::BaseAPI.process_pages(to_ffi, name, result)
|
132
|
+
raise 'process_pages failed'
|
133
|
+
end
|
134
|
+
|
135
|
+
C.string_content(result).read_string(C.string_length(result))
|
136
|
+
ensure
|
137
|
+
C.destroy_string(result)
|
138
|
+
end
|
139
|
+
|
140
|
+
def process_page (pix, page = 0, name = "")
|
141
|
+
result = C.create_string
|
142
|
+
|
143
|
+
unless C::BaseAPI.process_page(to_ffi, pix.is_a?(Image) ? pix.to_ffi : pix, page, name, result)
|
144
|
+
raise 'process_page failed'
|
145
|
+
end
|
146
|
+
|
147
|
+
C.string_content(result).read_string(C.string_length(result))
|
148
|
+
ensure
|
149
|
+
C.destroy_string(result)
|
150
|
+
end
|
151
|
+
|
128
152
|
def get_iterator
|
129
153
|
Iterator.new(C::BaseAPI.get_iterator(to_ffi))
|
130
154
|
end
|
@@ -133,27 +157,30 @@ class API
|
|
133
157
|
pointer = C::BaseAPI.get_utf8_text(to_ffi)
|
134
158
|
result = pointer.read_string
|
135
159
|
result.force_encoding 'UTF-8'
|
136
|
-
C.free_string(pointer)
|
137
160
|
|
138
161
|
result
|
162
|
+
ensure
|
163
|
+
C.free_array_of_char(pointer)
|
139
164
|
end
|
140
165
|
|
141
166
|
def get_box (page = 0)
|
142
167
|
pointer = C::BaseAPI.get_box_text(to_ffi, page)
|
143
168
|
result = pointer.read_string
|
144
169
|
result.force_encoding 'UTF-8'
|
145
|
-
C.free_string(pointer)
|
146
170
|
|
147
171
|
result
|
172
|
+
ensure
|
173
|
+
C.free_array_of_char(pointer)
|
148
174
|
end
|
149
175
|
|
150
176
|
def get_unlv
|
151
177
|
pointer = C::BaseAPI.get_unlv_text(to_ffi)
|
152
178
|
result = pointer.read_string
|
153
179
|
result.force_encoding 'ISO8859-1'
|
154
|
-
C.free_string(pointer)
|
155
180
|
|
156
181
|
result
|
182
|
+
ensure
|
183
|
+
C.free_array_of_char(pointer)
|
157
184
|
end
|
158
185
|
|
159
186
|
def mean_text_confidence
|
data/lib/tesseract/c/baseapi.rb
CHANGED
@@ -184,6 +184,18 @@ module BaseAPI
|
|
184
184
|
}
|
185
185
|
}
|
186
186
|
|
187
|
+
cpp.function %{
|
188
|
+
bool process_pages (TessBaseAPI* api, const char* filename, STRING* output) {
|
189
|
+
return api->ProcessPages(filename, NULL, 0, output);
|
190
|
+
}
|
191
|
+
}
|
192
|
+
|
193
|
+
cpp.function %{
|
194
|
+
bool process_page (TessBaseAPI* api, Pix* pix, int page_index, const char* filename, STRING* output) {
|
195
|
+
return api->ProcessPage(pix, page_index, filename, NULL, 0, output);
|
196
|
+
}
|
197
|
+
}
|
198
|
+
|
187
199
|
cpp.function %{
|
188
200
|
ResultIterator* get_iterator (TessBaseAPI* api) {
|
189
201
|
return api->GetIterator();
|
data/lib/tesseract/c.rb
CHANGED
@@ -32,8 +32,11 @@ module C
|
|
32
32
|
extend FFI::Inliner
|
33
33
|
|
34
34
|
inline 'C++' do |cpp|
|
35
|
+
cpp.include 'tesseract/strngs.h'
|
36
|
+
cpp.libraries 'tesseract'
|
37
|
+
|
35
38
|
cpp.function %{
|
36
|
-
void
|
39
|
+
void free_array_of_char (char* pointer) {
|
37
40
|
delete [] pointer;
|
38
41
|
}
|
39
42
|
}
|
@@ -43,6 +46,30 @@ module C
|
|
43
46
|
delete [] pointer;
|
44
47
|
}
|
45
48
|
}
|
49
|
+
|
50
|
+
cpp.function %{
|
51
|
+
STRING* create_string (void) {
|
52
|
+
return new STRING();
|
53
|
+
}
|
54
|
+
}
|
55
|
+
|
56
|
+
cpp.function %{
|
57
|
+
void destroy_string (STRING* value) {
|
58
|
+
delete value;
|
59
|
+
}
|
60
|
+
}
|
61
|
+
|
62
|
+
cpp.function %{
|
63
|
+
int string_length (STRING* value) {
|
64
|
+
return value->length();
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
cpp.function %{
|
69
|
+
const char* string_content (STRING* value) {
|
70
|
+
return value->string();
|
71
|
+
}
|
72
|
+
}
|
46
73
|
end
|
47
74
|
|
48
75
|
def self.for_enum (what)
|
data/lib/tesseract/engine.rb
CHANGED
@@ -75,6 +75,14 @@ class Engine
|
|
75
75
|
}
|
76
76
|
end
|
77
77
|
|
78
|
+
def input= (name)
|
79
|
+
@api.set_input_name(name)
|
80
|
+
end
|
81
|
+
|
82
|
+
def output= (name)
|
83
|
+
@api.set_output_name(name)
|
84
|
+
end
|
85
|
+
|
78
86
|
def set (name, value)
|
79
87
|
@variables[name] = value
|
80
88
|
|
@@ -118,7 +126,9 @@ class Engine
|
|
118
126
|
end
|
119
127
|
|
120
128
|
def page_segmentation_mode= (value)
|
121
|
-
@
|
129
|
+
@psm = C.for_enum(value)
|
130
|
+
|
131
|
+
@api.set_page_seg_mode @psm
|
122
132
|
end
|
123
133
|
|
124
134
|
def image= (image)
|
@@ -190,6 +200,16 @@ class Engine
|
|
190
200
|
end
|
191
201
|
}
|
192
202
|
|
203
|
+
def process (image, page = nil)
|
204
|
+
if page
|
205
|
+
@api.process_page(API.image_for(image), page)
|
206
|
+
else
|
207
|
+
raise ArgumentError, 'the path does not exist' unless File.exists?(image)
|
208
|
+
|
209
|
+
@api.process_pages(image)
|
210
|
+
end
|
211
|
+
end
|
212
|
+
|
193
213
|
protected
|
194
214
|
def _init
|
195
215
|
@api.end
|
@@ -203,6 +223,8 @@ protected
|
|
203
223
|
@config.each {|conf|
|
204
224
|
@api.read_config_file(conf)
|
205
225
|
}
|
226
|
+
|
227
|
+
@api.set_page_seg_mode @psm if @psm
|
206
228
|
end
|
207
229
|
|
208
230
|
def _setup (image = nil, x = nil, y = nil, width = nil, height = nil)
|
data/lib/tesseract/version.rb
CHANGED
data/test/tesseract_spec.rb
CHANGED
@@ -78,9 +78,33 @@ describe Tesseract::Engine do
|
|
78
78
|
end
|
79
79
|
end
|
80
80
|
|
81
|
-
describe '#
|
81
|
+
describe '#blocks' do
|
82
82
|
it 'works properly with first image' do
|
83
|
+
engine.blocks_for('first.png').first.to_s.should == "ABC\n"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
describe '#paragraphs' do
|
88
|
+
it 'works properly with first image' do
|
89
|
+
engine.paragraphs_for('first.png').first.to_s.should == "ABC\n"
|
90
|
+
end
|
91
|
+
end
|
83
92
|
|
93
|
+
describe '#lines' do
|
94
|
+
it 'works properly with first image' do
|
95
|
+
engine.lines_for('first.png').first.to_s.should == "ABC\n"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
describe '#words' do
|
100
|
+
it 'works properly with first image' do
|
101
|
+
engine.words_for('first.png').first.to_s.should == 'ABC'
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
describe '#symbols' do
|
106
|
+
it 'works properly with first image' do
|
107
|
+
engine.symbols_for('first.png').first.to_s.should == 'A'
|
84
108
|
end
|
85
109
|
end
|
86
110
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tesseract-ocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-11-
|
12
|
+
date: 2011-11-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: namedic
|
16
|
-
requirement: &
|
16
|
+
requirement: &12863500 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *12863500
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: memoized
|
27
|
-
requirement: &
|
27
|
+
requirement: &12861460 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *12861460
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: iso-639
|
38
|
-
requirement: &
|
38
|
+
requirement: &12888100 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *12888100
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: ffi-extra
|
49
|
-
requirement: &
|
49
|
+
requirement: &12886020 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *12886020
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: ffi-inliner
|
60
|
-
requirement: &
|
60
|
+
requirement: &12882760 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *12882760
|
69
69
|
description:
|
70
70
|
email: meh@paranoici.org
|
71
71
|
executables:
|
@@ -76,7 +76,9 @@ files:
|
|
76
76
|
- README.md
|
77
77
|
- Rakefile
|
78
78
|
- bin/tesseract.rb
|
79
|
-
- examples/nerdz-captcha-breaker.rb
|
79
|
+
- examples/nerdz-captcha-breaker/break.rb
|
80
|
+
- examples/nerdz-captcha-breaker/lol.gd-giant.exp.box
|
81
|
+
- examples/nerdz-captcha-breaker/lol.gd-giant.exp.tif
|
80
82
|
- lib/tesseract-ocr.rb
|
81
83
|
- lib/tesseract.rb
|
82
84
|
- lib/tesseract/api.rb
|