tesseract-ocr 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -51,6 +51,8 @@ Tesseract::Engine.new.tap {|engine|
51
51
  image.pixel_color x, y, 'black'
52
52
  }
53
53
 
54
+ File.open('/tmp/lol.png', ?w) { |f| f.write(image.resize(10).to_blob) }
55
+
54
56
  puts engine.text_for(image.resize 10).strip
55
57
  }
56
58
  }
@@ -0,0 +1,112 @@
1
+ T 31 46 39 56 0
2
+ h 40 46 48 56 0
3
+ e 49 46 57 53 0
4
+ ( 69 45 74 57 0
5
+ q 76 44 84 53 0
6
+ u 85 46 93 53 0
7
+ i 95 46 101 56 0
8
+ c 103 46 111 53 0
9
+ k 113 46 120 56 0
10
+ ) 122 45 127 57 0
11
+ b 139 46 147 56 0
12
+ r 148 46 156 53 0
13
+ o 157 46 165 53 0
14
+ w 166 46 174 53 0
15
+ n 175 46 183 53 0
16
+ { 195 46 200 56 0
17
+ f 202 46 210 56 0
18
+ o 211 46 219 53 0
19
+ x 220 46 228 53 0
20
+ } 230 46 235 56 0
21
+ j 247 44 254 56 0
22
+ u 256 46 264 53 0
23
+ m 265 46 273 53 0
24
+ p 274 44 282 53 0
25
+ s 283 46 291 53 0
26
+ ! 295 45 297 56 0
27
+ o 310 46 318 53 0
28
+ v 319 46 327 53 0
29
+ e 328 46 336 53 0
30
+ r 337 46 345 53 0
31
+ t 355 46 363 55 0
32
+ h 364 46 372 56 0
33
+ e 373 46 381 53 0
34
+ $ 392 45 399 56 0
35
+ 3 400 46 408 56 0
36
+ , 410 45 415 48 0
37
+ 4 418 46 426 56 0
38
+ 5 427 46 435 56 0
39
+ 6 436 46 444 56 0
40
+ . 447 45 451 48 0
41
+ 7 454 46 462 56 0
42
+ 8 463 46 471 56 0
43
+ < 482 46 488 56 0
44
+ l 492 46 496 56 0
45
+ a 499 46 507 53 0
46
+ z 509 46 515 53 0
47
+ y 517 44 525 53 0
48
+ > 526 46 532 56 0
49
+ # 544 47 551 55 0
50
+ 9 553 46 561 56 0
51
+ 0 562 46 570 56 0
52
+ d 580 46 588 56 0
53
+ o 589 46 597 53 0
54
+ g 598 44 606 53 0
55
+ & 616 46 624 56 0
56
+ d 634 46 642 56 0
57
+ u 643 46 651 53 0
58
+ c 652 46 660 53 0
59
+ k 662 46 669 56 0
60
+ / 670 46 678 56 0
61
+ g 679 44 687 53 0
62
+ o 688 46 696 53 0
63
+ o 697 46 705 53 0
64
+ s 706 46 714 53 0
65
+ e 715 46 723 53 0
66
+ , 725 45 730 48 0
67
+ a 742 46 750 53 0
68
+ s 751 46 759 53 0
69
+ 1 770 46 776 56 0
70
+ 2 778 46 786 56 0
71
+ . 789 45 793 48 0
72
+ 5 796 46 804 56 0
73
+ % 805 46 813 56 0
74
+ o 823 46 831 53 0
75
+ f 832 46 840 56 0
76
+ E 850 46 857 56 0
77
+ - 859 50 867 51 0
78
+ m 868 46 876 53 0
79
+ a 877 46 885 53 0
80
+ i 887 46 893 56 0
81
+ l 897 46 901 56 0
82
+ f 913 46 921 56 0
83
+ r 922 46 930 53 0
84
+ o 931 46 939 53 0
85
+ m 940 46 948 53 0
86
+ a 958 46 966 53 0
87
+ s 967 46 975 53 0
88
+ p 976 44 984 53 0
89
+ a 985 46 993 53 0
90
+ m 994 46 1002 53 0
91
+ m 1003 46 1011 53 0
92
+ e 1012 46 1020 53 0
93
+ r 1021 46 1029 53 0
94
+ @ 1030 46 1038 56 0
95
+ w 1039 46 1047 53 0
96
+ e 1048 46 1056 53 0
97
+ b 1057 46 1065 56 0
98
+ s 1066 46 1074 53 0
99
+ i 1076 46 1082 56 0
100
+ t 1084 46 1092 55 0
101
+ e 1093 46 1101 53 0
102
+ . 1104 45 1108 48 0
103
+ c 1111 46 1119 53 0
104
+ o 1120 46 1128 53 0
105
+ m 1129 46 1137 53 0
106
+ i 1148 46 1154 56 0
107
+ s 1156 46 1164 53 0
108
+ s 1174 46 1182 53 0
109
+ p 1183 44 1191 53 0
110
+ a 1192 46 1200 53 0
111
+ m 1201 46 1209 53 0
112
+ ? 1211 46 1218 56 0
@@ -77,9 +77,10 @@ class Iterator
77
77
  pointer = C::Iterator.get_utf8_text(to_ffi, C.for_enum(level))
78
78
  result = pointer.read_string
79
79
  result.force_encoding 'UTF-8'
80
- C.free_string(pointer)
81
80
 
82
81
  result
82
+ ensure
83
+ C.free_array_of_char(pointer)
83
84
  end
84
85
 
85
86
  def confidence (level = :word)
data/lib/tesseract/api.rb CHANGED
@@ -65,11 +65,11 @@ class API
65
65
  C::BaseAPI.version(to_ffi)
66
66
  end
67
67
 
# Passes +name+ to C::BaseAPI.set_input_name for this API handle (to_ffi).
# This revision renames the method from the writer `input_name=` to `set_input_name`;
# the diff shows no alias, so code still calling `api.input_name = ...` would break.
68
- def input_name= (name)
68
+ def set_input_name (name)
69
69
  C::BaseAPI.set_input_name(to_ffi, name)
70
70
  end
71
71
 
# Passes +name+ to C::BaseAPI.set_output_name for this API handle (to_ffi).
# This revision renames the method from the writer `output_name=` to `set_output_name`;
# the diff shows no alias, so code still calling `api.output_name = ...` would break.
72
- def output_name= (name)
72
+ def set_output_name (name)
73
73
  C::BaseAPI.set_output_name(to_ffi, name)
74
74
  end
75
75
 
@@ -125,6 +125,30 @@ class API
125
125
  C::BaseAPI.set_rectangle(to_ffi, left, top, width, height)
126
126
  end
127
127
 
# Runs the native process_pages wrapper on +name+ and returns the text it produced.
# A temporary native STRING (C.create_string) collects the output; exactly
# C.string_length(result) bytes are copied out of it before it is destroyed.
# Raises RuntimeError ('process_pages failed') when the native call returns a falsy value.
# NOTE(review): unlike get_box/get_unlv in this class, no force_encoding is applied
# to the returned string -- confirm whether that is intentional.
128
+ def process_pages (name)
129
+ result = C.create_string
130
+
131
+ unless C::BaseAPI.process_pages(to_ffi, name, result)
132
+ raise 'process_pages failed'
133
+ end
134
+
# Copy the bytes out now: the ensure clause below destroys the native STRING.
135
+ C.string_content(result).read_string(C.string_length(result))
# Runs on both the success path and the raise path.
136
+ ensure
137
+ C.destroy_string(result)
138
+ end
139
+
# Runs the native process_page wrapper for one page and returns the text it produced.
#   pix  - an Image (its #to_ffi handle is passed) or any other value, passed through as-is
#   page - page index handed to the native call (default 0)
#   name - file name handed to the native call (default "")
# Raises RuntimeError ('process_page failed') when the native call returns a falsy value.
# NOTE(review): the returned string gets no force_encoding, unlike get_box/get_unlv
# in this class -- confirm whether that is intentional.
140
+ def process_page (pix, page = 0, name = "")
141
+ result = C.create_string
142
+
143
+ unless C::BaseAPI.process_page(to_ffi, pix.is_a?(Image) ? pix.to_ffi : pix, page, name, result)
144
+ raise 'process_page failed'
145
+ end
146
+
# Copy the bytes out now: the ensure clause below destroys the native STRING.
147
+ C.string_content(result).read_string(C.string_length(result))
# Runs on both the success path and the raise path.
148
+ ensure
149
+ C.destroy_string(result)
150
+ end
151
+
128
152
  def get_iterator
129
153
  Iterator.new(C::BaseAPI.get_iterator(to_ffi))
130
154
  end
@@ -133,27 +157,30 @@ class API
133
157
  pointer = C::BaseAPI.get_utf8_text(to_ffi)
134
158
  result = pointer.read_string
135
159
  result.force_encoding 'UTF-8'
136
- C.free_string(pointer)
137
160
 
138
161
  result
162
+ ensure
163
+ C.free_array_of_char(pointer)
139
164
  end
140
165
 
# Returns the string produced by C::BaseAPI.get_box_text for +page+ (default 0),
# tagged as UTF-8.
# This revision moves the release of the native buffer into an ensure clause and
# switches the call from C.free_string to C.free_array_of_char.
# NOTE(review): if get_box_text itself raises, +pointer+ is still nil when the
# ensure clause runs -- confirm free_array_of_char tolerates that.
141
166
  def get_box (page = 0)
142
167
  pointer = C::BaseAPI.get_box_text(to_ffi, page)
143
168
  result = pointer.read_string
144
169
  result.force_encoding 'UTF-8'
145
- C.free_string(pointer)
146
170
 
147
171
  result
172
+ ensure
173
+ C.free_array_of_char(pointer)
148
174
  end
149
175
 
# Returns the string produced by C::BaseAPI.get_unlv_text, tagged as ISO8859-1
# (the sibling get_box tags its result as UTF-8 instead).
# This revision moves the release of the native buffer into an ensure clause and
# switches the call from C.free_string to C.free_array_of_char.
# NOTE(review): if get_unlv_text itself raises, +pointer+ is still nil when the
# ensure clause runs -- confirm free_array_of_char tolerates that.
150
176
  def get_unlv
151
177
  pointer = C::BaseAPI.get_unlv_text(to_ffi)
152
178
  result = pointer.read_string
153
179
  result.force_encoding 'ISO8859-1'
154
- C.free_string(pointer)
155
180
 
156
181
  result
182
+ ensure
183
+ C.free_array_of_char(pointer)
157
184
  end
158
185
 
159
186
  def mean_text_confidence
@@ -184,6 +184,18 @@ module BaseAPI
184
184
  }
185
185
  }
186
186
 
# Inline C++ shim exposed as C::BaseAPI.process_pages: forwards to
# TessBaseAPI::ProcessPages and returns its bool result; output is written into
# the caller-supplied STRING.
# NOTE(review): the hard-coded NULL and 0 are presumably ProcessPages' retry-config
# and timeout arguments -- confirm against the Tesseract baseapi.h in use.
187
+ cpp.function %{
188
+ bool process_pages (TessBaseAPI* api, const char* filename, STRING* output) {
189
+ return api->ProcessPages(filename, NULL, 0, output);
190
+ }
191
+ }
192
+
# Inline C++ shim exposed as C::BaseAPI.process_page: forwards pix, page_index and
# filename to TessBaseAPI::ProcessPage and returns its bool result; output is
# written into the caller-supplied STRING.
# NOTE(review): the hard-coded NULL and 0 are presumably ProcessPage's retry-config
# and timeout arguments -- confirm against the Tesseract baseapi.h in use.
193
+ cpp.function %{
194
+ bool process_page (TessBaseAPI* api, Pix* pix, int page_index, const char* filename, STRING* output) {
195
+ return api->ProcessPage(pix, page_index, filename, NULL, 0, output);
196
+ }
197
+ }
198
+
187
199
  cpp.function %{
188
200
  ResultIterator* get_iterator (TessBaseAPI* api) {
189
201
  return api->GetIterator();
data/lib/tesseract/c.rb CHANGED
@@ -32,8 +32,11 @@ module C
32
32
  extend FFI::Inliner
33
33
 
34
34
  inline 'C++' do |cpp|
35
+ cpp.include 'tesseract/strngs.h'
36
+ cpp.libraries 'tesseract'
37
+
# Inline C++ helper that releases a char buffer with `delete []`.
# Renamed in this revision from free_string to free_array_of_char; the body is unchanged.
35
38
  cpp.function %{
36
- void free_string (char* pointer) {
39
+ void free_array_of_char (char* pointer) {
37
40
  delete [] pointer;
38
41
  }
39
42
  }
@@ -43,6 +46,30 @@ module C
43
46
  delete [] pointer;
44
47
  }
45
48
  }
49
+
# Inline C++ helper: heap-allocates a default-constructed STRING (declared in
# tesseract/strngs.h, included by this inline block) and returns the pointer.
# The allocation uses `new`, so it is expected to be released via destroy_string.
50
+ cpp.function %{
51
+ STRING* create_string (void) {
52
+ return new STRING();
53
+ }
54
+ }
55
+
# Inline C++ helper: releases a STRING with `delete`; the counterpart of create_string.
56
+ cpp.function %{
57
+ void destroy_string (STRING* value) {
58
+ delete value;
59
+ }
60
+ }
61
+
# Inline C++ helper: returns value->length() as an int. The Ruby callers in api.rb
# use it as the byte count for read_string.
62
+ cpp.function %{
63
+ int string_length (STRING* value) {
64
+ return value->length();
65
+ }
66
+ }
67
+
# Inline C++ helper: returns value->string(), a const char* to the STRING's text.
# NOTE(review): the pointer presumably refers to memory owned by the STRING, so it
# should be read before destroy_string is called -- confirm against strngs.h.
68
+ cpp.function %{
69
+ const char* string_content (STRING* value) {
70
+ return value->string();
71
+ }
72
+ }
46
73
  end
47
74
 
48
75
  def self.for_enum (what)
@@ -75,6 +75,14 @@ class Engine
75
75
  }
76
76
  end
77
77
 
# Writer added in this revision: forwards +name+ to @api.set_input_name.
78
+ def input= (name)
79
+ @api.set_input_name(name)
80
+ end
81
+
# Writer added in this revision: forwards +name+ to @api.set_output_name.
82
+ def output= (name)
83
+ @api.set_output_name(name)
84
+ end
85
+
78
86
  def set (name, value)
79
87
  @variables[name] = value
80
88
 
@@ -118,7 +126,9 @@ class Engine
118
126
  end
119
127
 
# Sets the page segmentation mode: +value+ is resolved through C.for_enum and
# passed to @api.set_page_seg_mode.
# New in this revision: the resolved value is also kept in @psm. A later hunk of
# this diff calls `@api.set_page_seg_mode @psm if @psm` after the config files are
# read (apparently inside _init), so the mode is re-applied at that point.
120
128
  def page_segmentation_mode= (value)
121
- @api.set_page_seg_mode C.for_enum(value)
129
+ @psm = C.for_enum(value)
130
+
131
+ @api.set_page_seg_mode @psm
122
132
  end
123
133
 
124
134
  def image= (image)
@@ -190,6 +200,16 @@ class Engine
190
200
  end
191
201
  }
192
202
 
203
+ def process (image, page = nil)
204
+ if page
205
+ @api.process_page(API.image_for(image), page)
206
+ else
207
+ raise ArgumentError, 'the path does not exist' unless File.exists?(image)
208
+
209
+ @api.process_pages(image)
210
+ end
211
+ end
212
+
193
213
  protected
194
214
  def _init
195
215
  @api.end
@@ -203,6 +223,8 @@ protected
203
223
  @config.each {|conf|
204
224
  @api.read_config_file(conf)
205
225
  }
226
+
227
+ @api.set_page_seg_mode @psm if @psm
206
228
  end
207
229
 
208
230
  def _setup (image = nil, x = nil, y = nil, width = nil, height = nil)
@@ -24,6 +24,6 @@
24
24
 
25
25
  module Tesseract
26
26
  def self.version
27
- '0.0.3'
27
+ '0.0.4'
28
28
  end
29
29
  end
@@ -78,9 +78,33 @@ describe Tesseract::Engine do
78
78
  end
79
79
  end
80
80
 
81
- describe '#each_block' do
81
+ describe '#blocks' do
82
82
  it 'works properly with first image' do
83
+ engine.blocks_for('first.png').first.to_s.should == "ABC\n"
84
+ end
85
+ end
86
+
87
+ describe '#paragraphs' do
88
+ it 'works properly with first image' do
89
+ engine.paragraphs_for('first.png').first.to_s.should == "ABC\n"
90
+ end
91
+ end
83
92
 
93
+ describe '#lines' do
94
+ it 'works properly with first image' do
95
+ engine.lines_for('first.png').first.to_s.should == "ABC\n"
96
+ end
97
+ end
98
+
99
+ describe '#words' do
100
+ it 'works properly with first image' do
101
+ engine.words_for('first.png').first.to_s.should == 'ABC'
102
+ end
103
+ end
104
+
105
+ describe '#symbols' do
106
+ it 'works properly with first image' do
107
+ engine.symbols_for('first.png').first.to_s.should == 'A'
84
108
  end
85
109
  end
86
110
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tesseract-ocr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-29 00:00:00.000000000 Z
12
+ date: 2011-11-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: namedic
16
- requirement: &3194800 !ruby/object:Gem::Requirement
16
+ requirement: &12863500 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *3194800
24
+ version_requirements: *12863500
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: memoized
27
- requirement: &3193660 !ruby/object:Gem::Requirement
27
+ requirement: &12861460 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *3193660
35
+ version_requirements: *12861460
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: iso-639
38
- requirement: &3207460 !ruby/object:Gem::Requirement
38
+ requirement: &12888100 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *3207460
46
+ version_requirements: *12888100
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: ffi-extra
49
- requirement: &3205940 !ruby/object:Gem::Requirement
49
+ requirement: &12886020 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *3205940
57
+ version_requirements: *12886020
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: ffi-inliner
60
- requirement: &3205380 !ruby/object:Gem::Requirement
60
+ requirement: &12882760 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *3205380
68
+ version_requirements: *12882760
69
69
  description:
70
70
  email: meh@paranoici.org
71
71
  executables:
@@ -76,7 +76,9 @@ files:
76
76
  - README.md
77
77
  - Rakefile
78
78
  - bin/tesseract.rb
79
- - examples/nerdz-captcha-breaker.rb
79
+ - examples/nerdz-captcha-breaker/break.rb
80
+ - examples/nerdz-captcha-breaker/lol.gd-giant.exp.box
81
+ - examples/nerdz-captcha-breaker/lol.gd-giant.exp.tif
80
82
  - lib/tesseract-ocr.rb
81
83
  - lib/tesseract.rb
82
84
  - lib/tesseract/api.rb