tesseract-ocr 0.0.3 → 0.0.4

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -51,6 +51,8 @@ Tesseract::Engine.new.tap {|engine|
51
51
  image.pixel_color x, y, 'black'
52
52
  }
53
53
 
54
+ File.open('/tmp/lol.png', ?w) { |f| f.write(image.resize(10).to_blob) }
55
+
54
56
  puts engine.text_for(image.resize 10).strip
55
57
  }
56
58
  }
@@ -0,0 +1,112 @@
1
+ T 31 46 39 56 0
2
+ h 40 46 48 56 0
3
+ e 49 46 57 53 0
4
+ ( 69 45 74 57 0
5
+ q 76 44 84 53 0
6
+ u 85 46 93 53 0
7
+ i 95 46 101 56 0
8
+ c 103 46 111 53 0
9
+ k 113 46 120 56 0
10
+ ) 122 45 127 57 0
11
+ b 139 46 147 56 0
12
+ r 148 46 156 53 0
13
+ o 157 46 165 53 0
14
+ w 166 46 174 53 0
15
+ n 175 46 183 53 0
16
+ { 195 46 200 56 0
17
+ f 202 46 210 56 0
18
+ o 211 46 219 53 0
19
+ x 220 46 228 53 0
20
+ } 230 46 235 56 0
21
+ j 247 44 254 56 0
22
+ u 256 46 264 53 0
23
+ m 265 46 273 53 0
24
+ p 274 44 282 53 0
25
+ s 283 46 291 53 0
26
+ ! 295 45 297 56 0
27
+ o 310 46 318 53 0
28
+ v 319 46 327 53 0
29
+ e 328 46 336 53 0
30
+ r 337 46 345 53 0
31
+ t 355 46 363 55 0
32
+ h 364 46 372 56 0
33
+ e 373 46 381 53 0
34
+ $ 392 45 399 56 0
35
+ 3 400 46 408 56 0
36
+ , 410 45 415 48 0
37
+ 4 418 46 426 56 0
38
+ 5 427 46 435 56 0
39
+ 6 436 46 444 56 0
40
+ . 447 45 451 48 0
41
+ 7 454 46 462 56 0
42
+ 8 463 46 471 56 0
43
+ < 482 46 488 56 0
44
+ l 492 46 496 56 0
45
+ a 499 46 507 53 0
46
+ z 509 46 515 53 0
47
+ y 517 44 525 53 0
48
+ > 526 46 532 56 0
49
+ # 544 47 551 55 0
50
+ 9 553 46 561 56 0
51
+ 0 562 46 570 56 0
52
+ d 580 46 588 56 0
53
+ o 589 46 597 53 0
54
+ g 598 44 606 53 0
55
+ & 616 46 624 56 0
56
+ d 634 46 642 56 0
57
+ u 643 46 651 53 0
58
+ c 652 46 660 53 0
59
+ k 662 46 669 56 0
60
+ / 670 46 678 56 0
61
+ g 679 44 687 53 0
62
+ o 688 46 696 53 0
63
+ o 697 46 705 53 0
64
+ s 706 46 714 53 0
65
+ e 715 46 723 53 0
66
+ , 725 45 730 48 0
67
+ a 742 46 750 53 0
68
+ s 751 46 759 53 0
69
+ 1 770 46 776 56 0
70
+ 2 778 46 786 56 0
71
+ . 789 45 793 48 0
72
+ 5 796 46 804 56 0
73
+ % 805 46 813 56 0
74
+ o 823 46 831 53 0
75
+ f 832 46 840 56 0
76
+ E 850 46 857 56 0
77
+ - 859 50 867 51 0
78
+ m 868 46 876 53 0
79
+ a 877 46 885 53 0
80
+ i 887 46 893 56 0
81
+ l 897 46 901 56 0
82
+ f 913 46 921 56 0
83
+ r 922 46 930 53 0
84
+ o 931 46 939 53 0
85
+ m 940 46 948 53 0
86
+ a 958 46 966 53 0
87
+ s 967 46 975 53 0
88
+ p 976 44 984 53 0
89
+ a 985 46 993 53 0
90
+ m 994 46 1002 53 0
91
+ m 1003 46 1011 53 0
92
+ e 1012 46 1020 53 0
93
+ r 1021 46 1029 53 0
94
+ @ 1030 46 1038 56 0
95
+ w 1039 46 1047 53 0
96
+ e 1048 46 1056 53 0
97
+ b 1057 46 1065 56 0
98
+ s 1066 46 1074 53 0
99
+ i 1076 46 1082 56 0
100
+ t 1084 46 1092 55 0
101
+ e 1093 46 1101 53 0
102
+ . 1104 45 1108 48 0
103
+ c 1111 46 1119 53 0
104
+ o 1120 46 1128 53 0
105
+ m 1129 46 1137 53 0
106
+ i 1148 46 1154 56 0
107
+ s 1156 46 1164 53 0
108
+ s 1174 46 1182 53 0
109
+ p 1183 44 1191 53 0
110
+ a 1192 46 1200 53 0
111
+ m 1201 46 1209 53 0
112
+ ? 1211 46 1218 56 0
@@ -77,9 +77,10 @@ class Iterator
77
77
  pointer = C::Iterator.get_utf8_text(to_ffi, C.for_enum(level))
78
78
  result = pointer.read_string
79
79
  result.force_encoding 'UTF-8'
80
- C.free_string(pointer)
81
80
 
82
81
  result
82
+ ensure
83
+ C.free_array_of_char(pointer)
83
84
  end
84
85
 
85
86
  def confidence (level = :word)
data/lib/tesseract/api.rb CHANGED
@@ -65,11 +65,11 @@ class API
65
65
  C::BaseAPI.version(to_ffi)
66
66
  end
67
67
 
68
- def input_name= (name)
68
+ def set_input_name (name)
69
69
  C::BaseAPI.set_input_name(to_ffi, name)
70
70
  end
71
71
 
72
- def output_name= (name)
72
+ def set_output_name (name)
73
73
  C::BaseAPI.set_output_name(to_ffi, name)
74
74
  end
75
75
 
@@ -125,6 +125,30 @@ class API
125
125
  C::BaseAPI.set_rectangle(to_ffi, left, top, width, height)
126
126
  end
127
127
 
128
+ def process_pages (name)
129
+ result = C.create_string
130
+
131
+ unless C::BaseAPI.process_pages(to_ffi, name, result)
132
+ raise 'process_pages failed'
133
+ end
134
+
135
+ C.string_content(result).read_string(C.string_length(result))
136
+ ensure
137
+ C.destroy_string(result)
138
+ end
139
+
140
+ def process_page (pix, page = 0, name = "")
141
+ result = C.create_string
142
+
143
+ unless C::BaseAPI.process_page(to_ffi, pix.is_a?(Image) ? pix.to_ffi : pix, page, name, result)
144
+ raise 'process_page failed'
145
+ end
146
+
147
+ C.string_content(result).read_string(C.string_length(result))
148
+ ensure
149
+ C.destroy_string(result)
150
+ end
151
+
128
152
  def get_iterator
129
153
  Iterator.new(C::BaseAPI.get_iterator(to_ffi))
130
154
  end
@@ -133,27 +157,30 @@ class API
133
157
  pointer = C::BaseAPI.get_utf8_text(to_ffi)
134
158
  result = pointer.read_string
135
159
  result.force_encoding 'UTF-8'
136
- C.free_string(pointer)
137
160
 
138
161
  result
162
+ ensure
163
+ C.free_array_of_char(pointer)
139
164
  end
140
165
 
141
166
  def get_box (page = 0)
142
167
  pointer = C::BaseAPI.get_box_text(to_ffi, page)
143
168
  result = pointer.read_string
144
169
  result.force_encoding 'UTF-8'
145
- C.free_string(pointer)
146
170
 
147
171
  result
172
+ ensure
173
+ C.free_array_of_char(pointer)
148
174
  end
149
175
 
150
176
  def get_unlv
151
177
  pointer = C::BaseAPI.get_unlv_text(to_ffi)
152
178
  result = pointer.read_string
153
179
  result.force_encoding 'ISO8859-1'
154
- C.free_string(pointer)
155
180
 
156
181
  result
182
+ ensure
183
+ C.free_array_of_char(pointer)
157
184
  end
158
185
 
159
186
  def mean_text_confidence
@@ -184,6 +184,18 @@ module BaseAPI
184
184
  }
185
185
  }
186
186
 
187
+ cpp.function %{
188
+ bool process_pages (TessBaseAPI* api, const char* filename, STRING* output) {
189
+ return api->ProcessPages(filename, NULL, 0, output);
190
+ }
191
+ }
192
+
193
+ cpp.function %{
194
+ bool process_page (TessBaseAPI* api, Pix* pix, int page_index, const char* filename, STRING* output) {
195
+ return api->ProcessPage(pix, page_index, filename, NULL, 0, output);
196
+ }
197
+ }
198
+
187
199
  cpp.function %{
188
200
  ResultIterator* get_iterator (TessBaseAPI* api) {
189
201
  return api->GetIterator();
data/lib/tesseract/c.rb CHANGED
@@ -32,8 +32,11 @@ module C
32
32
  extend FFI::Inliner
33
33
 
34
34
  inline 'C++' do |cpp|
35
+ cpp.include 'tesseract/strngs.h'
36
+ cpp.libraries 'tesseract'
37
+
35
38
  cpp.function %{
36
- void free_string (char* pointer) {
39
+ void free_array_of_char (char* pointer) {
37
40
  delete [] pointer;
38
41
  }
39
42
  }
@@ -43,6 +46,30 @@ module C
43
46
  delete [] pointer;
44
47
  }
45
48
  }
49
+
50
+ cpp.function %{
51
+ STRING* create_string (void) {
52
+ return new STRING();
53
+ }
54
+ }
55
+
56
+ cpp.function %{
57
+ void destroy_string (STRING* value) {
58
+ delete value;
59
+ }
60
+ }
61
+
62
+ cpp.function %{
63
+ int string_length (STRING* value) {
64
+ return value->length();
65
+ }
66
+ }
67
+
68
+ cpp.function %{
69
+ const char* string_content (STRING* value) {
70
+ return value->string();
71
+ }
72
+ }
46
73
  end
47
74
 
48
75
  def self.for_enum (what)
@@ -75,6 +75,14 @@ class Engine
75
75
  }
76
76
  end
77
77
 
78
+ def input= (name)
79
+ @api.set_input_name(name)
80
+ end
81
+
82
+ def output= (name)
83
+ @api.set_output_name(name)
84
+ end
85
+
78
86
  def set (name, value)
79
87
  @variables[name] = value
80
88
 
@@ -118,7 +126,9 @@ class Engine
118
126
  end
119
127
 
120
128
  def page_segmentation_mode= (value)
121
- @api.set_page_seg_mode C.for_enum(value)
129
+ @psm = C.for_enum(value)
130
+
131
+ @api.set_page_seg_mode @psm
122
132
  end
123
133
 
124
134
  def image= (image)
@@ -190,6 +200,16 @@ class Engine
190
200
  end
191
201
  }
192
202
 
203
+ def process (image, page = nil)
204
+ if page
205
+ @api.process_page(API.image_for(image), page)
206
+ else
207
+ raise ArgumentError, 'the path does not exist' unless File.exists?(image)
208
+
209
+ @api.process_pages(image)
210
+ end
211
+ end
212
+
193
213
  protected
194
214
  def _init
195
215
  @api.end
@@ -203,6 +223,8 @@ protected
203
223
  @config.each {|conf|
204
224
  @api.read_config_file(conf)
205
225
  }
226
+
227
+ @api.set_page_seg_mode @psm if @psm
206
228
  end
207
229
 
208
230
  def _setup (image = nil, x = nil, y = nil, width = nil, height = nil)
@@ -24,6 +24,6 @@
24
24
 
25
25
  module Tesseract
26
26
  def self.version
27
- '0.0.3'
27
+ '0.0.4'
28
28
  end
29
29
  end
@@ -78,9 +78,33 @@ describe Tesseract::Engine do
78
78
  end
79
79
  end
80
80
 
81
- describe '#each_block' do
81
+ describe '#blocks' do
82
82
  it 'works properly with first image' do
83
+ engine.blocks_for('first.png').first.to_s.should == "ABC\n"
84
+ end
85
+ end
86
+
87
+ describe '#paragraphs' do
88
+ it 'works properly with first image' do
89
+ engine.paragraphs_for('first.png').first.to_s.should == "ABC\n"
90
+ end
91
+ end
83
92
 
93
+ describe '#lines' do
94
+ it 'works properly with first image' do
95
+ engine.lines_for('first.png').first.to_s.should == "ABC\n"
96
+ end
97
+ end
98
+
99
+ describe '#words' do
100
+ it 'works properly with first image' do
101
+ engine.words_for('first.png').first.to_s.should == 'ABC'
102
+ end
103
+ end
104
+
105
+ describe '#symbols' do
106
+ it 'works properly with first image' do
107
+ engine.symbols_for('first.png').first.to_s.should == 'A'
84
108
  end
85
109
  end
86
110
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tesseract-ocr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-29 00:00:00.000000000 Z
12
+ date: 2011-11-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: namedic
16
- requirement: &3194800 !ruby/object:Gem::Requirement
16
+ requirement: &12863500 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *3194800
24
+ version_requirements: *12863500
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: memoized
27
- requirement: &3193660 !ruby/object:Gem::Requirement
27
+ requirement: &12861460 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *3193660
35
+ version_requirements: *12861460
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: iso-639
38
- requirement: &3207460 !ruby/object:Gem::Requirement
38
+ requirement: &12888100 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *3207460
46
+ version_requirements: *12888100
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: ffi-extra
49
- requirement: &3205940 !ruby/object:Gem::Requirement
49
+ requirement: &12886020 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *3205940
57
+ version_requirements: *12886020
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: ffi-inliner
60
- requirement: &3205380 !ruby/object:Gem::Requirement
60
+ requirement: &12882760 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *3205380
68
+ version_requirements: *12882760
69
69
  description:
70
70
  email: meh@paranoici.org
71
71
  executables:
@@ -76,7 +76,9 @@ files:
76
76
  - README.md
77
77
  - Rakefile
78
78
  - bin/tesseract.rb
79
- - examples/nerdz-captcha-breaker.rb
79
+ - examples/nerdz-captcha-breaker/break.rb
80
+ - examples/nerdz-captcha-breaker/lol.gd-giant.exp.box
81
+ - examples/nerdz-captcha-breaker/lol.gd-giant.exp.tif
80
82
  - lib/tesseract-ocr.rb
81
83
  - lib/tesseract.rb
82
84
  - lib/tesseract/api.rb