tesseract-ocr 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/tesseract/api.rb +15 -2
- data/lib/tesseract/c/baseapi.rb +6 -0
- data/lib/tesseract/engine.rb +23 -3
- data/lib/tesseract/version.rb +1 -1
- data/test/tesseract_spec.rb +45 -18
- metadata +2 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 15d5c5417af8a215178f247ebca4750d2a7cbbcb
|
4
|
+
data.tar.gz: 35e1a440418421cc2b22f2f7f8e095600911e04c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71064917075114639018f60de081afb1cf164677b1cd9bbec940fe1f34063b0ef0a823f3ee0f2dba434f03775d000e4e50b7a17f93285b79d9982fa2d563f475
|
7
|
+
data.tar.gz: 188d4e48122324899a2d83b97585a932938a9905b4de3960c6496748ee4b352707a9e80455905da446b469afba47c8e87c9337270130b8972309322e8bfd5576
|
data/lib/tesseract/api.rb
CHANGED
@@ -99,8 +99,8 @@ class API
|
|
99
99
|
end
|
100
100
|
end
|
101
101
|
|
102
|
-
def init (datapath =
|
103
|
-
unless C::BaseAPI.init(to_ffi, datapath, language.to_s, mode).zero?
|
102
|
+
def init (datapath = nil, language = 'eng', mode = :DEFAULT)
|
103
|
+
unless C::BaseAPI.init(to_ffi, datapath || Tesseract.prefix || '/usr/share', language.to_s, mode).zero?
|
104
104
|
raise 'the API did not Init correctly'
|
105
105
|
end
|
106
106
|
end
|
@@ -166,6 +166,19 @@ class API
|
|
166
166
|
C.free_array_of_char(pointer) unless pointer.null?
|
167
167
|
end
|
168
168
|
|
169
|
+
def get_hocr(page = 0)
|
170
|
+
pointer = C::BaseAPI.get_hocr_text(to_ffi, page)
|
171
|
+
|
172
|
+
return if pointer.null?
|
173
|
+
|
174
|
+
result = pointer.read_string
|
175
|
+
result.force_encoding 'UTF-8'
|
176
|
+
|
177
|
+
result
|
178
|
+
ensure
|
179
|
+
C.free_array_of_char(pointer) unless pointer.null?
|
180
|
+
end
|
181
|
+
|
169
182
|
def get_box (page = 0)
|
170
183
|
pointer = C::BaseAPI.get_box_text(to_ffi, page)
|
171
184
|
result = pointer.read_string
|
data/lib/tesseract/c/baseapi.rb
CHANGED
@@ -202,6 +202,12 @@ module BaseAPI
|
|
202
202
|
}
|
203
203
|
}, blocking: true
|
204
204
|
|
205
|
+
cpp.function %{
|
206
|
+
char* get_hocr_text (TessBaseAPI* api, int page_number) {
|
207
|
+
return api->GetHOCRText(page_number);
|
208
|
+
}
|
209
|
+
}, blocking: true
|
210
|
+
|
205
211
|
cpp.function %{
|
206
212
|
char* get_box_text (TessBaseAPI* api, int page_number) {
|
207
213
|
return api->GetBoxText(page_number);
|
data/lib/tesseract/engine.rb
CHANGED
@@ -32,9 +32,9 @@ class Engine
|
|
32
32
|
attr_reader :config
|
33
33
|
|
34
34
|
named :path, :language, :mode, :variables,
|
35
|
-
:optional => { :path =>
|
35
|
+
:optional => { :path => nil, :language => :eng, :mode => :DEFAULT, :variables => {}, :config => [] },
|
36
36
|
:alias => { :data => :path, :lang => :language }
|
37
|
-
def initialize (path =
|
37
|
+
def initialize (path = nil, language = :eng, mode = :DEFAULT, variables = {}, config = [], &block) # :yields: self
|
38
38
|
@api = API.new
|
39
39
|
|
40
40
|
@initializing = true
|
@@ -171,6 +171,26 @@ class Engine
|
|
171
171
|
text_at
|
172
172
|
end
|
173
173
|
|
174
|
+
named :image, :x, :y, :width, :height,
|
175
|
+
:optional => 0 .. -1,
|
176
|
+
:alias => { :w => :width, :h => :height }
|
177
|
+
def hocr_for (image = nil, x = nil, y = nil, width = nil, height = nil, page = nil)
|
178
|
+
_setup(image, x, y, width, height)
|
179
|
+
|
180
|
+
@api.get_hocr(page || 0)
|
181
|
+
end
|
182
|
+
|
183
|
+
named :x, :y, :width, :height,
|
184
|
+
:optional => 0 .. -1,
|
185
|
+
:alias => { :w => :width, :h => :height }
|
186
|
+
def hocr_at (x = nil, y = nil, width = nil, height = nil, page = nil)
|
187
|
+
hocr_for(nil, x, y, width, height, page)
|
188
|
+
end
|
189
|
+
|
190
|
+
def hocr
|
191
|
+
hocr_at
|
192
|
+
end
|
193
|
+
|
174
194
|
%w(block paragraph line word symbol).each {|level|
|
175
195
|
define_method "each_#{level}" do |&block|
|
176
196
|
raise ArgumentError, 'you have to pass a block' unless block
|
@@ -231,7 +251,7 @@ protected
|
|
231
251
|
def _init
|
232
252
|
@api.end
|
233
253
|
|
234
|
-
@api.init(
|
254
|
+
@api.init(@path, API.to_language_code(@language), @mode)
|
235
255
|
|
236
256
|
@variables.each {|name, value|
|
237
257
|
@api.set_variable(name.to_s, value.to_s)
|
data/lib/tesseract/version.rb
CHANGED
data/test/tesseract_spec.rb
CHANGED
@@ -9,36 +9,38 @@ describe Tesseract::Engine do
|
|
9
9
|
|
10
10
|
describe '#text_for' do
|
11
11
|
it 'can read the first test image' do
|
12
|
-
engine.text_for('first.png').strip.
|
12
|
+
expect(engine.text_for('first.png').strip).to eq('ABC')
|
13
13
|
end
|
14
14
|
|
15
15
|
it 'can read the second test image' do
|
16
|
-
engine.text_for('second.png').strip.
|
16
|
+
expect(engine.text_for('second.png').strip).to eq("#{Tesseract::API.new.version == '3.01' ? ?| : ?I}'m 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234")
|
17
17
|
end
|
18
18
|
|
19
19
|
it 'raises when going out of the image boundaries' do
|
20
20
|
expect {
|
21
21
|
engine.text_for('second.png', 0, 0, 1000, 1000)
|
22
|
-
}.
|
22
|
+
}.to raise_error IndexError
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
26
|
describe '#text_at' do
|
27
27
|
it 'can read the first test image' do
|
28
28
|
engine.image = 'first.png'
|
29
|
-
|
29
|
+
|
30
|
+
expect(engine.text_at(2, 2, 2, 2).strip).to eq('')
|
30
31
|
end
|
31
32
|
|
32
33
|
it 'can read the second test image' do
|
33
34
|
engine.image = 'second.png'
|
34
|
-
|
35
|
+
|
36
|
+
expect(engine.text_at(242, 191, 129, 31).strip).to eq('OH HAI 1234')
|
35
37
|
end
|
36
38
|
|
37
39
|
it 'raises when going out of the image boundaries' do
|
38
40
|
expect {
|
39
41
|
engine.image = 'second.png'
|
40
42
|
engine.text_at(10, 20, 1000, 1000)
|
41
|
-
}.
|
43
|
+
}.to raise_error IndexError
|
42
44
|
end
|
43
45
|
end
|
44
46
|
|
@@ -47,13 +49,14 @@ describe Tesseract::Engine do
|
|
47
49
|
engine.image = 'first.png'
|
48
50
|
engine.select 2, 2, 2, 2
|
49
51
|
|
50
|
-
engine.text.strip.
|
52
|
+
expect(engine.text.strip).to eq('')
|
51
53
|
end
|
52
54
|
|
53
55
|
it 'can read the second test image' do
|
54
56
|
engine.image = 'second.png'
|
55
57
|
engine.select 242, 191, 129, 31
|
56
|
-
|
58
|
+
|
59
|
+
expect(engine.text.strip).to eq('OH HAI 1234')
|
57
60
|
end
|
58
61
|
|
59
62
|
it 'raises when going out of the image boundaries' do
|
@@ -61,59 +64,83 @@ describe Tesseract::Engine do
|
|
61
64
|
engine.image = 'second.png'
|
62
65
|
engine.select 10, 20, 1000, 1000
|
63
66
|
engine.text
|
64
|
-
}.
|
67
|
+
}.to raise_error IndexError
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
describe '#hocr' do
|
72
|
+
it 'can read the first test image' do
|
73
|
+
engine.image = 'first.png'
|
74
|
+
engine.select 2, 2, 2, 2
|
75
|
+
|
76
|
+
expect(engine.hocr).to eq(" <div class='ocr_page' id='page_1' title='image \"\"; bbox 2 2 2 2; ppageno 0'>\n </div>\n")
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'can read the second test image' do
|
80
|
+
engine.image = 'second.png'
|
81
|
+
engine.select 242, 191, 129, 31
|
82
|
+
|
83
|
+
expect(engine.hocr).to eq(" <div class='ocr_page' id='page_1' title='image \"\"; bbox 242 191 129 31; ppageno 0'>\n <div class='ocr_carea' id='block_1_1' title=\"bbox 242 191 371 222\">\n <p class='ocr_par' dir='ltr' id='par_1_1' title=\"bbox 250 200 365 213\">\n <span class='ocr_line' id='line_1_1' title=\"bbox 250 200 365 213; baseline 0 0\"><span class='ocrx_word' id='word_1_1' title='bbox 250 200 275 213; x_wconf 94' lang='eng' dir='ltr'>OH</span> <span class='ocrx_word' id='word_1_2' title='bbox 285 200 313 213; x_wconf 90' lang='eng' dir='ltr'>HAI</span> <span class='ocrx_word' id='word_1_3' title='bbox 323 200 365 213; x_wconf 86' lang='eng'>1234</span> \n </span>\n </p>\n </div>\n </div>\n")
|
65
84
|
end
|
66
85
|
|
86
|
+
it 'raises when going out of the image boundaries' do
|
87
|
+
expect {
|
88
|
+
engine.image = 'second.png'
|
89
|
+
engine.select 10, 20, 1000, 1000
|
90
|
+
|
91
|
+
engine.hocr
|
92
|
+
}.to raise_error IndexError
|
93
|
+
end
|
67
94
|
end
|
68
95
|
|
69
96
|
describe '#blacklist' do
|
70
97
|
it 'works with removing weird signs' do
|
71
|
-
engine.with { |e| e.blacklist = '|' }.text_for('second.png').strip.
|
98
|
+
expect(engine.with { |e| e.blacklist = '|' }.text_for('second.png').strip).to eq("I'm 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234")
|
72
99
|
end
|
73
100
|
end
|
74
101
|
|
75
102
|
describe '#whitelist' do
|
76
103
|
it 'makes everything into a number' do
|
77
|
-
engine.with { |e| e.whitelist = '1234567890' }.text_for('second.png').strip.
|
104
|
+
expect(engine.with { |e| e.whitelist = '1234567890' }.text_for('second.png').strip).to match(/^[\d\s]*$/)
|
78
105
|
end
|
79
106
|
end
|
80
107
|
|
81
108
|
describe '#page_segmentation_mode' do
|
82
109
|
it 'sets it correctly' do
|
83
|
-
engine.with {|e|
|
110
|
+
expect(engine.with {|e|
|
84
111
|
e.page_segmentation_mode = :single_line
|
85
112
|
e.whitelist = [*'a'..'z', *'A'..'Z', *0..9, " ."].join
|
86
|
-
}.text_for('jsmj.png').strip.
|
113
|
+
}.text_for('jsmj.png').strip).to eq('Jsmj')
|
87
114
|
end
|
88
115
|
end
|
89
116
|
|
90
117
|
describe '#blocks' do
|
91
118
|
it 'works properly with first image' do
|
92
|
-
engine.blocks_for('first.png').first.to_s.strip.
|
119
|
+
expect(engine.blocks_for('first.png').first.to_s.strip).to eq('ABC')
|
93
120
|
end
|
94
121
|
end
|
95
122
|
|
96
123
|
describe '#paragraphs' do
|
97
124
|
it 'works properly with first image' do
|
98
|
-
engine.paragraphs_for('first.png').first.to_s.strip.
|
125
|
+
expect(engine.paragraphs_for('first.png').first.to_s.strip).to eq('ABC')
|
99
126
|
end
|
100
127
|
end
|
101
128
|
|
102
129
|
describe '#lines' do
|
103
130
|
it 'works properly with first image' do
|
104
|
-
engine.lines_for('first.png').first.to_s.strip.
|
131
|
+
expect(engine.lines_for('first.png').first.to_s.strip).to eq('ABC')
|
105
132
|
end
|
106
133
|
end
|
107
134
|
|
108
135
|
describe '#words' do
|
109
136
|
it 'works properly with first image' do
|
110
|
-
engine.words_for('first.png').first.to_s.
|
137
|
+
expect(engine.words_for('first.png').first.to_s).to eq('ABC')
|
111
138
|
end
|
112
139
|
end
|
113
140
|
|
114
141
|
describe '#symbols' do
|
115
142
|
it 'works properly with first image' do
|
116
|
-
engine.symbols_for('first.png').first.to_s.
|
143
|
+
expect(engine.symbols_for('first.png').first.to_s).to eq('A')
|
117
144
|
end
|
118
145
|
end
|
119
146
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tesseract-ocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- meh.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: call-me
|
@@ -243,4 +243,3 @@ test_files:
|
|
243
243
|
- test/tesseract_spec.rb
|
244
244
|
- test/test-european.jpg
|
245
245
|
- test/test.png
|
246
|
-
has_rdoc:
|