tesseract-ocr 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/tesseract/api.rb +15 -2
- data/lib/tesseract/c/baseapi.rb +6 -0
- data/lib/tesseract/engine.rb +23 -3
- data/lib/tesseract/version.rb +1 -1
- data/test/tesseract_spec.rb +45 -18
- metadata +2 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 15d5c5417af8a215178f247ebca4750d2a7cbbcb
|
4
|
+
data.tar.gz: 35e1a440418421cc2b22f2f7f8e095600911e04c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71064917075114639018f60de081afb1cf164677b1cd9bbec940fe1f34063b0ef0a823f3ee0f2dba434f03775d000e4e50b7a17f93285b79d9982fa2d563f475
|
7
|
+
data.tar.gz: 188d4e48122324899a2d83b97585a932938a9905b4de3960c6496748ee4b352707a9e80455905da446b469afba47c8e87c9337270130b8972309322e8bfd5576
|
data/lib/tesseract/api.rb
CHANGED
@@ -99,8 +99,8 @@ class API
|
|
99
99
|
end
|
100
100
|
end
|
101
101
|
|
102
|
-
def init (datapath =
|
103
|
-
unless C::BaseAPI.init(to_ffi, datapath, language.to_s, mode).zero?
|
102
|
+
def init (datapath = nil, language = 'eng', mode = :DEFAULT)
|
103
|
+
unless C::BaseAPI.init(to_ffi, datapath || Tesseract.prefix || '/usr/share', language.to_s, mode).zero?
|
104
104
|
raise 'the API did not Init correctly'
|
105
105
|
end
|
106
106
|
end
|
@@ -166,6 +166,19 @@ class API
|
|
166
166
|
C.free_array_of_char(pointer) unless pointer.null?
|
167
167
|
end
|
168
168
|
|
169
|
+
def get_hocr(page = 0)
|
170
|
+
pointer = C::BaseAPI.get_hocr_text(to_ffi, page)
|
171
|
+
|
172
|
+
return if pointer.null?
|
173
|
+
|
174
|
+
result = pointer.read_string
|
175
|
+
result.force_encoding 'UTF-8'
|
176
|
+
|
177
|
+
result
|
178
|
+
ensure
|
179
|
+
C.free_array_of_char(pointer) unless pointer.null?
|
180
|
+
end
|
181
|
+
|
169
182
|
def get_box (page = 0)
|
170
183
|
pointer = C::BaseAPI.get_box_text(to_ffi, page)
|
171
184
|
result = pointer.read_string
|
data/lib/tesseract/c/baseapi.rb
CHANGED
@@ -202,6 +202,12 @@ module BaseAPI
|
|
202
202
|
}
|
203
203
|
}, blocking: true
|
204
204
|
|
205
|
+
cpp.function %{
|
206
|
+
char* get_hocr_text (TessBaseAPI* api, int page_number) {
|
207
|
+
return api->GetHOCRText(page_number);
|
208
|
+
}
|
209
|
+
}, blocking: true
|
210
|
+
|
205
211
|
cpp.function %{
|
206
212
|
char* get_box_text (TessBaseAPI* api, int page_number) {
|
207
213
|
return api->GetBoxText(page_number);
|
data/lib/tesseract/engine.rb
CHANGED
@@ -32,9 +32,9 @@ class Engine
|
|
32
32
|
attr_reader :config
|
33
33
|
|
34
34
|
named :path, :language, :mode, :variables,
|
35
|
-
:optional => { :path =>
|
35
|
+
:optional => { :path => nil, :language => :eng, :mode => :DEFAULT, :variables => {}, :config => [] },
|
36
36
|
:alias => { :data => :path, :lang => :language }
|
37
|
-
def initialize (path =
|
37
|
+
def initialize (path = nil, language = :eng, mode = :DEFAULT, variables = {}, config = [], &block) # :yields: self
|
38
38
|
@api = API.new
|
39
39
|
|
40
40
|
@initializing = true
|
@@ -171,6 +171,26 @@ class Engine
|
|
171
171
|
text_at
|
172
172
|
end
|
173
173
|
|
174
|
+
named :image, :x, :y, :width, :height,
|
175
|
+
:optional => 0 .. -1,
|
176
|
+
:alias => { :w => :width, :h => :height }
|
177
|
+
def hocr_for (image = nil, x = nil, y = nil, width = nil, height = nil, page = nil)
|
178
|
+
_setup(image, x, y, width, height)
|
179
|
+
|
180
|
+
@api.get_hocr(page || 0)
|
181
|
+
end
|
182
|
+
|
183
|
+
named :x, :y, :width, :height,
|
184
|
+
:optional => 0 .. -1,
|
185
|
+
:alias => { :w => :width, :h => :height }
|
186
|
+
def hocr_at (x = nil, y = nil, width = nil, height = nil, page = nil)
|
187
|
+
hocr_for(nil, x, y, width, height, page)
|
188
|
+
end
|
189
|
+
|
190
|
+
def hocr
|
191
|
+
hocr_at
|
192
|
+
end
|
193
|
+
|
174
194
|
%w(block paragraph line word symbol).each {|level|
|
175
195
|
define_method "each_#{level}" do |&block|
|
176
196
|
raise ArgumentError, 'you have to pass a block' unless block
|
@@ -231,7 +251,7 @@ protected
|
|
231
251
|
def _init
|
232
252
|
@api.end
|
233
253
|
|
234
|
-
@api.init(
|
254
|
+
@api.init(@path, API.to_language_code(@language), @mode)
|
235
255
|
|
236
256
|
@variables.each {|name, value|
|
237
257
|
@api.set_variable(name.to_s, value.to_s)
|
data/lib/tesseract/version.rb
CHANGED
data/test/tesseract_spec.rb
CHANGED
@@ -9,36 +9,38 @@ describe Tesseract::Engine do
|
|
9
9
|
|
10
10
|
describe '#text_for' do
|
11
11
|
it 'can read the first test image' do
|
12
|
-
engine.text_for('first.png').strip.
|
12
|
+
expect(engine.text_for('first.png').strip).to eq('ABC')
|
13
13
|
end
|
14
14
|
|
15
15
|
it 'can read the second test image' do
|
16
|
-
engine.text_for('second.png').strip.
|
16
|
+
expect(engine.text_for('second.png').strip).to eq("#{Tesseract::API.new.version == '3.01' ? ?| : ?I}'m 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234")
|
17
17
|
end
|
18
18
|
|
19
19
|
it 'raises when going out of the image boundaries' do
|
20
20
|
expect {
|
21
21
|
engine.text_for('second.png', 0, 0, 1000, 1000)
|
22
|
-
}.
|
22
|
+
}.to raise_error IndexError
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
26
|
describe '#text_at' do
|
27
27
|
it 'can read the first test image' do
|
28
28
|
engine.image = 'first.png'
|
29
|
-
|
29
|
+
|
30
|
+
expect(engine.text_at(2, 2, 2, 2).strip).to eq('')
|
30
31
|
end
|
31
32
|
|
32
33
|
it 'can read the second test image' do
|
33
34
|
engine.image = 'second.png'
|
34
|
-
|
35
|
+
|
36
|
+
expect(engine.text_at(242, 191, 129, 31).strip).to eq('OH HAI 1234')
|
35
37
|
end
|
36
38
|
|
37
39
|
it 'raises when going out of the image boundaries' do
|
38
40
|
expect {
|
39
41
|
engine.image = 'second.png'
|
40
42
|
engine.text_at(10, 20, 1000, 1000)
|
41
|
-
}.
|
43
|
+
}.to raise_error IndexError
|
42
44
|
end
|
43
45
|
end
|
44
46
|
|
@@ -47,13 +49,14 @@ describe Tesseract::Engine do
|
|
47
49
|
engine.image = 'first.png'
|
48
50
|
engine.select 2, 2, 2, 2
|
49
51
|
|
50
|
-
engine.text.strip.
|
52
|
+
expect(engine.text.strip).to eq('')
|
51
53
|
end
|
52
54
|
|
53
55
|
it 'can read the second test image' do
|
54
56
|
engine.image = 'second.png'
|
55
57
|
engine.select 242, 191, 129, 31
|
56
|
-
|
58
|
+
|
59
|
+
expect(engine.text.strip).to eq('OH HAI 1234')
|
57
60
|
end
|
58
61
|
|
59
62
|
it 'raises when going out of the image boundaries' do
|
@@ -61,59 +64,83 @@ describe Tesseract::Engine do
|
|
61
64
|
engine.image = 'second.png'
|
62
65
|
engine.select 10, 20, 1000, 1000
|
63
66
|
engine.text
|
64
|
-
}.
|
67
|
+
}.to raise_error IndexError
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
describe '#hocr' do
|
72
|
+
it 'can read the first test image' do
|
73
|
+
engine.image = 'first.png'
|
74
|
+
engine.select 2, 2, 2, 2
|
75
|
+
|
76
|
+
expect(engine.hocr).to eq(" <div class='ocr_page' id='page_1' title='image \"\"; bbox 2 2 2 2; ppageno 0'>\n </div>\n")
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'can read the second test image' do
|
80
|
+
engine.image = 'second.png'
|
81
|
+
engine.select 242, 191, 129, 31
|
82
|
+
|
83
|
+
expect(engine.hocr).to eq(" <div class='ocr_page' id='page_1' title='image \"\"; bbox 242 191 129 31; ppageno 0'>\n <div class='ocr_carea' id='block_1_1' title=\"bbox 242 191 371 222\">\n <p class='ocr_par' dir='ltr' id='par_1_1' title=\"bbox 250 200 365 213\">\n <span class='ocr_line' id='line_1_1' title=\"bbox 250 200 365 213; baseline 0 0\"><span class='ocrx_word' id='word_1_1' title='bbox 250 200 275 213; x_wconf 94' lang='eng' dir='ltr'>OH</span> <span class='ocrx_word' id='word_1_2' title='bbox 285 200 313 213; x_wconf 90' lang='eng' dir='ltr'>HAI</span> <span class='ocrx_word' id='word_1_3' title='bbox 323 200 365 213; x_wconf 86' lang='eng'>1234</span> \n </span>\n </p>\n </div>\n </div>\n")
|
65
84
|
end
|
66
85
|
|
86
|
+
it 'raises when going out of the image boundaries' do
|
87
|
+
expect {
|
88
|
+
engine.image = 'second.png'
|
89
|
+
engine.select 10, 20, 1000, 1000
|
90
|
+
|
91
|
+
engine.hocr
|
92
|
+
}.to raise_error IndexError
|
93
|
+
end
|
67
94
|
end
|
68
95
|
|
69
96
|
describe '#blacklist' do
|
70
97
|
it 'works with removing weird signs' do
|
71
|
-
engine.with { |e| e.blacklist = '|' }.text_for('second.png').strip.
|
98
|
+
expect(engine.with { |e| e.blacklist = '|' }.text_for('second.png').strip).to eq("I'm 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234")
|
72
99
|
end
|
73
100
|
end
|
74
101
|
|
75
102
|
describe '#whitelist' do
|
76
103
|
it 'makes everything into a number' do
|
77
|
-
engine.with { |e| e.whitelist = '1234567890' }.text_for('second.png').strip.
|
104
|
+
expect(engine.with { |e| e.whitelist = '1234567890' }.text_for('second.png').strip).to match(/^[\d\s]*$/)
|
78
105
|
end
|
79
106
|
end
|
80
107
|
|
81
108
|
describe '#page_segmentation_mode' do
|
82
109
|
it 'sets it correctly' do
|
83
|
-
engine.with {|e|
|
110
|
+
expect(engine.with {|e|
|
84
111
|
e.page_segmentation_mode = :single_line
|
85
112
|
e.whitelist = [*'a'..'z', *'A'..'Z', *0..9, " ."].join
|
86
|
-
}.text_for('jsmj.png').strip.
|
113
|
+
}.text_for('jsmj.png').strip).to eq('Jsmj')
|
87
114
|
end
|
88
115
|
end
|
89
116
|
|
90
117
|
describe '#blocks' do
|
91
118
|
it 'works properly with first image' do
|
92
|
-
engine.blocks_for('first.png').first.to_s.strip.
|
119
|
+
expect(engine.blocks_for('first.png').first.to_s.strip).to eq('ABC')
|
93
120
|
end
|
94
121
|
end
|
95
122
|
|
96
123
|
describe '#paragraphs' do
|
97
124
|
it 'works properly with first image' do
|
98
|
-
engine.paragraphs_for('first.png').first.to_s.strip.
|
125
|
+
expect(engine.paragraphs_for('first.png').first.to_s.strip).to eq('ABC')
|
99
126
|
end
|
100
127
|
end
|
101
128
|
|
102
129
|
describe '#lines' do
|
103
130
|
it 'works properly with first image' do
|
104
|
-
engine.lines_for('first.png').first.to_s.strip.
|
131
|
+
expect(engine.lines_for('first.png').first.to_s.strip).to eq('ABC')
|
105
132
|
end
|
106
133
|
end
|
107
134
|
|
108
135
|
describe '#words' do
|
109
136
|
it 'works properly with first image' do
|
110
|
-
engine.words_for('first.png').first.to_s.
|
137
|
+
expect(engine.words_for('first.png').first.to_s).to eq('ABC')
|
111
138
|
end
|
112
139
|
end
|
113
140
|
|
114
141
|
describe '#symbols' do
|
115
142
|
it 'works properly with first image' do
|
116
|
-
engine.symbols_for('first.png').first.to_s.
|
143
|
+
expect(engine.symbols_for('first.png').first.to_s).to eq('A')
|
117
144
|
end
|
118
145
|
end
|
119
146
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tesseract-ocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- meh.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: call-me
|
@@ -243,4 +243,3 @@ test_files:
|
|
243
243
|
- test/tesseract_spec.rb
|
244
244
|
- test/test-european.jpg
|
245
245
|
- test/test.png
|
246
|
-
has_rdoc:
|