tesseract-ocr 0.1.7 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8ba69437f990d1dec349c582d119d1d8a5007477
4
- data.tar.gz: 7e536618d411e1e568afb1a57958b10074d8c5db
3
+ metadata.gz: 15d5c5417af8a215178f247ebca4750d2a7cbbcb
4
+ data.tar.gz: 35e1a440418421cc2b22f2f7f8e095600911e04c
5
5
  SHA512:
6
- metadata.gz: c6509845536b6acc5451974756bc006bbdea858968f3cd793d3d59f57973eb8f777856ab230e9935caccb902ec73a4444de47527132d877b495fa30538dfd77b
7
- data.tar.gz: 3528e0185fbf1c8c57284baa753e9e75c886f12952ebc2fb69169d9b08ea89a99b0b0df915b5f18e5dc00958baadbaabe696146d3a33284e3186eee100bd5078
6
+ metadata.gz: 71064917075114639018f60de081afb1cf164677b1cd9bbec940fe1f34063b0ef0a823f3ee0f2dba434f03775d000e4e50b7a17f93285b79d9982fa2d563f475
7
+ data.tar.gz: 188d4e48122324899a2d83b97585a932938a9905b4de3960c6496748ee4b352707a9e80455905da446b469afba47c8e87c9337270130b8972309322e8bfd5576
@@ -99,8 +99,8 @@ class API
99
99
  end
100
100
  end
101
101
 
102
- def init (datapath = Tesseract.prefix || '.', language = 'eng', mode = :DEFAULT)
103
- unless C::BaseAPI.init(to_ffi, datapath, language.to_s, mode).zero?
102
+ def init (datapath = nil, language = 'eng', mode = :DEFAULT)
103
+ unless C::BaseAPI.init(to_ffi, datapath || Tesseract.prefix || '/usr/share', language.to_s, mode).zero?
104
104
  raise 'the API did not Init correctly'
105
105
  end
106
106
  end
@@ -166,6 +166,19 @@ class API
166
166
  C.free_array_of_char(pointer) unless pointer.null?
167
167
  end
168
168
 
169
+ def get_hocr(page = 0)
170
+ pointer = C::BaseAPI.get_hocr_text(to_ffi, page)
171
+
172
+ return if pointer.null?
173
+
174
+ result = pointer.read_string
175
+ result.force_encoding 'UTF-8'
176
+
177
+ result
178
+ ensure
179
+ C.free_array_of_char(pointer) unless pointer.null?
180
+ end
181
+
169
182
  def get_box (page = 0)
170
183
  pointer = C::BaseAPI.get_box_text(to_ffi, page)
171
184
  result = pointer.read_string
@@ -202,6 +202,12 @@ module BaseAPI
202
202
  }
203
203
  }, blocking: true
204
204
 
205
+ cpp.function %{
206
+ char* get_hocr_text (TessBaseAPI* api, int page_number) {
207
+ return api->GetHOCRText(page_number);
208
+ }
209
+ }, blocking: true
210
+
205
211
  cpp.function %{
206
212
  char* get_box_text (TessBaseAPI* api, int page_number) {
207
213
  return api->GetBoxText(page_number);
@@ -32,9 +32,9 @@ class Engine
32
32
  attr_reader :config
33
33
 
34
34
  named :path, :language, :mode, :variables,
35
- :optional => { :path => '.', :language => :eng, :mode => :DEFAULT, :variables => {}, :config => [] },
35
+ :optional => { :path => nil, :language => :eng, :mode => :DEFAULT, :variables => {}, :config => [] },
36
36
  :alias => { :data => :path, :lang => :language }
37
- def initialize (path = '.', language = :eng, mode = :DEFAULT, variables = {}, config = [], &block) # :yields: self
37
+ def initialize (path = nil, language = :eng, mode = :DEFAULT, variables = {}, config = [], &block) # :yields: self
38
38
  @api = API.new
39
39
 
40
40
  @initializing = true
@@ -171,6 +171,26 @@ class Engine
171
171
  text_at
172
172
  end
173
173
 
174
+ named :image, :x, :y, :width, :height,
175
+ :optional => 0 .. -1,
176
+ :alias => { :w => :width, :h => :height }
177
+ def hocr_for (image = nil, x = nil, y = nil, width = nil, height = nil, page = nil)
178
+ _setup(image, x, y, width, height)
179
+
180
+ @api.get_hocr(page || 0)
181
+ end
182
+
183
+ named :x, :y, :width, :height,
184
+ :optional => 0 .. -1,
185
+ :alias => { :w => :width, :h => :height }
186
+ def hocr_at (x = nil, y = nil, width = nil, height = nil, page = nil)
187
+ hocr_for(nil, x, y, width, height, page)
188
+ end
189
+
190
+ def hocr
191
+ hocr_at
192
+ end
193
+
174
194
  %w(block paragraph line word symbol).each {|level|
175
195
  define_method "each_#{level}" do |&block|
176
196
  raise ArgumentError, 'you have to pass a block' unless block
@@ -231,7 +251,7 @@ protected
231
251
  def _init
232
252
  @api.end
233
253
 
234
- @api.init(File.expand_path(@path), API.to_language_code(@language), @mode)
254
+ @api.init(@path, API.to_language_code(@language), @mode)
235
255
 
236
256
  @variables.each {|name, value|
237
257
  @api.set_variable(name.to_s, value.to_s)
@@ -24,6 +24,6 @@
24
24
 
25
25
  module Tesseract
26
26
  def self.version
27
- '0.1.7'
27
+ '0.1.8'
28
28
  end
29
29
  end
@@ -9,36 +9,38 @@ describe Tesseract::Engine do
9
9
 
10
10
  describe '#text_for' do
11
11
  it 'can read the first test image' do
12
- engine.text_for('first.png').strip.should == 'ABC'
12
+ expect(engine.text_for('first.png').strip).to eq('ABC')
13
13
  end
14
14
 
15
15
  it 'can read the second test image' do
16
- engine.text_for('second.png').strip.should == "#{Tesseract::API.new.version == '3.01' ? ?| : ?I}'m 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234"
16
+ expect(engine.text_for('second.png').strip).to eq("#{Tesseract::API.new.version == '3.01' ? ?| : ?I}'m 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234")
17
17
  end
18
18
 
19
19
  it 'raises when going out of the image boundaries' do
20
20
  expect {
21
21
  engine.text_for('second.png', 0, 0, 1000, 1000)
22
- }.should raise_error
22
+ }.to raise_error IndexError
23
23
  end
24
24
  end
25
25
 
26
26
  describe '#text_at' do
27
27
  it 'can read the first test image' do
28
28
  engine.image = 'first.png'
29
- engine.text_at(2, 2, 2, 2).strip.should == ''
29
+
30
+ expect(engine.text_at(2, 2, 2, 2).strip).to eq('')
30
31
  end
31
32
 
32
33
  it 'can read the second test image' do
33
34
  engine.image = 'second.png'
34
- engine.text_at(242, 191, 129, 31).strip.should == 'OH HAI 1234'
35
+
36
+ expect(engine.text_at(242, 191, 129, 31).strip).to eq('OH HAI 1234')
35
37
  end
36
38
 
37
39
  it 'raises when going out of the image boundaries' do
38
40
  expect {
39
41
  engine.image = 'second.png'
40
42
  engine.text_at(10, 20, 1000, 1000)
41
- }.should raise_error
43
+ }.to raise_error IndexError
42
44
  end
43
45
  end
44
46
 
@@ -47,13 +49,14 @@ describe Tesseract::Engine do
47
49
  engine.image = 'first.png'
48
50
  engine.select 2, 2, 2, 2
49
51
 
50
- engine.text.strip.should == ''
52
+ expect(engine.text.strip).to eq('')
51
53
  end
52
54
 
53
55
  it 'can read the second test image' do
54
56
  engine.image = 'second.png'
55
57
  engine.select 242, 191, 129, 31
56
- engine.text.strip.should == 'OH HAI 1234'
58
+
59
+ expect(engine.text.strip).to eq('OH HAI 1234')
57
60
  end
58
61
 
59
62
  it 'raises when going out of the image boundaries' do
@@ -61,59 +64,83 @@ describe Tesseract::Engine do
61
64
  engine.image = 'second.png'
62
65
  engine.select 10, 20, 1000, 1000
63
66
  engine.text
64
- }.should raise_error
67
+ }.to raise_error IndexError
68
+ end
69
+ end
70
+
71
+ describe '#hocr' do
72
+ it 'can read the first test image' do
73
+ engine.image = 'first.png'
74
+ engine.select 2, 2, 2, 2
75
+
76
+ expect(engine.hocr).to eq(" <div class='ocr_page' id='page_1' title='image \"\"; bbox 2 2 2 2; ppageno 0'>\n </div>\n")
77
+ end
78
+
79
+ it 'can read the second test image' do
80
+ engine.image = 'second.png'
81
+ engine.select 242, 191, 129, 31
82
+
83
+ expect(engine.hocr).to eq(" <div class='ocr_page' id='page_1' title='image \"\"; bbox 242 191 129 31; ppageno 0'>\n <div class='ocr_carea' id='block_1_1' title=\"bbox 242 191 371 222\">\n <p class='ocr_par' dir='ltr' id='par_1_1' title=\"bbox 250 200 365 213\">\n <span class='ocr_line' id='line_1_1' title=\"bbox 250 200 365 213; baseline 0 0\"><span class='ocrx_word' id='word_1_1' title='bbox 250 200 275 213; x_wconf 94' lang='eng' dir='ltr'>OH</span> <span class='ocrx_word' id='word_1_2' title='bbox 285 200 313 213; x_wconf 90' lang='eng' dir='ltr'>HAI</span> <span class='ocrx_word' id='word_1_3' title='bbox 323 200 365 213; x_wconf 86' lang='eng'>1234</span> \n </span>\n </p>\n </div>\n </div>\n")
65
84
  end
66
85
 
86
+ it 'raises when going out of the image boundaries' do
87
+ expect {
88
+ engine.image = 'second.png'
89
+ engine.select 10, 20, 1000, 1000
90
+
91
+ engine.hocr
92
+ }.to raise_error IndexError
93
+ end
67
94
  end
68
95
 
69
96
  describe '#blacklist' do
70
97
  it 'works with removing weird signs' do
71
- engine.with { |e| e.blacklist = '|' }.text_for('second.png').strip.should == "I'm 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234"
98
+ expect(engine.with { |e| e.blacklist = '|' }.text_for('second.png').strip).to eq("I'm 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234")
72
99
  end
73
100
  end
74
101
 
75
102
  describe '#whitelist' do
76
103
  it 'makes everything into a number' do
77
- engine.with { |e| e.whitelist = '1234567890' }.text_for('second.png').strip.should match(/^[\d\s]*$/)
104
+ expect(engine.with { |e| e.whitelist = '1234567890' }.text_for('second.png').strip).to match(/^[\d\s]*$/)
78
105
  end
79
106
  end
80
107
 
81
108
  describe '#page_segmentation_mode' do
82
109
  it 'sets it correctly' do
83
- engine.with {|e|
110
+ expect(engine.with {|e|
84
111
  e.page_segmentation_mode = :single_line
85
112
  e.whitelist = [*'a'..'z', *'A'..'Z', *0..9, " ."].join
86
- }.text_for('jsmj.png').strip.should == 'JSmj'
113
+ }.text_for('jsmj.png').strip).to eq('Jsmj')
87
114
  end
88
115
  end
89
116
 
90
117
  describe '#blocks' do
91
118
  it 'works properly with first image' do
92
- engine.blocks_for('first.png').first.to_s.strip.should == 'ABC'
119
+ expect(engine.blocks_for('first.png').first.to_s.strip).to eq('ABC')
93
120
  end
94
121
  end
95
122
 
96
123
  describe '#paragraphs' do
97
124
  it 'works properly with first image' do
98
- engine.paragraphs_for('first.png').first.to_s.strip.should == 'ABC'
125
+ expect(engine.paragraphs_for('first.png').first.to_s.strip).to eq('ABC')
99
126
  end
100
127
  end
101
128
 
102
129
  describe '#lines' do
103
130
  it 'works properly with first image' do
104
- engine.lines_for('first.png').first.to_s.strip.should == 'ABC'
131
+ expect(engine.lines_for('first.png').first.to_s.strip).to eq('ABC')
105
132
  end
106
133
  end
107
134
 
108
135
  describe '#words' do
109
136
  it 'works properly with first image' do
110
- engine.words_for('first.png').first.to_s.should == 'ABC'
137
+ expect(engine.words_for('first.png').first.to_s).to eq('ABC')
111
138
  end
112
139
  end
113
140
 
114
141
  describe '#symbols' do
115
142
  it 'works properly with first image' do
116
- engine.symbols_for('first.png').first.to_s.should == 'A'
143
+ expect(engine.symbols_for('first.png').first.to_s).to eq('A')
117
144
  end
118
145
  end
119
146
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tesseract-ocr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - meh.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-11 00:00:00.000000000 Z
11
+ date: 2015-06-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: call-me
@@ -243,4 +243,3 @@ test_files:
243
243
  - test/tesseract_spec.rb
244
244
  - test/test-european.jpg
245
245
  - test/test.png
246
- has_rdoc: