tesseract-ocr 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8ba69437f990d1dec349c582d119d1d8a5007477
4
- data.tar.gz: 7e536618d411e1e568afb1a57958b10074d8c5db
3
+ metadata.gz: 15d5c5417af8a215178f247ebca4750d2a7cbbcb
4
+ data.tar.gz: 35e1a440418421cc2b22f2f7f8e095600911e04c
5
5
  SHA512:
6
- metadata.gz: c6509845536b6acc5451974756bc006bbdea858968f3cd793d3d59f57973eb8f777856ab230e9935caccb902ec73a4444de47527132d877b495fa30538dfd77b
7
- data.tar.gz: 3528e0185fbf1c8c57284baa753e9e75c886f12952ebc2fb69169d9b08ea89a99b0b0df915b5f18e5dc00958baadbaabe696146d3a33284e3186eee100bd5078
6
+ metadata.gz: 71064917075114639018f60de081afb1cf164677b1cd9bbec940fe1f34063b0ef0a823f3ee0f2dba434f03775d000e4e50b7a17f93285b79d9982fa2d563f475
7
+ data.tar.gz: 188d4e48122324899a2d83b97585a932938a9905b4de3960c6496748ee4b352707a9e80455905da446b469afba47c8e87c9337270130b8972309322e8bfd5576
@@ -99,8 +99,8 @@ class API
99
99
  end
100
100
  end
101
101
 
102
- def init (datapath = Tesseract.prefix || '.', language = 'eng', mode = :DEFAULT)
103
- unless C::BaseAPI.init(to_ffi, datapath, language.to_s, mode).zero?
102
+ def init (datapath = nil, language = 'eng', mode = :DEFAULT)
103
+ unless C::BaseAPI.init(to_ffi, datapath || Tesseract.prefix || '/usr/share', language.to_s, mode).zero?
104
104
  raise 'the API did not Init correctly'
105
105
  end
106
106
  end
@@ -166,6 +166,19 @@ class API
166
166
  C.free_array_of_char(pointer) unless pointer.null?
167
167
  end
168
168
 
169
+ def get_hocr(page = 0)
170
+ pointer = C::BaseAPI.get_hocr_text(to_ffi, page)
171
+
172
+ return if pointer.null?
173
+
174
+ result = pointer.read_string
175
+ result.force_encoding 'UTF-8'
176
+
177
+ result
178
+ ensure
179
+ C.free_array_of_char(pointer) unless pointer.null?
180
+ end
181
+
169
182
  def get_box (page = 0)
170
183
  pointer = C::BaseAPI.get_box_text(to_ffi, page)
171
184
  result = pointer.read_string
@@ -202,6 +202,12 @@ module BaseAPI
202
202
  }
203
203
  }, blocking: true
204
204
 
205
+ cpp.function %{
206
+ char* get_hocr_text (TessBaseAPI* api, int page_number) {
207
+ return api->GetHOCRText(page_number);
208
+ }
209
+ }, blocking: true
210
+
205
211
  cpp.function %{
206
212
  char* get_box_text (TessBaseAPI* api, int page_number) {
207
213
  return api->GetBoxText(page_number);
@@ -32,9 +32,9 @@ class Engine
32
32
  attr_reader :config
33
33
 
34
34
  named :path, :language, :mode, :variables,
35
- :optional => { :path => '.', :language => :eng, :mode => :DEFAULT, :variables => {}, :config => [] },
35
+ :optional => { :path => nil, :language => :eng, :mode => :DEFAULT, :variables => {}, :config => [] },
36
36
  :alias => { :data => :path, :lang => :language }
37
- def initialize (path = '.', language = :eng, mode = :DEFAULT, variables = {}, config = [], &block) # :yields: self
37
+ def initialize (path = nil, language = :eng, mode = :DEFAULT, variables = {}, config = [], &block) # :yields: self
38
38
  @api = API.new
39
39
 
40
40
  @initializing = true
@@ -171,6 +171,26 @@ class Engine
171
171
  text_at
172
172
  end
173
173
 
174
+ named :image, :x, :y, :width, :height,
175
+ :optional => 0 .. -1,
176
+ :alias => { :w => :width, :h => :height }
177
+ def hocr_for (image = nil, x = nil, y = nil, width = nil, height = nil, page = nil)
178
+ _setup(image, x, y, width, height)
179
+
180
+ @api.get_hocr(page || 0)
181
+ end
182
+
183
+ named :x, :y, :width, :height,
184
+ :optional => 0 .. -1,
185
+ :alias => { :w => :width, :h => :height }
186
+ def hocr_at (x = nil, y = nil, width = nil, height = nil, page = nil)
187
+ hocr_for(nil, x, y, width, height, page)
188
+ end
189
+
190
+ def hocr
191
+ hocr_at
192
+ end
193
+
174
194
  %w(block paragraph line word symbol).each {|level|
175
195
  define_method "each_#{level}" do |&block|
176
196
  raise ArgumentError, 'you have to pass a block' unless block
@@ -231,7 +251,7 @@ protected
231
251
  def _init
232
252
  @api.end
233
253
 
234
- @api.init(File.expand_path(@path), API.to_language_code(@language), @mode)
254
+ @api.init(@path, API.to_language_code(@language), @mode)
235
255
 
236
256
  @variables.each {|name, value|
237
257
  @api.set_variable(name.to_s, value.to_s)
@@ -24,6 +24,6 @@
24
24
 
25
25
  module Tesseract
26
26
  def self.version
27
- '0.1.7'
27
+ '0.1.8'
28
28
  end
29
29
  end
@@ -9,36 +9,38 @@ describe Tesseract::Engine do
9
9
 
10
10
  describe '#text_for' do
11
11
  it 'can read the first test image' do
12
- engine.text_for('first.png').strip.should == 'ABC'
12
+ expect(engine.text_for('first.png').strip).to eq('ABC')
13
13
  end
14
14
 
15
15
  it 'can read the second test image' do
16
- engine.text_for('second.png').strip.should == "#{Tesseract::API.new.version == '3.01' ? ?| : ?I}'m 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234"
16
+ expect(engine.text_for('second.png').strip).to eq("#{Tesseract::API.new.version == '3.01' ? ?| : ?I}'m 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234")
17
17
  end
18
18
 
19
19
  it 'raises when going out of the image boundaries' do
20
20
  expect {
21
21
  engine.text_for('second.png', 0, 0, 1000, 1000)
22
- }.should raise_error
22
+ }.to raise_error IndexError
23
23
  end
24
24
  end
25
25
 
26
26
  describe '#text_at' do
27
27
  it 'can read the first test image' do
28
28
  engine.image = 'first.png'
29
- engine.text_at(2, 2, 2, 2).strip.should == ''
29
+
30
+ expect(engine.text_at(2, 2, 2, 2).strip).to eq('')
30
31
  end
31
32
 
32
33
  it 'can read the second test image' do
33
34
  engine.image = 'second.png'
34
- engine.text_at(242, 191, 129, 31).strip.should == 'OH HAI 1234'
35
+
36
+ expect(engine.text_at(242, 191, 129, 31).strip).to eq('OH HAI 1234')
35
37
  end
36
38
 
37
39
  it 'raises when going out of the image boundaries' do
38
40
  expect {
39
41
  engine.image = 'second.png'
40
42
  engine.text_at(10, 20, 1000, 1000)
41
- }.should raise_error
43
+ }.to raise_error IndexError
42
44
  end
43
45
  end
44
46
 
@@ -47,13 +49,14 @@ describe Tesseract::Engine do
47
49
  engine.image = 'first.png'
48
50
  engine.select 2, 2, 2, 2
49
51
 
50
- engine.text.strip.should == ''
52
+ expect(engine.text.strip).to eq('')
51
53
  end
52
54
 
53
55
  it 'can read the second test image' do
54
56
  engine.image = 'second.png'
55
57
  engine.select 242, 191, 129, 31
56
- engine.text.strip.should == 'OH HAI 1234'
58
+
59
+ expect(engine.text.strip).to eq('OH HAI 1234')
57
60
  end
58
61
 
59
62
  it 'raises when going out of the image boundaries' do
@@ -61,59 +64,83 @@ describe Tesseract::Engine do
61
64
  engine.image = 'second.png'
62
65
  engine.select 10, 20, 1000, 1000
63
66
  engine.text
64
- }.should raise_error
67
+ }.to raise_error IndexError
68
+ end
69
+ end
70
+
71
+ describe '#hocr' do
72
+ it 'can read the first test image' do
73
+ engine.image = 'first.png'
74
+ engine.select 2, 2, 2, 2
75
+
76
+ expect(engine.hocr).to eq(" <div class='ocr_page' id='page_1' title='image \"\"; bbox 2 2 2 2; ppageno 0'>\n </div>\n")
77
+ end
78
+
79
+ it 'can read the second test image' do
80
+ engine.image = 'second.png'
81
+ engine.select 242, 191, 129, 31
82
+
83
+ expect(engine.hocr).to eq(" <div class='ocr_page' id='page_1' title='image \"\"; bbox 242 191 129 31; ppageno 0'>\n <div class='ocr_carea' id='block_1_1' title=\"bbox 242 191 371 222\">\n <p class='ocr_par' dir='ltr' id='par_1_1' title=\"bbox 250 200 365 213\">\n <span class='ocr_line' id='line_1_1' title=\"bbox 250 200 365 213; baseline 0 0\"><span class='ocrx_word' id='word_1_1' title='bbox 250 200 275 213; x_wconf 94' lang='eng' dir='ltr'>OH</span> <span class='ocrx_word' id='word_1_2' title='bbox 285 200 313 213; x_wconf 90' lang='eng' dir='ltr'>HAI</span> <span class='ocrx_word' id='word_1_3' title='bbox 323 200 365 213; x_wconf 86' lang='eng'>1234</span> \n </span>\n </p>\n </div>\n </div>\n")
65
84
  end
66
85
 
86
+ it 'raises when going out of the image boundaries' do
87
+ expect {
88
+ engine.image = 'second.png'
89
+ engine.select 10, 20, 1000, 1000
90
+
91
+ engine.hocr
92
+ }.to raise_error IndexError
93
+ end
67
94
  end
68
95
 
69
96
  describe '#blacklist' do
70
97
  it 'works with removing weird signs' do
71
- engine.with { |e| e.blacklist = '|' }.text_for('second.png').strip.should == "I'm 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234"
98
+ expect(engine.with { |e| e.blacklist = '|' }.text_for('second.png').strip).to eq("I'm 12 and what is this.\nINSTALL GENTOO\nOH HAI 1234")
72
99
  end
73
100
  end
74
101
 
75
102
  describe '#whitelist' do
76
103
  it 'makes everything into a number' do
77
- engine.with { |e| e.whitelist = '1234567890' }.text_for('second.png').strip.should match(/^[\d\s]*$/)
104
+ expect(engine.with { |e| e.whitelist = '1234567890' }.text_for('second.png').strip).to match(/^[\d\s]*$/)
78
105
  end
79
106
  end
80
107
 
81
108
  describe '#page_segmentation_mode' do
82
109
  it 'sets it correctly' do
83
- engine.with {|e|
110
+ expect(engine.with {|e|
84
111
  e.page_segmentation_mode = :single_line
85
112
  e.whitelist = [*'a'..'z', *'A'..'Z', *0..9, " ."].join
86
- }.text_for('jsmj.png').strip.should == 'JSmj'
113
+ }.text_for('jsmj.png').strip).to eq('Jsmj')
87
114
  end
88
115
  end
89
116
 
90
117
  describe '#blocks' do
91
118
  it 'works properly with first image' do
92
- engine.blocks_for('first.png').first.to_s.strip.should == 'ABC'
119
+ expect(engine.blocks_for('first.png').first.to_s.strip).to eq('ABC')
93
120
  end
94
121
  end
95
122
 
96
123
  describe '#paragraphs' do
97
124
  it 'works properly with first image' do
98
- engine.paragraphs_for('first.png').first.to_s.strip.should == 'ABC'
125
+ expect(engine.paragraphs_for('first.png').first.to_s.strip).to eq('ABC')
99
126
  end
100
127
  end
101
128
 
102
129
  describe '#lines' do
103
130
  it 'works properly with first image' do
104
- engine.lines_for('first.png').first.to_s.strip.should == 'ABC'
131
+ expect(engine.lines_for('first.png').first.to_s.strip).to eq('ABC')
105
132
  end
106
133
  end
107
134
 
108
135
  describe '#words' do
109
136
  it 'works properly with first image' do
110
- engine.words_for('first.png').first.to_s.should == 'ABC'
137
+ expect(engine.words_for('first.png').first.to_s).to eq('ABC')
111
138
  end
112
139
  end
113
140
 
114
141
  describe '#symbols' do
115
142
  it 'works properly with first image' do
116
- engine.symbols_for('first.png').first.to_s.should == 'A'
143
+ expect(engine.symbols_for('first.png').first.to_s).to eq('A')
117
144
  end
118
145
  end
119
146
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tesseract-ocr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - meh.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-11 00:00:00.000000000 Z
11
+ date: 2015-06-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: call-me
@@ -243,4 +243,3 @@ test_files:
243
243
  - test/tesseract_spec.rb
244
244
  - test/test-european.jpg
245
245
  - test/test.png
246
- has_rdoc: