rtesseract 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 695108fd1fe3b6bb921444dc8daadb248466dd13
4
- data.tar.gz: d2e8b38f6a54c7ffd004863de72a9d880be9bdbc
3
+ metadata.gz: 6eae58279cf744227e79b7bbc9180f7aea852547
4
+ data.tar.gz: 3836aa96d24b7f1a0b957cf803553f547cc33544
5
5
  SHA512:
6
- metadata.gz: c28af3ffb9d288fb580d22f68f99d3e159919284735ff6cf84a91ba8da636d4c771568019c0db4ae968bbad030cfc8e187100ca075b7904fb05eda5658ca8c0d
7
- data.tar.gz: 3fdb3195471c7b0a3674c000d6ebafe4a7474cb6336911ff93e898233f872bb220b56b309be2e1148a19eb2abd1a764fa60ef09bfcfea66a103bdb35f836d8bd
6
+ metadata.gz: 0ef57359c7c7f43094a50838b6d29d28d7808c9cadd8f2b8514c613be030161f8d640c41ba3d403c00fb59fdf85ffcbc57795f6c65b8418ad348eb1a6c07e901
7
+ data.tar.gz: ff5f0f94c8039bd0b38b0c9ec2618b4c38b07b9707e28ff29a3bb943abc85d5afaa543dfba1ba2b9e565d056ea558eda9b7f6d222a6adb43614cd86c6e8fdcac
data/.travis.yml CHANGED
@@ -3,6 +3,7 @@ addons:
3
3
  apt:
4
4
  packages:
5
5
  - tesseract-ocr
6
+
6
7
  sudo: false
7
8
  rvm:
8
9
  - 1.9.3
data/CHANGELOG.md ADDED
@@ -0,0 +1,22 @@
1
+ ## v2.0.1
2
+
3
+ #### Changed
4
+
5
+ * Refactoring of some small classes
6
+
7
+ ## v2.0.0
8
+
9
+ #### Added
10
+
11
+ * Support for the options --tessdata-dir, --user-words and --user-patterns
12
+ * Ruby 2.3.0 to the Travis tests.
13
+
14
+ #### Changed
15
+
16
+ * Refactoring of some classes
17
+ * Crop options are now a hash with x, y, w, h keys.
18
+ * Areas of RTesseract::Mixed now use :w and :h instead of :width and :height.
19
+
20
+ #### Removed
21
+
22
+ * Support for the quick_magick gem.
data/Gemfile.lock CHANGED
@@ -3,9 +3,8 @@ GEM
3
3
  specs:
4
4
  addressable (2.4.0)
5
5
  builder (3.2.2)
6
- coveralls (0.8.10)
6
+ coveralls (0.8.13)
7
7
  json (~> 1.8)
8
- rest-client (>= 1.6.8, < 2)
9
8
  simplecov (~> 0.11.0)
10
9
  term-ansicolor (~> 1.3)
11
10
  thor (~> 0.19.1)
@@ -14,8 +13,6 @@ GEM
14
13
  thread_safe (~> 0.3, >= 0.3.1)
15
14
  diff-lcs (1.2.5)
16
15
  docile (1.1.5)
17
- domain_name (0.5.25)
18
- unf (>= 0.0.5, < 1.0.0)
19
16
  faraday (0.9.2)
20
17
  multipart-post (>= 1.2, < 3)
21
18
  git (1.3.0)
@@ -28,8 +25,6 @@ GEM
28
25
  oauth2
29
26
  hashie (3.4.3)
30
27
  highline (1.7.8)
31
- http-cookie (1.0.2)
32
- domain_name (~> 0.5)
33
28
  jeweler (2.1.1)
34
29
  builder
35
30
  bundler (>= 1.0)
@@ -42,13 +37,11 @@ GEM
42
37
  semver
43
38
  json (1.8.3)
44
39
  jwt (1.5.1)
45
- mime-types (2.99)
46
- mini_magick (4.3.6)
40
+ mini_magick (4.5.1)
47
41
  mini_portile2 (2.0.0)
48
42
  multi_json (1.11.2)
49
43
  multi_xml (0.5.5)
50
44
  multipart-post (2.0.0)
51
- netrc (0.11.0)
52
45
  nokogiri (1.6.7.2)
53
46
  mini_portile2 (~> 2.0.0.rc2)
54
47
  oauth2 (1.1.0)
@@ -61,26 +54,22 @@ GEM
61
54
  rake (11.1.2)
62
55
  rdoc (4.2.2)
63
56
  json (~> 1.4)
64
- rest-client (1.8.0)
65
- http-cookie (>= 1.0.2, < 2.0)
66
- mime-types (>= 1.16, < 3.0)
67
- netrc (~> 0.7)
68
57
  rmagick (2.15.4)
69
58
  rspec (3.4.0)
70
59
  rspec-core (~> 3.4.0)
71
60
  rspec-expectations (~> 3.4.0)
72
61
  rspec-mocks (~> 3.4.0)
73
- rspec-core (3.4.1)
62
+ rspec-core (3.4.4)
74
63
  rspec-support (~> 3.4.0)
75
64
  rspec-expectations (3.4.0)
76
65
  diff-lcs (>= 1.2.0, < 2.0)
77
66
  rspec-support (~> 3.4.0)
78
- rspec-mocks (3.4.0)
67
+ rspec-mocks (3.4.1)
79
68
  diff-lcs (>= 1.2.0, < 2.0)
80
69
  rspec-support (~> 3.4.0)
81
70
  rspec-support (3.4.1)
82
71
  semver (1.0.1)
83
- simplecov (0.11.1)
72
+ simplecov (0.11.2)
84
73
  docile (~> 1.1.0)
85
74
  json (~> 1.8)
86
75
  simplecov-html (~> 0.10.0)
@@ -90,9 +79,6 @@ GEM
90
79
  thor (0.19.1)
91
80
  thread_safe (0.3.5)
92
81
  tins (1.6.0)
93
- unf (0.1.4)
94
- unf_ext
95
- unf_ext (0.0.7.1)
96
82
 
97
83
  PLATFORMS
98
84
  ruby
@@ -109,4 +95,4 @@ DEPENDENCIES
109
95
  simplecov
110
96
 
111
97
  BUNDLED WITH
112
- 1.10.6
98
+ 1.11.2
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.0.0
1
+ 2.0.1
@@ -1,32 +1,43 @@
1
1
  # encoding: UTF-8
2
- # Add to rtesseract a image manipulation with MiniMagick
3
- module MiniMagickProcessor
4
- def self.setup
5
- require 'mini_magick'
6
- end
2
+ # RTesseract class
3
+ class RTesseract
4
+ # Processor Module
5
+ module Processor
6
+ # Add to rtesseract a image manipulation with MiniMagick
7
+ module MiniMagickProcessor
8
+ # Setup Processor
9
+ def self.setup
10
+ require 'mini_magick'
11
+ end
7
12
 
8
- def self.a_name?(name)
9
- %w(mini_magick MiniMagickProcessor).include?(name.to_s)
10
- end
13
+ # Check if is this Processor
14
+ def self.a_name?(name)
15
+ %w(mini_magick MiniMagickProcessor).include?(name.to_s)
16
+ end
11
17
 
12
- def self.image_to_tif(source, _points = {})
13
- tmp_file = Tempfile.new(['', '.tif'])
14
- cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
15
- cat.format('tif') do |c|
16
- c.compress 'None'
17
- c.alpha 'off'
18
- end
19
- cat.crop("#{_points[:w]}x#{_points[:h]}+#{_points[:x]}+#{_points[:y]}") if _points.is_a?(Hash) && _points.values.compact != []
20
- cat.alpha 'off'
21
- cat.write tmp_file.path.to_s
22
- tmp_file
23
- end
18
+ # Convert Image to Tiff
19
+ def self.image_to_tif(source, points = {})
20
+ tmp_file = Tempfile.new(['', '.tif'])
21
+ cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
22
+ cat.format('tif') do |c|
23
+ c.compress 'None'
24
+ c.alpha 'off'
25
+ end
26
+ cat.crop("#{points[:w]}x#{points[:h]}+#{points[:x]}+#{points[:y]}") if points.is_a?(Hash) && points.values.compact != []
27
+ cat.alpha 'off'
28
+ cat.write tmp_file.path.to_s
29
+ tmp_file
30
+ end
24
31
 
25
- def self.read_with_processor(path)
26
- MiniMagick::Image.open(path.to_s)
27
- end
32
+ # Cast instance of image
33
+ def self.read_with_processor(path)
34
+ MiniMagick::Image.open(path.to_s)
35
+ end
28
36
 
29
- def self.image?(object)
30
- object.class == MiniMagick::Image
37
+ # Check if is a MiniMagick image
38
+ def self.image?(object)
39
+ object.class == MiniMagick::Image
40
+ end
41
+ end
31
42
  end
32
- end
43
+ end
@@ -1,26 +1,34 @@
1
1
  # encoding: UTF-8
2
- # Add to rtesseract a image without manipulation
3
- module NoneProcessor
4
- def self.setup
5
- end
6
-
7
- def self.a_name?(name)
8
- %w(none NoneProcessor).include?(name.to_s)
9
- end
2
+ # RTesseract class
3
+ class RTesseract
4
+ # Processor Module
5
+ module Processor
6
+ # Add to rtesseract a image without manipulation
7
+ module NoneProcessor
8
+ # Setup Processor
9
+ def self.setup
10
+ end
10
11
 
11
- def self.image_to_tif(source, _points = {})
12
- tmp_file = Tempfile.new(['', '.tif'])
13
- tmp_file.write(read_with_processor(source))
14
- tmp_file
15
- end
12
+ # Check if is this Processor
13
+ def self.a_name?(name)
14
+ %w(none NoneProcessor).include?(name.to_s)
15
+ end
16
16
 
17
- def self.need_crop?(*)
18
- end
17
+ # Convert Image to Tiff
18
+ def self.image_to_tif(source, _points = {})
19
+ tmp_file = Tempfile.new(['', '.tif'])
20
+ tmp_file.write(read_with_processor(source))
21
+ tmp_file
22
+ end
19
23
 
20
- def self.read_with_processor(path)
21
- File.read(path)
22
- end
24
+ # Cast instance of image
25
+ def self.read_with_processor(path)
26
+ File.read(path)
27
+ end
23
28
 
24
- def self.image?(*)
29
+ # Check if is a image
30
+ def self.image?(*)
31
+ end
32
+ end
25
33
  end
26
34
  end
@@ -1,35 +1,46 @@
1
1
  # encoding: UTF-8
2
- # Add to rtesseract a image manipulation with RMagick
3
- module RMagickProcessor
4
- def self.setup
5
- require 'rmagick'
6
- rescue LoadError
7
- # :nocov:
8
- require 'RMagick'
9
- # :nocov:
10
- end
2
+ # RTesseract class
3
+ class RTesseract
4
+ # Processor Module
5
+ module Processor
6
+ # Add to rtesseract a image manipulation with RMagick
7
+ module RMagickProcessor
8
+ # Setup Processor
9
+ def self.setup
10
+ require 'rmagick'
11
+ rescue LoadError
12
+ # :nocov:
13
+ require 'RMagick'
14
+ # :nocov:
15
+ end
11
16
 
12
- def self.a_name?(name)
13
- %w(rmagick RMagickProcessor).include?(name.to_s)
14
- end
17
+ # Check if is this Processor
18
+ def self.a_name?(name)
19
+ %w(rmagick RMagickProcessor).include?(name.to_s)
20
+ end
15
21
 
16
- def self.image_to_tif(source, _points = {})
17
- tmp_file = Tempfile.new(['', '.tif'])
18
- cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
19
- cat.crop!(_points[:x], _points[:y], _points[:w], _points[:h]) if _points.is_a?(Hash) && _points.values.compact != []
20
- cat.alpha Magick::DeactivateAlphaChannel
21
- cat.write(tmp_file.path.to_s) do
22
- # self.depth = 16
23
- self.compression = Magick::NoCompression
24
- end
25
- tmp_file
26
- end
22
+ # Convert Image to Tiff
23
+ def self.image_to_tif(source, points = {})
24
+ tmp_file = Tempfile.new(['', '.tif'])
25
+ cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
26
+ cat.crop!(points[:x], points[:y], points[:w], points[:h]) if points.is_a?(Hash) && points.values.compact != []
27
+ cat.alpha Magick::DeactivateAlphaChannel
28
+ cat.write(tmp_file.path.to_s) do
29
+ # self.depth = 16
30
+ self.compression = Magick::NoCompression
31
+ end
32
+ tmp_file
33
+ end
27
34
 
28
- def self.read_with_processor(path)
29
- Magick::Image.read(path.to_s).first
30
- end
35
+ # Cast instance of image
36
+ def self.read_with_processor(path)
37
+ Magick::Image.read(path.to_s).first
38
+ end
31
39
 
32
- def self.image?(object)
33
- object.class == Magick::Image
40
+ # Check if is a RMagick image
41
+ def self.image?(object)
42
+ object.class == Magick::Image
43
+ end
44
+ end
34
45
  end
35
46
  end
data/lib/rtesseract.rb CHANGED
@@ -1,18 +1,10 @@
1
1
  # encoding: UTF-8
2
2
  require 'pathname'
3
3
  require 'tempfile'
4
- require 'utils'
5
4
 
5
+ require 'rtesseract/utils'
6
6
  require 'rtesseract/configuration'
7
7
  require 'rtesseract/errors'
8
- require 'rtesseract/mixed'
9
- require 'rtesseract/box'
10
- require 'rtesseract/box_char'
11
-
12
- # Processors
13
- require 'processors/rmagick.rb'
14
- require 'processors/mini_magick.rb'
15
- require 'processors/none.rb'
16
8
 
17
9
  # Ruby wrapper for Tesseract OCR
18
10
  class RTesseract
@@ -23,58 +15,30 @@ class RTesseract
23
15
  def initialize(src = '', options = {})
24
16
  self.configuration = RTesseract.local_config(options)
25
17
  @options = options || {}
26
- @value, @points = [nil, {}]
27
- @processor = RTesseract.choose_processor!(self.configuration.processor)
18
+ @value = nil
19
+ @points = {}
20
+ @processor = RTesseract::Processor.choose_processor!(configuration.processor)
28
21
  @source = @processor.image?(src) ? src : Pathname.new(src)
29
22
  initialize_hook
30
23
  end
31
24
 
25
+ # Hook to end of initialize method
32
26
  def initialize_hook
33
27
  end
34
28
 
35
- def self.read(src = nil, options = {})
36
- fail RTesseract::ImageNotSelectedError if src.nil?
37
- processor = RTesseract.choose_processor!(options.option(:processor, nil))
38
- image = processor.read_with_processor(src.to_s)
39
- yield(image)
40
- object = RTesseract.new('', options).from_blob(image.to_blob)
41
- object
42
- end
43
-
44
- def read
45
- image = @processor.read_with_processor(@source.to_s)
46
- new_image = yield(image)
47
- from_blob(new_image.to_blob, File.extname(@source.to_s))
48
- self
49
- end
50
-
29
+ # Define the source
51
30
  def source=(src)
52
31
  @value = nil
53
32
  @source = @processor.image?(src) ? src : Pathname.new(src)
54
33
  end
55
34
 
56
35
  # Crop image to convert
57
- def crop!(_points = {})
36
+ def crop!(points = {})
58
37
  @value = nil
59
- @points = _points
38
+ @points = points
60
39
  self
61
40
  end
62
41
 
63
- # Remove files
64
- def remove_file(files = [])
65
- files.each do |file|
66
- if file.is_a?(Tempfile)
67
- file.close
68
- file.unlink
69
- else
70
- File.unlink(file)
71
- end
72
- end
73
- true
74
- rescue => error
75
- raise RTesseract::TempFilesNotRemovedError.new(error: error, files: files)
76
- end
77
-
78
42
  # Select the language
79
43
  # ===Languages
80
44
  ## * eng - English
@@ -88,58 +52,56 @@ class RTesseract
88
52
  ## * vie - Vietnamese
89
53
  ## Note: Make sure you have installed the language to tesseract
90
54
  def lang
91
- language = "#{self.configuration.lang}".strip.downcase
92
- LANGUAGES.each do |value, names|
93
- return " -l #{value} " if names.include? language
94
- end
95
- return " -l #{language} " if language.size > 0
55
+ language = (configuration.lang || 'eng').to_s.strip.downcase
56
+ " -l #{LANGUAGES[language] || language} "
57
+ rescue
96
58
  ''
59
+ end
60
+
61
+ # Convert option to command
62
+ def option_to_string(prefix, value = nil)
63
+ (value.nil? ? '' : " #{prefix} #{value} ")
97
64
  rescue
98
65
  ''
99
66
  end
100
67
 
101
68
  # Page Segment Mode
102
69
  def psm
103
- (self.configuration.psm.nil? ? '' : " -psm #{self.configuration.psm} ")
104
- rescue
105
- ''
70
+ option_to_string('-psm', configuration.psm)
106
71
  end
107
72
 
108
73
  # Tessdata Dir
109
74
  def tessdata_dir
110
- (self.configuration.tessdata_dir.nil? ? '' : " --tessdata-dir #{self.configuration.tessdata_dir} ")
111
- rescue
112
- ''
75
+ option_to_string('--tessdata-dir', configuration.tessdata_dir)
113
76
  end
114
77
 
115
78
  # User Words
116
79
  def user_words
117
- (self.configuration.user_words.nil? ? '' : " --user-words #{self.configuration.user_words} ")
118
- rescue
119
- ''
80
+ option_to_string('--user-words', configuration.user_words)
120
81
  end
121
82
 
122
83
  # User Patterns
123
84
  def user_patterns
124
- (self.configuration.user_patterns.nil? ? '' : " --user-patterns #{self.configuration.user_patterns} ")
125
- rescue
126
- ''
85
+ option_to_string('--user-patterns', configuration.user_patterns)
127
86
  end
128
87
 
129
88
  # Options on line
130
89
  def options_cmd
131
- self.configuration.options_cmd
90
+ configuration.options_cmd
132
91
  end
133
92
 
93
+ # Hook to before config
134
94
  def config_hook
135
95
  end
136
96
 
97
+ # Convert configurations
137
98
  def config
138
99
  @options ||= {}
139
100
  config_hook
140
101
  @options.map { |k, v| "#{k} #{v}" }.join("\n")
141
102
  end
142
103
 
104
+ # Write config to file
143
105
  def config_file
144
106
  config_hook
145
107
  return '' if @options == {}
@@ -151,34 +113,41 @@ class RTesseract
151
113
 
152
114
  # TODO: Clear console for MacOS or Windows
153
115
  def clear_console_output
154
- return '' if self.configuration.debug
116
+ return '' if configuration.debug
155
117
  return '2>/dev/null' if File.exist?('/dev/null') # Linux console clear
156
118
  end
157
119
 
120
+ # Get image
158
121
  def image
159
122
  (@image = @processor.image_to_tif(@source, @points)).path
160
123
  end
161
124
 
125
+ # Extension of file
162
126
  def file_ext
163
127
  '.txt'
164
128
  end
165
129
 
130
+ # Rand file path
166
131
  def text_file
167
132
  @text_file = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
168
133
  end
169
134
 
135
+ # Full path of file with extension
170
136
  def text_file_with_ext(ext = nil)
171
137
  [@text_file, ext || file_ext].join('')
172
138
  end
173
139
 
140
+ # Run command
174
141
  def convert_command
175
- `#{self.configuration.command} "#{image}" "#{text_file}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{self.configuration.options_cmd.join(' ')}`
142
+ `#{configuration.command} "#{image}" "#{text_file}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{configuration.options_cmd.join(' ')}`
176
143
  end
177
144
 
145
+ # Read result file
178
146
  def convert_text
179
147
  @value = File.read(text_file_with_ext).to_s
180
148
  end
181
149
 
150
+ # Hook to convert
182
151
  def after_convert_hook
183
152
  end
184
153
 
@@ -187,21 +156,7 @@ class RTesseract
187
156
  convert_command
188
157
  after_convert_hook
189
158
  convert_text
190
- remove_file([@image, text_file_with_ext])
191
- rescue => error
192
- raise RTesseract::ConversionError.new(error), error, caller
193
- end
194
-
195
- # Read image from memory blob
196
- def from_blob(blob, ext = '')
197
- blob_file = Tempfile.new(['blob', ext], encoding: 'ascii-8bit')
198
- blob_file.binmode.write(blob)
199
- blob_file.rewind
200
- blob_file.flush
201
- self.source = blob_file.path
202
- convert
203
- remove_file([blob_file])
204
- self
159
+ RTesseract::Utils.remove_files([@image, text_file_with_ext])
205
160
  rescue => error
206
161
  raise RTesseract::ConversionError.new(error), error, caller
207
162
  end
@@ -220,19 +175,17 @@ class RTesseract
220
175
 
221
176
  # Remove spaces and break-lines
222
177
  def to_s_without_spaces
223
- to_s.gsub(' ', '').gsub("\n", '').gsub("\r", '')
224
- end
225
-
226
- def self.choose_processor!(processor)
227
- processor =
228
- if MiniMagickProcessor.a_name?(processor.to_s)
229
- MiniMagickProcessor
230
- elsif NoneProcessor.a_name?(processor.to_s)
231
- NoneProcessor
232
- else
233
- RMagickProcessor
234
- end
235
- processor.setup
236
- processor
178
+ to_s.delete(' ').delete("\n").delete("\r")
237
179
  end
238
180
  end
181
+
182
+ require 'rtesseract/mixed'
183
+ require 'rtesseract/box'
184
+ require 'rtesseract/box_char'
185
+ require 'rtesseract/blob'
186
+ require 'rtesseract/processor'
187
+
188
+ # Processors
189
+ require 'processors/rmagick.rb'
190
+ require 'processors/mini_magick.rb'
191
+ require 'processors/none.rb'
@@ -0,0 +1,34 @@
1
+ # Blob methods
2
+ class RTesseract
3
+ # Read image from memory blob
4
+ def self.read(src = nil, options = {})
5
+ fail RTesseract::ImageNotSelectedError if src.nil?
6
+ processor = RTesseract::Processor.choose_processor!(options.option(:processor, nil))
7
+ image = processor.read_with_processor(src.to_s)
8
+ yield(image)
9
+ object = RTesseract.new('', options).from_blob(image.to_blob)
10
+ object
11
+ end
12
+
13
+ # Read image from memory blob
14
+ def read
15
+ image = @processor.read_with_processor(@source.to_s)
16
+ new_image = yield(image)
17
+ from_blob(new_image.to_blob, File.extname(@source.to_s))
18
+ self
19
+ end
20
+
21
+ # Read image from memory blob
22
+ def from_blob(blob, ext = '')
23
+ blob_file = Tempfile.new(['blob', ext], encoding: 'ascii-8bit')
24
+ blob_file.binmode.write(blob)
25
+ blob_file.rewind
26
+ blob_file.flush
27
+ self.source = blob_file.path
28
+ convert
29
+ RTesseract::Utils.remove_files([blob_file])
30
+ self
31
+ rescue => error
32
+ raise RTesseract::ConversionError.new(error), error, caller
33
+ end
34
+ end
@@ -2,37 +2,45 @@
2
2
  require 'nokogiri'
3
3
  require 'fileutils'
4
4
 
5
+ # RTesseract
5
6
  class RTesseract
6
7
  # Class to read char positions from an image
7
8
  class Box < RTesseract
9
+ # Setting value as blank array
8
10
  def initialize_hook
9
- @value, @points = [[], {}]
11
+ @value = []
10
12
  end
11
13
 
14
+ # Additional options to config file
12
15
  def config_hook
13
16
  @options['tessedit_create_hocr'] = 1 # Split Words configuration
14
17
  end
15
18
 
19
+ # Words converted
16
20
  def words
17
21
  convert if @value == []
18
22
  @value
19
23
  end
20
24
 
25
+ # Extension of file
21
26
  def file_ext
22
27
  '.hocr'
23
28
  end
24
29
 
30
+ # Read the result file
25
31
  def parse_file
26
32
  html = Nokogiri::HTML(File.read(text_file_with_ext))
27
33
  html.css('span.ocrx_word, span.ocr_word')
28
34
  end
29
35
 
36
+ # Return words to value
30
37
  def convert_text
31
38
  text_objects = []
32
39
  parse_file.each { |word| text_objects << BoxParser.new(word).to_h }
33
40
  @value = text_objects
34
41
  end
35
42
 
43
+ # Move file html to hocr
36
44
  def after_convert_hook
37
45
  FileUtils.mv(text_file_with_ext('.html'), text_file_with_ext) rescue nil
38
46
  end
@@ -56,6 +64,7 @@ class RTesseract
56
64
  @attributes = title.gsub(';', '').split(' ')
57
65
  end
58
66
 
67
+ # Hash of word and position
59
68
  def to_h
60
69
  {
61
70
  word: @word.text,
@@ -1,4 +1,5 @@
1
1
  # encoding: UTF-8
2
+ # RTesseract
2
3
  class RTesseract
3
4
  # Class to read char positions from an image
4
5
  class BoxChar < Box
@@ -8,10 +9,12 @@ class RTesseract
8
9
 
9
10
  alias_method :characters, :words
10
11
 
12
+ # Extension of file
11
13
  def file_ext
12
14
  '.box'
13
15
  end
14
16
 
17
+ # Read the result file
15
18
  def parse_file
16
19
  File.read(text_file_with_ext).to_s
17
20
  end
@@ -1,12 +1,16 @@
1
- # Configuration
1
+ # RTesseract
2
2
  class RTesseract
3
3
  # Aliases to languages names
4
4
  LANGUAGES = {
5
- 'eng' => %w(en en-us english),
6
- 'ita' => %w(it),
7
- 'por' => %w(pt pt-br portuguese),
8
- 'spa' => %w(sp)
9
- }
5
+ 'en' => 'eng',
6
+ 'en-us' => 'eng',
7
+ 'english' => 'eng',
8
+ 'pt' => 'por',
9
+ 'pt-br' => 'por',
10
+ 'portuguese' => 'por',
11
+ 'it' => 'ita',
12
+ 'sp' => 'spa'
13
+ }.freeze
10
14
 
11
15
  # Configuration class
12
16
  class Configuration
@@ -16,16 +20,19 @@ class RTesseract
16
20
  @processor = 'rmagick'
17
21
  end
18
22
 
23
+ # Global configuration
19
24
  def parent
20
25
  @parent ||= RTesseract.configuration || RTesseract::Configuration.new
21
26
  end
22
27
 
28
+ # Set value of option
23
29
  def option(options, name, default = nil)
24
30
  self.instance_variable_set("@#{name}", options.option(name, parent.send(name)) || default)
25
31
  end
26
32
 
33
+ # Return the values of options
27
34
  def load_options(options, names = [])
28
- names.each{ |name| option(options, name, nil) }
35
+ names.each { |name| option(options, name, nil) }
29
36
  end
30
37
  end
31
38
 
@@ -38,6 +45,7 @@ class RTesseract
38
45
  yield(configuration)
39
46
  end
40
47
 
48
+ # Default command
41
49
  def self.default_command
42
50
  TesseractBin::Executables[:tesseract] || 'tesseract'
43
51
  rescue
@@ -49,7 +57,7 @@ class RTesseract
49
57
  RTesseract::Configuration.new.tap do |config|
50
58
  config.command = config.option(options, :command, RTesseract.default_command)
51
59
  config.processor = config.option(options, :processor, 'rmagick')
52
- config.load_options(options, [ :lang, :psm, :tessdata_dir, :user_words, :user_patterns ])
60
+ config.load_options(options, [:lang, :psm, :tessdata_dir, :user_words, :user_patterns])
53
61
  config.debug = config.option(options, :debug, false)
54
62
  config.options_cmd = [options.option(:options, nil)].flatten.compact
55
63
  end
@@ -1,3 +1,4 @@
1
+ # RTesseract
1
2
  class RTesseract
2
3
  # Class of error with storage of normal errors
3
4
  class ErrorWithMemory < StandardError
@@ -1,4 +1,5 @@
1
1
  # encoding: UTF-8
2
+ # RTesseract
2
3
  class RTesseract
3
4
  # Class to read an image from specified areas
4
5
  class Mixed
@@ -12,11 +13,13 @@ class RTesseract
12
13
  yield self if block_given?
13
14
  end
14
15
 
15
- def area(_points)
16
+ # Add areas
17
+ def area(points)
16
18
  @value = ''
17
- @areas << _points # { x: x, y: y, width: width, height: height }
19
+ @areas << points
18
20
  end
19
21
 
22
+ # Clear areas
20
23
  def clear_areas
21
24
  @areas = []
22
25
  end
@@ -25,7 +28,7 @@ class RTesseract
25
28
  def convert
26
29
  @value = []
27
30
  @areas.each_with_object(RTesseract.new(@source.to_s, @options.dup)) do |area, image|
28
- image.crop!(area) # area[:x], area[:y], area[:width], area[:height])
31
+ image.crop!(area)
29
32
  @value << image.to_s
30
33
  end
31
34
  rescue => error
@@ -45,7 +48,7 @@ class RTesseract
45
48
 
46
49
  # Remove spaces and break-lines
47
50
  def to_s_without_spaces
48
- to_s.gsub(' ', '').gsub("\n", '').gsub("\r", '')
51
+ to_s.delete(' ').delete("\n").delete("\r")
49
52
  end
50
53
  end
51
54
  end
@@ -0,0 +1,19 @@
1
+ # RTesseract
2
+ class RTesseract
3
+ # Processor management
4
+ module Processor
5
+ # Return the processor
6
+ def self.choose_processor!(processor)
7
+ processor =
8
+ if RTesseract::Processor::MiniMagickProcessor.a_name?(processor.to_s)
9
+ MiniMagickProcessor
10
+ elsif RTesseract::Processor::NoneProcessor.a_name?(processor.to_s)
11
+ NoneProcessor
12
+ else
13
+ RMagickProcessor
14
+ end
15
+ processor.setup
16
+ processor
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,34 @@
1
+ # RTesseract
2
+ class RTesseract
3
+ # Some utils methods
4
+ module Utils
5
+ # Remove files or Tempfile
6
+ def self.remove_files(files = [])
7
+ files.each do |file|
8
+ self.remove_file(file)
9
+ end
10
+ true
11
+ rescue => error
12
+ raise RTesseract::TempFilesNotRemovedError.new(error: error, files: files)
13
+ end
14
+
15
+ # Remove file or Tempfile
16
+ def self.remove_file(file)
17
+ if file.is_a?(Tempfile)
18
+ file.close
19
+ file.unlink
20
+ else
21
+ File.unlink(file)
22
+ end
23
+ true
24
+ end
25
+ end
26
+ end
27
+
28
+ # Hash
29
+ class Hash
30
+ # return the value and remove from hash
31
+ def option(attr_name, default)
32
+ delete(attr_name.to_s) || delete(attr_name) || default
33
+ end
34
+ end
data/rtesseract.gemspec CHANGED
@@ -2,16 +2,16 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: rtesseract 2.0.0 ruby lib
5
+ # stub: rtesseract 2.0.1 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "rtesseract"
9
- s.version = "2.0.0"
9
+ s.version = "2.0.1"
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib"]
13
13
  s.authors = ["Danilo Jeremias da Silva"]
14
- s.date = "2016-04-19"
14
+ s.date = "2016-05-17"
15
15
  s.description = "Ruby library for working with the Tesseract OCR."
16
16
  s.email = "dannnylo@gmail.com"
17
17
  s.extra_rdoc_files = [
@@ -22,6 +22,7 @@ Gem::Specification.new do |s|
22
22
  ".document",
23
23
  ".rspec",
24
24
  ".travis.yml",
25
+ "CHANGELOG.md",
25
26
  "Gemfile",
26
27
  "Gemfile.lock",
27
28
  "LICENSE.txt",
@@ -32,13 +33,16 @@ Gem::Specification.new do |s|
32
33
  "lib/processors/none.rb",
33
34
  "lib/processors/rmagick.rb",
34
35
  "lib/rtesseract.rb",
36
+ "lib/rtesseract/blob.rb",
35
37
  "lib/rtesseract/box.rb",
36
38
  "lib/rtesseract/box_char.rb",
37
39
  "lib/rtesseract/configuration.rb",
38
40
  "lib/rtesseract/errors.rb",
39
41
  "lib/rtesseract/mixed.rb",
40
- "lib/utils.rb",
42
+ "lib/rtesseract/processor.rb",
43
+ "lib/rtesseract/utils.rb",
41
44
  "rtesseract.gemspec",
45
+ "spec/configs/eng.user-words.txt",
42
46
  "spec/images/README.pdf",
43
47
  "spec/images/blank.tif",
44
48
  "spec/images/mixed.tif",
@@ -0,0 +1,13 @@
1
+ you
2
+ are
3
+ a
4
+ friend
5
+ you
6
+ speak
7
+ the
8
+ password
9
+ and
10
+ the
11
+ doors
12
+ will
13
+ open
@@ -6,17 +6,7 @@ describe 'Rtesseract::BoxChar' do
6
6
  @path = Pathname.new(__FILE__.gsub('rtesseract_box_char_spec.rb', '')).expand_path
7
7
  @image_tiff = @path.join('images', 'test.tif').to_s
8
8
  @words_image = @path.join('images', 'test_words.png').to_s
9
- end
10
-
11
- it 'bounding box by char' do
12
- expect(RTesseract::BoxChar.new(@image_tiff).characters.is_a?(Array)).to eql(true)
13
- expect(RTesseract::BoxChar.new(@image_tiff).characters).to eql([
14
- { char: '4', x_start: 145, y_start: 14, x_end: 159, y_end: 33 },
15
- { char: '3', x_start: 184, y_start: 14, x_end: 196, y_end: 33 },
16
- { char: 'X', x_start: 222, y_start: 14, x_end: 238, y_end: 32 },
17
- { char: 'F', x_start: 260, y_start: 14, x_end: 273, y_end: 32 }])
18
-
19
- expect(RTesseract::BoxChar.new(@words_image).characters).to eql([
9
+ @values = [
20
10
  { char: 'I', x_start: 52, y_start: 91, x_end: 54, y_end: 104 },
21
11
  { char: 'f', x_start: 56, y_start: 91, x_end: 63, y_end: 105 },
22
12
  { char: 'y', x_start: 69, y_start: 87, x_end: 79, y_end: 101 },
@@ -72,7 +62,18 @@ describe 'Rtesseract::BoxChar' do
72
62
  { char: 'p', x_start: 228, y_start: 43, x_end: 237, y_end: 57 },
73
63
  { char: 'e', x_start: 238, y_start: 47, x_end: 248, y_end: 57 },
74
64
  { char: 'n', x_start: 250, y_start: 47, x_end: 258, y_end: 57 },
75
- { char: '.', x_start: 261, y_start: 47, x_end: 263, y_end: 49 }])
65
+ { char: '.', x_start: 261, y_start: 47, x_end: 263, y_end: 49 }]
66
+ end
67
+
68
+ it 'bounding box by char' do
69
+ expect(RTesseract::BoxChar.new(@image_tiff).characters.is_a?(Array)).to eql(true)
70
+ expect(RTesseract::BoxChar.new(@image_tiff).characters).to eql([
71
+ { char: '4', x_start: 145, y_start: 14, x_end: 159, y_end: 33 },
72
+ { char: '3', x_start: 184, y_start: 14, x_end: 196, y_end: 33 },
73
+ { char: 'X', x_start: 222, y_start: 14, x_end: 238, y_end: 32 },
74
+ { char: 'F', x_start: 260, y_start: 14, x_end: 273, y_end: 32 }])
75
+
76
+ expect(RTesseract::BoxChar.new(@words_image).characters).to eql(@values)
76
77
 
77
78
  expect { RTesseract::BoxChar.new(@image_tiff, command: 'tesseract_error').to_s }.to raise_error(RTesseract::ConversionError)
78
79
  expect { RTesseract::BoxChar.new(@image_tiff + '_not_exist').to_s }.to raise_error(RTesseract::ImageNotSelectedError)
@@ -81,6 +81,7 @@ describe 'Rtesseract' do
81
81
  expect(RTesseract.new(@image_tif, lang: 'eng').to_s_without_spaces).to eql('43XF')
82
82
 
83
83
  expect(RTesseract.new(@image_tif, lang: 'eng').lang).to eql(' -l eng ')
84
+ expect(RTesseract.new(@image_tif, lang: 'it').lang).to eql(' -l ita ')
84
85
 
85
86
  # Invalid lang object
86
87
  expect(RTesseract.new(@image_tif, lang: MakeStringError.new).lang).to eql('')
@@ -98,6 +99,7 @@ describe 'Rtesseract' do
98
99
  expect(RTesseract.new(@image_tif, chop_enable: 0).config).to eql('chop_enable 0')
99
100
  expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0).config).to eql("chop_enable 0\nenable_assoc 0")
100
101
  expect(RTesseract.new(@image_tif, chop_enable: 0).to_s_without_spaces).to eql('43XF')
102
+ expect(RTesseract.new(@image_tif, tessedit_char_whitelist: "ABCDEF12345").to_s_without_spaces).to eql('43F')
101
103
  end
102
104
 
103
105
  it ' crop image' do
@@ -121,14 +123,14 @@ describe 'Rtesseract' do
121
123
 
122
124
  it ' use a instance' do
123
125
  expect(RTesseract.new(Magick::Image.read(@image_tif.to_s).first).to_s_without_spaces).to eql('43XF')
124
- expect(RMagickProcessor.a_name?('teste')).to eql(false)
125
- expect(RMagickProcessor.a_name?('rmagick')).to eql(true)
126
- expect(RMagickProcessor.a_name?('RMagickProcessor')).to eql(true)
127
- expect(MiniMagickProcessor.a_name?('teste')).to eql(false)
128
- expect(MiniMagickProcessor.a_name?('mini_magick')).to eql(true)
129
- expect(MiniMagickProcessor.a_name?('MiniMagickProcessor')).to eql(true)
130
- expect(NoneProcessor.a_name?('none')).to eql(true)
131
- expect(NoneProcessor.a_name?('NoneProcessor')).to eql(true)
126
+ expect(RTesseract::Processor::RMagickProcessor.a_name?('teste')).to eql(false)
127
+ expect(RTesseract::Processor::RMagickProcessor.a_name?('rmagick')).to eql(true)
128
+ expect(RTesseract::Processor::RMagickProcessor.a_name?('RMagickProcessor')).to eql(true)
129
+ expect(RTesseract::Processor::MiniMagickProcessor.a_name?('teste')).to eql(false)
130
+ expect(RTesseract::Processor::MiniMagickProcessor.a_name?('mini_magick')).to eql(true)
131
+ expect(RTesseract::Processor::MiniMagickProcessor.a_name?('MiniMagickProcessor')).to eql(true)
132
+ expect(RTesseract::Processor::NoneProcessor.a_name?('none')).to eql(true)
133
+ expect(RTesseract::Processor::NoneProcessor.a_name?('NoneProcessor')).to eql(true)
132
134
  end
133
135
 
134
136
  it ' change image in a block' do
@@ -172,10 +174,9 @@ describe 'Rtesseract' do
172
174
  end
173
175
 
174
176
  it 'remove a file' do
175
- rtesseract = RTesseract.new('.')
176
- rtesseract.remove_file(Tempfile.new('config'))
177
+ RTesseract::Utils.remove_files(Tempfile.new('config'))
177
178
 
178
- expect { rtesseract.remove_file(Pathname.new(Dir.tmpdir).join('test_not_exists')) }.to raise_error(RTesseract::TempFilesNotRemovedError)
179
+ expect { RTesseract::Utils.remove_files(Pathname.new(Dir.tmpdir).join('test_not_exists')) }.to raise_error(RTesseract::TempFilesNotRemovedError)
179
180
  end
180
181
 
181
182
  it ' support default config processors' do
@@ -201,7 +202,6 @@ describe 'Rtesseract' do
201
202
  RTesseract.configure { |config| config.psm = 7 }
202
203
  expect(RTesseract.new(@image_tif).psm).to eql(' -psm 7 ')
203
204
 
204
-
205
205
  RTesseract.configure { |config| config.tessdata_dir = '/tmp/test' }
206
206
  expect(RTesseract.new(@image_tif).tessdata_dir).to eql(' --tessdata-dir /tmp/test ')
207
207
 
@@ -220,5 +220,7 @@ describe 'Rtesseract' do
220
220
  expect(RTesseract.new(@image_tif, tessdata_dir: MakeStringError.new).tessdata_dir).to eql('')
221
221
  expect(RTesseract.new(@image_tif, user_words: MakeStringError.new).user_words).to eql('')
222
222
  expect(RTesseract.new(@image_tif, user_patterns: MakeStringError.new).user_patterns).to eql('')
223
+
224
+ # expect(RTesseract.new(@path.join('images', 'test_words.png').to_s, psm: 3, user_words: @path.join('configs', 'eng.user-words.txt').to_s).to_s).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n\n")
223
225
  end
224
226
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rtesseract
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danilo Jeremias da Silva
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-19 00:00:00.000000000 Z
11
+ date: 2016-05-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -119,6 +119,7 @@ files:
119
119
  - ".document"
120
120
  - ".rspec"
121
121
  - ".travis.yml"
122
+ - CHANGELOG.md
122
123
  - Gemfile
123
124
  - Gemfile.lock
124
125
  - LICENSE.txt
@@ -129,13 +130,16 @@ files:
129
130
  - lib/processors/none.rb
130
131
  - lib/processors/rmagick.rb
131
132
  - lib/rtesseract.rb
133
+ - lib/rtesseract/blob.rb
132
134
  - lib/rtesseract/box.rb
133
135
  - lib/rtesseract/box_char.rb
134
136
  - lib/rtesseract/configuration.rb
135
137
  - lib/rtesseract/errors.rb
136
138
  - lib/rtesseract/mixed.rb
137
- - lib/utils.rb
139
+ - lib/rtesseract/processor.rb
140
+ - lib/rtesseract/utils.rb
138
141
  - rtesseract.gemspec
142
+ - spec/configs/eng.user-words.txt
139
143
  - spec/images/README.pdf
140
144
  - spec/images/blank.tif
141
145
  - spec/images/mixed.tif
data/lib/utils.rb DELETED
@@ -1,5 +0,0 @@
1
- class Hash
2
- def option(attr_name, default)
3
- delete(attr_name.to_s) || delete(attr_name) || default
4
- end
5
- end