rtesseract 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 695108fd1fe3b6bb921444dc8daadb248466dd13
4
- data.tar.gz: d2e8b38f6a54c7ffd004863de72a9d880be9bdbc
3
+ metadata.gz: 6eae58279cf744227e79b7bbc9180f7aea852547
4
+ data.tar.gz: 3836aa96d24b7f1a0b957cf803553f547cc33544
5
5
  SHA512:
6
- metadata.gz: c28af3ffb9d288fb580d22f68f99d3e159919284735ff6cf84a91ba8da636d4c771568019c0db4ae968bbad030cfc8e187100ca075b7904fb05eda5658ca8c0d
7
- data.tar.gz: 3fdb3195471c7b0a3674c000d6ebafe4a7474cb6336911ff93e898233f872bb220b56b309be2e1148a19eb2abd1a764fa60ef09bfcfea66a103bdb35f836d8bd
6
+ metadata.gz: 0ef57359c7c7f43094a50838b6d29d28d7808c9cadd8f2b8514c613be030161f8d640c41ba3d403c00fb59fdf85ffcbc57795f6c65b8418ad348eb1a6c07e901
7
+ data.tar.gz: ff5f0f94c8039bd0b38b0c9ec2618b4c38b07b9707e28ff29a3bb943abc85d5afaa543dfba1ba2b9e565d056ea558eda9b7f6d222a6adb43614cd86c6e8fdcac
data/.travis.yml CHANGED
@@ -3,6 +3,7 @@ addons:
3
3
  apt:
4
4
  packages:
5
5
  - tesseract-ocr
6
+
6
7
  sudo: false
7
8
  rvm:
8
9
  - 1.9.3
data/CHANGELOG.md ADDED
@@ -0,0 +1,22 @@
1
+ ## v2.0.1
2
+
3
+ #### Changed
4
+
5
+ * Refactoring of some small classes
6
+
7
+ ## v2.0.0
8
+
9
+ #### Added
10
+
11
+ * Support for the options --tessdata-dir, --user-words and --user-patterns
12
+ * Ruby 2.3.0 to travis tests.
13
+
14
+ #### Changed
15
+
16
+ * Refactoring of some classes
17
+ * Crop options are now a hash with x, y, w and h keys.
18
+ * Areas of RTesseract::Mixed now use :w and :h instead of :width and :height.
19
+
20
+ #### Removed
21
+
22
+ * Support for the quick_magick gem.
data/Gemfile.lock CHANGED
@@ -3,9 +3,8 @@ GEM
3
3
  specs:
4
4
  addressable (2.4.0)
5
5
  builder (3.2.2)
6
- coveralls (0.8.10)
6
+ coveralls (0.8.13)
7
7
  json (~> 1.8)
8
- rest-client (>= 1.6.8, < 2)
9
8
  simplecov (~> 0.11.0)
10
9
  term-ansicolor (~> 1.3)
11
10
  thor (~> 0.19.1)
@@ -14,8 +13,6 @@ GEM
14
13
  thread_safe (~> 0.3, >= 0.3.1)
15
14
  diff-lcs (1.2.5)
16
15
  docile (1.1.5)
17
- domain_name (0.5.25)
18
- unf (>= 0.0.5, < 1.0.0)
19
16
  faraday (0.9.2)
20
17
  multipart-post (>= 1.2, < 3)
21
18
  git (1.3.0)
@@ -28,8 +25,6 @@ GEM
28
25
  oauth2
29
26
  hashie (3.4.3)
30
27
  highline (1.7.8)
31
- http-cookie (1.0.2)
32
- domain_name (~> 0.5)
33
28
  jeweler (2.1.1)
34
29
  builder
35
30
  bundler (>= 1.0)
@@ -42,13 +37,11 @@ GEM
42
37
  semver
43
38
  json (1.8.3)
44
39
  jwt (1.5.1)
45
- mime-types (2.99)
46
- mini_magick (4.3.6)
40
+ mini_magick (4.5.1)
47
41
  mini_portile2 (2.0.0)
48
42
  multi_json (1.11.2)
49
43
  multi_xml (0.5.5)
50
44
  multipart-post (2.0.0)
51
- netrc (0.11.0)
52
45
  nokogiri (1.6.7.2)
53
46
  mini_portile2 (~> 2.0.0.rc2)
54
47
  oauth2 (1.1.0)
@@ -61,26 +54,22 @@ GEM
61
54
  rake (11.1.2)
62
55
  rdoc (4.2.2)
63
56
  json (~> 1.4)
64
- rest-client (1.8.0)
65
- http-cookie (>= 1.0.2, < 2.0)
66
- mime-types (>= 1.16, < 3.0)
67
- netrc (~> 0.7)
68
57
  rmagick (2.15.4)
69
58
  rspec (3.4.0)
70
59
  rspec-core (~> 3.4.0)
71
60
  rspec-expectations (~> 3.4.0)
72
61
  rspec-mocks (~> 3.4.0)
73
- rspec-core (3.4.1)
62
+ rspec-core (3.4.4)
74
63
  rspec-support (~> 3.4.0)
75
64
  rspec-expectations (3.4.0)
76
65
  diff-lcs (>= 1.2.0, < 2.0)
77
66
  rspec-support (~> 3.4.0)
78
- rspec-mocks (3.4.0)
67
+ rspec-mocks (3.4.1)
79
68
  diff-lcs (>= 1.2.0, < 2.0)
80
69
  rspec-support (~> 3.4.0)
81
70
  rspec-support (3.4.1)
82
71
  semver (1.0.1)
83
- simplecov (0.11.1)
72
+ simplecov (0.11.2)
84
73
  docile (~> 1.1.0)
85
74
  json (~> 1.8)
86
75
  simplecov-html (~> 0.10.0)
@@ -90,9 +79,6 @@ GEM
90
79
  thor (0.19.1)
91
80
  thread_safe (0.3.5)
92
81
  tins (1.6.0)
93
- unf (0.1.4)
94
- unf_ext
95
- unf_ext (0.0.7.1)
96
82
 
97
83
  PLATFORMS
98
84
  ruby
@@ -109,4 +95,4 @@ DEPENDENCIES
109
95
  simplecov
110
96
 
111
97
  BUNDLED WITH
112
- 1.10.6
98
+ 1.11.2
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.0.0
1
+ 2.0.1
@@ -1,32 +1,43 @@
1
1
  # encoding: UTF-8
2
- # Add to rtesseract a image manipulation with MiniMagick
3
- module MiniMagickProcessor
4
- def self.setup
5
- require 'mini_magick'
6
- end
2
+ # RTesseract class
3
+ class RTesseract
4
+ # Processor Module
5
+ module Processor
6
+ # Add to rtesseract a image manipulation with MiniMagick
7
+ module MiniMagickProcessor
8
+ # Setup Processor
9
+ def self.setup
10
+ require 'mini_magick'
11
+ end
7
12
 
8
- def self.a_name?(name)
9
- %w(mini_magick MiniMagickProcessor).include?(name.to_s)
10
- end
13
+ # Check if is this Processor
14
+ def self.a_name?(name)
15
+ %w(mini_magick MiniMagickProcessor).include?(name.to_s)
16
+ end
11
17
 
12
- def self.image_to_tif(source, _points = {})
13
- tmp_file = Tempfile.new(['', '.tif'])
14
- cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
15
- cat.format('tif') do |c|
16
- c.compress 'None'
17
- c.alpha 'off'
18
- end
19
- cat.crop("#{_points[:w]}x#{_points[:h]}+#{_points[:x]}+#{_points[:y]}") if _points.is_a?(Hash) && _points.values.compact != []
20
- cat.alpha 'off'
21
- cat.write tmp_file.path.to_s
22
- tmp_file
23
- end
18
+ # Convert Image to Tiff
19
+ def self.image_to_tif(source, points = {})
20
+ tmp_file = Tempfile.new(['', '.tif'])
21
+ cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
22
+ cat.format('tif') do |c|
23
+ c.compress 'None'
24
+ c.alpha 'off'
25
+ end
26
+ cat.crop("#{points[:w]}x#{points[:h]}+#{points[:x]}+#{points[:y]}") if points.is_a?(Hash) && points.values.compact != []
27
+ cat.alpha 'off'
28
+ cat.write tmp_file.path.to_s
29
+ tmp_file
30
+ end
24
31
 
25
- def self.read_with_processor(path)
26
- MiniMagick::Image.open(path.to_s)
27
- end
32
+ # Cast instance of image
33
+ def self.read_with_processor(path)
34
+ MiniMagick::Image.open(path.to_s)
35
+ end
28
36
 
29
- def self.image?(object)
30
- object.class == MiniMagick::Image
37
+ # Check if is a MiniMagick image
38
+ def self.image?(object)
39
+ object.class == MiniMagick::Image
40
+ end
41
+ end
31
42
  end
32
- end
43
+ end
@@ -1,26 +1,34 @@
1
1
  # encoding: UTF-8
2
- # Add to rtesseract a image without manipulation
3
- module NoneProcessor
4
- def self.setup
5
- end
6
-
7
- def self.a_name?(name)
8
- %w(none NoneProcessor).include?(name.to_s)
9
- end
2
+ # RTesseract class
3
+ class RTesseract
4
+ # Processor Module
5
+ module Processor
6
+ # Add to rtesseract a image without manipulation
7
+ module NoneProcessor
8
+ # Setup Processor
9
+ def self.setup
10
+ end
10
11
 
11
- def self.image_to_tif(source, _points = {})
12
- tmp_file = Tempfile.new(['', '.tif'])
13
- tmp_file.write(read_with_processor(source))
14
- tmp_file
15
- end
12
+ # Check if is this Processor
13
+ def self.a_name?(name)
14
+ %w(none NoneProcessor).include?(name.to_s)
15
+ end
16
16
 
17
- def self.need_crop?(*)
18
- end
17
+ # Convert Image to Tiff
18
+ def self.image_to_tif(source, _points = {})
19
+ tmp_file = Tempfile.new(['', '.tif'])
20
+ tmp_file.write(read_with_processor(source))
21
+ tmp_file
22
+ end
19
23
 
20
- def self.read_with_processor(path)
21
- File.read(path)
22
- end
24
+ # Cast instance of image
25
+ def self.read_with_processor(path)
26
+ File.read(path)
27
+ end
23
28
 
24
- def self.image?(*)
29
+ # Check if is a image
30
+ def self.image?(*)
31
+ end
32
+ end
25
33
  end
26
34
  end
@@ -1,35 +1,46 @@
1
1
  # encoding: UTF-8
2
- # Add to rtesseract a image manipulation with RMagick
3
- module RMagickProcessor
4
- def self.setup
5
- require 'rmagick'
6
- rescue LoadError
7
- # :nocov:
8
- require 'RMagick'
9
- # :nocov:
10
- end
2
+ # RTesseract class
3
+ class RTesseract
4
+ # Processor Module
5
+ module Processor
6
+ # Add to rtesseract a image manipulation with RMagick
7
+ module RMagickProcessor
8
+ # Setup Processor
9
+ def self.setup
10
+ require 'rmagick'
11
+ rescue LoadError
12
+ # :nocov:
13
+ require 'RMagick'
14
+ # :nocov:
15
+ end
11
16
 
12
- def self.a_name?(name)
13
- %w(rmagick RMagickProcessor).include?(name.to_s)
14
- end
17
+ # Check if is this Processor
18
+ def self.a_name?(name)
19
+ %w(rmagick RMagickProcessor).include?(name.to_s)
20
+ end
15
21
 
16
- def self.image_to_tif(source, _points = {})
17
- tmp_file = Tempfile.new(['', '.tif'])
18
- cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
19
- cat.crop!(_points[:x], _points[:y], _points[:w], _points[:h]) if _points.is_a?(Hash) && _points.values.compact != []
20
- cat.alpha Magick::DeactivateAlphaChannel
21
- cat.write(tmp_file.path.to_s) do
22
- # self.depth = 16
23
- self.compression = Magick::NoCompression
24
- end
25
- tmp_file
26
- end
22
+ # Convert Image to Tiff
23
+ def self.image_to_tif(source, points = {})
24
+ tmp_file = Tempfile.new(['', '.tif'])
25
+ cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
26
+ cat.crop!(points[:x], points[:y], points[:w], points[:h]) if points.is_a?(Hash) && points.values.compact != []
27
+ cat.alpha Magick::DeactivateAlphaChannel
28
+ cat.write(tmp_file.path.to_s) do
29
+ # self.depth = 16
30
+ self.compression = Magick::NoCompression
31
+ end
32
+ tmp_file
33
+ end
27
34
 
28
- def self.read_with_processor(path)
29
- Magick::Image.read(path.to_s).first
30
- end
35
+ # Cast instance of image
36
+ def self.read_with_processor(path)
37
+ Magick::Image.read(path.to_s).first
38
+ end
31
39
 
32
- def self.image?(object)
33
- object.class == Magick::Image
40
+ # Check if is a RMagick image
41
+ def self.image?(object)
42
+ object.class == Magick::Image
43
+ end
44
+ end
34
45
  end
35
46
  end
data/lib/rtesseract.rb CHANGED
@@ -1,18 +1,10 @@
1
1
  # encoding: UTF-8
2
2
  require 'pathname'
3
3
  require 'tempfile'
4
- require 'utils'
5
4
 
5
+ require 'rtesseract/utils'
6
6
  require 'rtesseract/configuration'
7
7
  require 'rtesseract/errors'
8
- require 'rtesseract/mixed'
9
- require 'rtesseract/box'
10
- require 'rtesseract/box_char'
11
-
12
- # Processors
13
- require 'processors/rmagick.rb'
14
- require 'processors/mini_magick.rb'
15
- require 'processors/none.rb'
16
8
 
17
9
  # Ruby wrapper for Tesseract OCR
18
10
  class RTesseract
@@ -23,58 +15,30 @@ class RTesseract
23
15
  def initialize(src = '', options = {})
24
16
  self.configuration = RTesseract.local_config(options)
25
17
  @options = options || {}
26
- @value, @points = [nil, {}]
27
- @processor = RTesseract.choose_processor!(self.configuration.processor)
18
+ @value = nil
19
+ @points = {}
20
+ @processor = RTesseract::Processor.choose_processor!(configuration.processor)
28
21
  @source = @processor.image?(src) ? src : Pathname.new(src)
29
22
  initialize_hook
30
23
  end
31
24
 
25
+ # Hook to end of initialize method
32
26
  def initialize_hook
33
27
  end
34
28
 
35
- def self.read(src = nil, options = {})
36
- fail RTesseract::ImageNotSelectedError if src.nil?
37
- processor = RTesseract.choose_processor!(options.option(:processor, nil))
38
- image = processor.read_with_processor(src.to_s)
39
- yield(image)
40
- object = RTesseract.new('', options).from_blob(image.to_blob)
41
- object
42
- end
43
-
44
- def read
45
- image = @processor.read_with_processor(@source.to_s)
46
- new_image = yield(image)
47
- from_blob(new_image.to_blob, File.extname(@source.to_s))
48
- self
49
- end
50
-
29
+ # Define the source
51
30
  def source=(src)
52
31
  @value = nil
53
32
  @source = @processor.image?(src) ? src : Pathname.new(src)
54
33
  end
55
34
 
56
35
  # Crop image to convert
57
- def crop!(_points = {})
36
+ def crop!(points = {})
58
37
  @value = nil
59
- @points = _points
38
+ @points = points
60
39
  self
61
40
  end
62
41
 
63
- # Remove files
64
- def remove_file(files = [])
65
- files.each do |file|
66
- if file.is_a?(Tempfile)
67
- file.close
68
- file.unlink
69
- else
70
- File.unlink(file)
71
- end
72
- end
73
- true
74
- rescue => error
75
- raise RTesseract::TempFilesNotRemovedError.new(error: error, files: files)
76
- end
77
-
78
42
  # Select the language
79
43
  # ===Languages
80
44
  ## * eng - English
@@ -88,58 +52,56 @@ class RTesseract
88
52
  ## * vie - Vietnamese
89
53
  ## Note: Make sure you have installed the language to tesseract
90
54
  def lang
91
- language = "#{self.configuration.lang}".strip.downcase
92
- LANGUAGES.each do |value, names|
93
- return " -l #{value} " if names.include? language
94
- end
95
- return " -l #{language} " if language.size > 0
55
+ language = (configuration.lang || 'eng').to_s.strip.downcase
56
+ " -l #{LANGUAGES[language] || language} "
57
+ rescue
96
58
  ''
59
+ end
60
+
61
+ # Convert option to command
62
+ def option_to_string(prefix, value = nil)
63
+ (value.nil? ? '' : " #{prefix} #{value} ")
97
64
  rescue
98
65
  ''
99
66
  end
100
67
 
101
68
  # Page Segment Mode
102
69
  def psm
103
- (self.configuration.psm.nil? ? '' : " -psm #{self.configuration.psm} ")
104
- rescue
105
- ''
70
+ option_to_string('-psm', configuration.psm)
106
71
  end
107
72
 
108
73
  # Tessdata Dir
109
74
  def tessdata_dir
110
- (self.configuration.tessdata_dir.nil? ? '' : " --tessdata-dir #{self.configuration.tessdata_dir} ")
111
- rescue
112
- ''
75
+ option_to_string('--tessdata-dir', configuration.tessdata_dir)
113
76
  end
114
77
 
115
78
  # User Words
116
79
  def user_words
117
- (self.configuration.user_words.nil? ? '' : " --user-words #{self.configuration.user_words} ")
118
- rescue
119
- ''
80
+ option_to_string('--user-words', configuration.user_words)
120
81
  end
121
82
 
122
83
  # User Patterns
123
84
  def user_patterns
124
- (self.configuration.user_patterns.nil? ? '' : " --user-patterns #{self.configuration.user_patterns} ")
125
- rescue
126
- ''
85
+ option_to_string('--user-patterns', configuration.user_patterns)
127
86
  end
128
87
 
129
88
  # Options on line
130
89
  def options_cmd
131
- self.configuration.options_cmd
90
+ configuration.options_cmd
132
91
  end
133
92
 
93
+ # Hook to before config
134
94
  def config_hook
135
95
  end
136
96
 
97
+ # Convert configurations
137
98
  def config
138
99
  @options ||= {}
139
100
  config_hook
140
101
  @options.map { |k, v| "#{k} #{v}" }.join("\n")
141
102
  end
142
103
 
104
+ # Write config to file
143
105
  def config_file
144
106
  config_hook
145
107
  return '' if @options == {}
@@ -151,34 +113,41 @@ class RTesseract
151
113
 
152
114
  # TODO: Clear console for MacOS or Windows
153
115
  def clear_console_output
154
- return '' if self.configuration.debug
116
+ return '' if configuration.debug
155
117
  return '2>/dev/null' if File.exist?('/dev/null') # Linux console clear
156
118
  end
157
119
 
120
+ # Get image
158
121
  def image
159
122
  (@image = @processor.image_to_tif(@source, @points)).path
160
123
  end
161
124
 
125
+ # Extension of file
162
126
  def file_ext
163
127
  '.txt'
164
128
  end
165
129
 
130
+ # Rand file path
166
131
  def text_file
167
132
  @text_file = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
168
133
  end
169
134
 
135
+ # Full path of file with extension
170
136
  def text_file_with_ext(ext = nil)
171
137
  [@text_file, ext || file_ext].join('')
172
138
  end
173
139
 
140
+ # Run command
174
141
  def convert_command
175
- `#{self.configuration.command} "#{image}" "#{text_file}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{self.configuration.options_cmd.join(' ')}`
142
+ `#{configuration.command} "#{image}" "#{text_file}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{configuration.options_cmd.join(' ')}`
176
143
  end
177
144
 
145
+ # Read result file
178
146
  def convert_text
179
147
  @value = File.read(text_file_with_ext).to_s
180
148
  end
181
149
 
150
+ # Hook to convert
182
151
  def after_convert_hook
183
152
  end
184
153
 
@@ -187,21 +156,7 @@ class RTesseract
187
156
  convert_command
188
157
  after_convert_hook
189
158
  convert_text
190
- remove_file([@image, text_file_with_ext])
191
- rescue => error
192
- raise RTesseract::ConversionError.new(error), error, caller
193
- end
194
-
195
- # Read image from memory blob
196
- def from_blob(blob, ext = '')
197
- blob_file = Tempfile.new(['blob', ext], encoding: 'ascii-8bit')
198
- blob_file.binmode.write(blob)
199
- blob_file.rewind
200
- blob_file.flush
201
- self.source = blob_file.path
202
- convert
203
- remove_file([blob_file])
204
- self
159
+ RTesseract::Utils.remove_files([@image, text_file_with_ext])
205
160
  rescue => error
206
161
  raise RTesseract::ConversionError.new(error), error, caller
207
162
  end
@@ -220,19 +175,17 @@ class RTesseract
220
175
 
221
176
  # Remove spaces and break-lines
222
177
  def to_s_without_spaces
223
- to_s.gsub(' ', '').gsub("\n", '').gsub("\r", '')
224
- end
225
-
226
- def self.choose_processor!(processor)
227
- processor =
228
- if MiniMagickProcessor.a_name?(processor.to_s)
229
- MiniMagickProcessor
230
- elsif NoneProcessor.a_name?(processor.to_s)
231
- NoneProcessor
232
- else
233
- RMagickProcessor
234
- end
235
- processor.setup
236
- processor
178
+ to_s.delete(' ').delete("\n").delete("\r")
237
179
  end
238
180
  end
181
+
182
+ require 'rtesseract/mixed'
183
+ require 'rtesseract/box'
184
+ require 'rtesseract/box_char'
185
+ require 'rtesseract/blob'
186
+ require 'rtesseract/processor'
187
+
188
+ # Processors
189
+ require 'processors/rmagick.rb'
190
+ require 'processors/mini_magick.rb'
191
+ require 'processors/none.rb'
@@ -0,0 +1,34 @@
1
+ # Blob methods
2
+ class RTesseract
3
+ # Read image from memory blob
4
+ def self.read(src = nil, options = {})
5
+ fail RTesseract::ImageNotSelectedError if src.nil?
6
+ processor = RTesseract::Processor.choose_processor!(options.option(:processor, nil))
7
+ image = processor.read_with_processor(src.to_s)
8
+ yield(image)
9
+ object = RTesseract.new('', options).from_blob(image.to_blob)
10
+ object
11
+ end
12
+
13
+ # Read image from memory blob
14
+ def read
15
+ image = @processor.read_with_processor(@source.to_s)
16
+ new_image = yield(image)
17
+ from_blob(new_image.to_blob, File.extname(@source.to_s))
18
+ self
19
+ end
20
+
21
+ # Read image from memory blob
22
+ def from_blob(blob, ext = '')
23
+ blob_file = Tempfile.new(['blob', ext], encoding: 'ascii-8bit')
24
+ blob_file.binmode.write(blob)
25
+ blob_file.rewind
26
+ blob_file.flush
27
+ self.source = blob_file.path
28
+ convert
29
+ RTesseract::Utils.remove_files([blob_file])
30
+ self
31
+ rescue => error
32
+ raise RTesseract::ConversionError.new(error), error, caller
33
+ end
34
+ end
@@ -2,37 +2,45 @@
2
2
  require 'nokogiri'
3
3
  require 'fileutils'
4
4
 
5
+ # RTesseract
5
6
  class RTesseract
6
7
  # Class to read char positions from an image
7
8
  class Box < RTesseract
9
+ # Setting value as blank array
8
10
  def initialize_hook
9
- @value, @points = [[], {}]
11
+ @value = []
10
12
  end
11
13
 
14
+ # Additional options to config file
12
15
  def config_hook
13
16
  @options['tessedit_create_hocr'] = 1 # Split Words configuration
14
17
  end
15
18
 
19
+ # Words converted
16
20
  def words
17
21
  convert if @value == []
18
22
  @value
19
23
  end
20
24
 
25
+ # Extension of file
21
26
  def file_ext
22
27
  '.hocr'
23
28
  end
24
29
 
30
+ # Read the result file
25
31
  def parse_file
26
32
  html = Nokogiri::HTML(File.read(text_file_with_ext))
27
33
  html.css('span.ocrx_word, span.ocr_word')
28
34
  end
29
35
 
36
+ # Return words to value
30
37
  def convert_text
31
38
  text_objects = []
32
39
  parse_file.each { |word| text_objects << BoxParser.new(word).to_h }
33
40
  @value = text_objects
34
41
  end
35
42
 
43
+ # Move file html to hocr
36
44
  def after_convert_hook
37
45
  FileUtils.mv(text_file_with_ext('.html'), text_file_with_ext) rescue nil
38
46
  end
@@ -56,6 +64,7 @@ class RTesseract
56
64
  @attributes = title.gsub(';', '').split(' ')
57
65
  end
58
66
 
67
+ # Hash of word and position
59
68
  def to_h
60
69
  {
61
70
  word: @word.text,
@@ -1,4 +1,5 @@
1
1
  # encoding: UTF-8
2
+ # RTesseract
2
3
  class RTesseract
3
4
  # Class to read char positions from an image
4
5
  class BoxChar < Box
@@ -8,10 +9,12 @@ class RTesseract
8
9
 
9
10
  alias_method :characters, :words
10
11
 
12
+ # Extension of file
11
13
  def file_ext
12
14
  '.box'
13
15
  end
14
16
 
17
+ # Read the result file
15
18
  def parse_file
16
19
  File.read(text_file_with_ext).to_s
17
20
  end
@@ -1,12 +1,16 @@
1
- # Configuration
1
+ # RTesseract
2
2
  class RTesseract
3
3
  # Aliases to languages names
4
4
  LANGUAGES = {
5
- 'eng' => %w(en en-us english),
6
- 'ita' => %w(it),
7
- 'por' => %w(pt pt-br portuguese),
8
- 'spa' => %w(sp)
9
- }
5
+ 'en' => 'eng',
6
+ 'en-us' => 'eng',
7
+ 'english' => 'eng',
8
+ 'pt' => 'por',
9
+ 'pt-br' => 'por',
10
+ 'portuguese' => 'por',
11
+ 'it' => 'ita',
12
+ 'sp' => 'spa'
13
+ }.freeze
10
14
 
11
15
  # Configuration class
12
16
  class Configuration
@@ -16,16 +20,19 @@ class RTesseract
16
20
  @processor = 'rmagick'
17
21
  end
18
22
 
23
+ # Global configuration
19
24
  def parent
20
25
  @parent ||= RTesseract.configuration || RTesseract::Configuration.new
21
26
  end
22
27
 
28
+ # Set value of option
23
29
  def option(options, name, default = nil)
24
30
  self.instance_variable_set("@#{name}", options.option(name, parent.send(name)) || default)
25
31
  end
26
32
 
33
+ # Return the values of options
27
34
  def load_options(options, names = [])
28
- names.each{ |name| option(options, name, nil) }
35
+ names.each { |name| option(options, name, nil) }
29
36
  end
30
37
  end
31
38
 
@@ -38,6 +45,7 @@ class RTesseract
38
45
  yield(configuration)
39
46
  end
40
47
 
48
+ # Default command
41
49
  def self.default_command
42
50
  TesseractBin::Executables[:tesseract] || 'tesseract'
43
51
  rescue
@@ -49,7 +57,7 @@ class RTesseract
49
57
  RTesseract::Configuration.new.tap do |config|
50
58
  config.command = config.option(options, :command, RTesseract.default_command)
51
59
  config.processor = config.option(options, :processor, 'rmagick')
52
- config.load_options(options, [ :lang, :psm, :tessdata_dir, :user_words, :user_patterns ])
60
+ config.load_options(options, [:lang, :psm, :tessdata_dir, :user_words, :user_patterns])
53
61
  config.debug = config.option(options, :debug, false)
54
62
  config.options_cmd = [options.option(:options, nil)].flatten.compact
55
63
  end
@@ -1,3 +1,4 @@
1
+ # RTesseract
1
2
  class RTesseract
2
3
  # Class of error with storage of normal errors
3
4
  class ErrorWithMemory < StandardError
@@ -1,4 +1,5 @@
1
1
  # encoding: UTF-8
2
+ # RTesseract
2
3
  class RTesseract
3
4
  # Class to read an image from specified areas
4
5
  class Mixed
@@ -12,11 +13,13 @@ class RTesseract
12
13
  yield self if block_given?
13
14
  end
14
15
 
15
- def area(_points)
16
+ # Add areas
17
+ def area(points)
16
18
  @value = ''
17
- @areas << _points # { x: x, y: y, width: width, height: height }
19
+ @areas << points
18
20
  end
19
21
 
22
+ # Clear areas
20
23
  def clear_areas
21
24
  @areas = []
22
25
  end
@@ -25,7 +28,7 @@ class RTesseract
25
28
  def convert
26
29
  @value = []
27
30
  @areas.each_with_object(RTesseract.new(@source.to_s, @options.dup)) do |area, image|
28
- image.crop!(area) # area[:x], area[:y], area[:width], area[:height])
31
+ image.crop!(area)
29
32
  @value << image.to_s
30
33
  end
31
34
  rescue => error
@@ -45,7 +48,7 @@ class RTesseract
45
48
 
46
49
  # Remove spaces and break-lines
47
50
  def to_s_without_spaces
48
- to_s.gsub(' ', '').gsub("\n", '').gsub("\r", '')
51
+ to_s.delete(' ').delete("\n").delete("\r")
49
52
  end
50
53
  end
51
54
  end
@@ -0,0 +1,19 @@
1
+ # RTesseract
2
+ class RTesseract
3
+ # Processor management
4
+ module Processor
5
+ # Return the processor
6
+ def self.choose_processor!(processor)
7
+ processor =
8
+ if RTesseract::Processor::MiniMagickProcessor.a_name?(processor.to_s)
9
+ MiniMagickProcessor
10
+ elsif RTesseract::Processor::NoneProcessor.a_name?(processor.to_s)
11
+ NoneProcessor
12
+ else
13
+ RMagickProcessor
14
+ end
15
+ processor.setup
16
+ processor
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,34 @@
1
+ # RTesseract
2
+ class RTesseract
3
+ # Some utils methods
4
+ module Utils
5
+ # Remove files or Tempfile
6
+ def self.remove_files(files = [])
7
+ files.each do |file|
8
+ self.remove_file(file)
9
+ end
10
+ true
11
+ rescue => error
12
+ raise RTesseract::TempFilesNotRemovedError.new(error: error, files: files)
13
+ end
14
+
15
+ # Remove file or Tempfile
16
+ def self.remove_file(file)
17
+ if file.is_a?(Tempfile)
18
+ file.close
19
+ file.unlink
20
+ else
21
+ File.unlink(file)
22
+ end
23
+ true
24
+ end
25
+ end
26
+ end
27
+
28
+ # Hash
29
+ class Hash
30
+ # return the value and remove from hash
31
+ def option(attr_name, default)
32
+ delete(attr_name.to_s) || delete(attr_name) || default
33
+ end
34
+ end
data/rtesseract.gemspec CHANGED
@@ -2,16 +2,16 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: rtesseract 2.0.0 ruby lib
5
+ # stub: rtesseract 2.0.1 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "rtesseract"
9
- s.version = "2.0.0"
9
+ s.version = "2.0.1"
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib"]
13
13
  s.authors = ["Danilo Jeremias da Silva"]
14
- s.date = "2016-04-19"
14
+ s.date = "2016-05-17"
15
15
  s.description = "Ruby library for working with the Tesseract OCR."
16
16
  s.email = "dannnylo@gmail.com"
17
17
  s.extra_rdoc_files = [
@@ -22,6 +22,7 @@ Gem::Specification.new do |s|
22
22
  ".document",
23
23
  ".rspec",
24
24
  ".travis.yml",
25
+ "CHANGELOG.md",
25
26
  "Gemfile",
26
27
  "Gemfile.lock",
27
28
  "LICENSE.txt",
@@ -32,13 +33,16 @@ Gem::Specification.new do |s|
32
33
  "lib/processors/none.rb",
33
34
  "lib/processors/rmagick.rb",
34
35
  "lib/rtesseract.rb",
36
+ "lib/rtesseract/blob.rb",
35
37
  "lib/rtesseract/box.rb",
36
38
  "lib/rtesseract/box_char.rb",
37
39
  "lib/rtesseract/configuration.rb",
38
40
  "lib/rtesseract/errors.rb",
39
41
  "lib/rtesseract/mixed.rb",
40
- "lib/utils.rb",
42
+ "lib/rtesseract/processor.rb",
43
+ "lib/rtesseract/utils.rb",
41
44
  "rtesseract.gemspec",
45
+ "spec/configs/eng.user-words.txt",
42
46
  "spec/images/README.pdf",
43
47
  "spec/images/blank.tif",
44
48
  "spec/images/mixed.tif",
@@ -0,0 +1,13 @@
1
+ you
2
+ are
3
+ a
4
+ friend
5
+ you
6
+ speak
7
+ the
8
+ password
9
+ and
10
+ the
11
+ doors
12
+ will
13
+ open
@@ -6,17 +6,7 @@ describe 'Rtesseract::BoxChar' do
6
6
  @path = Pathname.new(__FILE__.gsub('rtesseract_box_char_spec.rb', '')).expand_path
7
7
  @image_tiff = @path.join('images', 'test.tif').to_s
8
8
  @words_image = @path.join('images', 'test_words.png').to_s
9
- end
10
-
11
- it 'bounding box by char' do
12
- expect(RTesseract::BoxChar.new(@image_tiff).characters.is_a?(Array)).to eql(true)
13
- expect(RTesseract::BoxChar.new(@image_tiff).characters).to eql([
14
- { char: '4', x_start: 145, y_start: 14, x_end: 159, y_end: 33 },
15
- { char: '3', x_start: 184, y_start: 14, x_end: 196, y_end: 33 },
16
- { char: 'X', x_start: 222, y_start: 14, x_end: 238, y_end: 32 },
17
- { char: 'F', x_start: 260, y_start: 14, x_end: 273, y_end: 32 }])
18
-
19
- expect(RTesseract::BoxChar.new(@words_image).characters).to eql([
9
+ @values = [
20
10
  { char: 'I', x_start: 52, y_start: 91, x_end: 54, y_end: 104 },
21
11
  { char: 'f', x_start: 56, y_start: 91, x_end: 63, y_end: 105 },
22
12
  { char: 'y', x_start: 69, y_start: 87, x_end: 79, y_end: 101 },
@@ -72,7 +62,18 @@ describe 'Rtesseract::BoxChar' do
72
62
  { char: 'p', x_start: 228, y_start: 43, x_end: 237, y_end: 57 },
73
63
  { char: 'e', x_start: 238, y_start: 47, x_end: 248, y_end: 57 },
74
64
  { char: 'n', x_start: 250, y_start: 47, x_end: 258, y_end: 57 },
75
- { char: '.', x_start: 261, y_start: 47, x_end: 263, y_end: 49 }])
65
+ { char: '.', x_start: 261, y_start: 47, x_end: 263, y_end: 49 }]
66
+ end
67
+
68
+ it 'bounding box by char' do
69
+ expect(RTesseract::BoxChar.new(@image_tiff).characters.is_a?(Array)).to eql(true)
70
+ expect(RTesseract::BoxChar.new(@image_tiff).characters).to eql([
71
+ { char: '4', x_start: 145, y_start: 14, x_end: 159, y_end: 33 },
72
+ { char: '3', x_start: 184, y_start: 14, x_end: 196, y_end: 33 },
73
+ { char: 'X', x_start: 222, y_start: 14, x_end: 238, y_end: 32 },
74
+ { char: 'F', x_start: 260, y_start: 14, x_end: 273, y_end: 32 }])
75
+
76
+ expect(RTesseract::BoxChar.new(@words_image).characters).to eql(@values)
76
77
 
77
78
  expect { RTesseract::BoxChar.new(@image_tiff, command: 'tesseract_error').to_s }.to raise_error(RTesseract::ConversionError)
78
79
  expect { RTesseract::BoxChar.new(@image_tiff + '_not_exist').to_s }.to raise_error(RTesseract::ImageNotSelectedError)
@@ -81,6 +81,7 @@ describe 'Rtesseract' do
81
81
  expect(RTesseract.new(@image_tif, lang: 'eng').to_s_without_spaces).to eql('43XF')
82
82
 
83
83
  expect(RTesseract.new(@image_tif, lang: 'eng').lang).to eql(' -l eng ')
84
+ expect(RTesseract.new(@image_tif, lang: 'it').lang).to eql(' -l ita ')
84
85
 
85
86
  # Invalid lang object
86
87
  expect(RTesseract.new(@image_tif, lang: MakeStringError.new).lang).to eql('')
@@ -98,6 +99,7 @@ describe 'Rtesseract' do
98
99
  expect(RTesseract.new(@image_tif, chop_enable: 0).config).to eql('chop_enable 0')
99
100
  expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0).config).to eql("chop_enable 0\nenable_assoc 0")
100
101
  expect(RTesseract.new(@image_tif, chop_enable: 0).to_s_without_spaces).to eql('43XF')
102
+ expect(RTesseract.new(@image_tif, tessedit_char_whitelist: "ABCDEF12345").to_s_without_spaces).to eql('43F')
101
103
  end
102
104
 
103
105
  it ' crop image' do
@@ -121,14 +123,14 @@ describe 'Rtesseract' do
121
123
 
122
124
  it ' use a instance' do
123
125
  expect(RTesseract.new(Magick::Image.read(@image_tif.to_s).first).to_s_without_spaces).to eql('43XF')
124
- expect(RMagickProcessor.a_name?('teste')).to eql(false)
125
- expect(RMagickProcessor.a_name?('rmagick')).to eql(true)
126
- expect(RMagickProcessor.a_name?('RMagickProcessor')).to eql(true)
127
- expect(MiniMagickProcessor.a_name?('teste')).to eql(false)
128
- expect(MiniMagickProcessor.a_name?('mini_magick')).to eql(true)
129
- expect(MiniMagickProcessor.a_name?('MiniMagickProcessor')).to eql(true)
130
- expect(NoneProcessor.a_name?('none')).to eql(true)
131
- expect(NoneProcessor.a_name?('NoneProcessor')).to eql(true)
126
+ expect(RTesseract::Processor::RMagickProcessor.a_name?('teste')).to eql(false)
127
+ expect(RTesseract::Processor::RMagickProcessor.a_name?('rmagick')).to eql(true)
128
+ expect(RTesseract::Processor::RMagickProcessor.a_name?('RMagickProcessor')).to eql(true)
129
+ expect(RTesseract::Processor::MiniMagickProcessor.a_name?('teste')).to eql(false)
130
+ expect(RTesseract::Processor::MiniMagickProcessor.a_name?('mini_magick')).to eql(true)
131
+ expect(RTesseract::Processor::MiniMagickProcessor.a_name?('MiniMagickProcessor')).to eql(true)
132
+ expect(RTesseract::Processor::NoneProcessor.a_name?('none')).to eql(true)
133
+ expect(RTesseract::Processor::NoneProcessor.a_name?('NoneProcessor')).to eql(true)
132
134
  end
133
135
 
134
136
  it ' change image in a block' do
@@ -172,10 +174,9 @@ describe 'Rtesseract' do
172
174
  end
173
175
 
174
176
  it 'remove a file' do
175
- rtesseract = RTesseract.new('.')
176
- rtesseract.remove_file(Tempfile.new('config'))
177
+ RTesseract::Utils.remove_files(Tempfile.new('config'))
177
178
 
178
- expect { rtesseract.remove_file(Pathname.new(Dir.tmpdir).join('test_not_exists')) }.to raise_error(RTesseract::TempFilesNotRemovedError)
179
+ expect { RTesseract::Utils.remove_files(Pathname.new(Dir.tmpdir).join('test_not_exists')) }.to raise_error(RTesseract::TempFilesNotRemovedError)
179
180
  end
180
181
 
181
182
  it ' support default config processors' do
@@ -201,7 +202,6 @@ describe 'Rtesseract' do
201
202
  RTesseract.configure { |config| config.psm = 7 }
202
203
  expect(RTesseract.new(@image_tif).psm).to eql(' -psm 7 ')
203
204
 
204
-
205
205
  RTesseract.configure { |config| config.tessdata_dir = '/tmp/test' }
206
206
  expect(RTesseract.new(@image_tif).tessdata_dir).to eql(' --tessdata-dir /tmp/test ')
207
207
 
@@ -220,5 +220,7 @@ describe 'Rtesseract' do
220
220
  expect(RTesseract.new(@image_tif, tessdata_dir: MakeStringError.new).tessdata_dir).to eql('')
221
221
  expect(RTesseract.new(@image_tif, user_words: MakeStringError.new).user_words).to eql('')
222
222
  expect(RTesseract.new(@image_tif, user_patterns: MakeStringError.new).user_patterns).to eql('')
223
+
224
+ # expect(RTesseract.new(@path.join('images', 'test_words.png').to_s, psm: 3, user_words: @path.join('configs', 'eng.user-words.txt').to_s).to_s).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n\n")
223
225
  end
224
226
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rtesseract
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danilo Jeremias da Silva
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-19 00:00:00.000000000 Z
11
+ date: 2016-05-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -119,6 +119,7 @@ files:
119
119
  - ".document"
120
120
  - ".rspec"
121
121
  - ".travis.yml"
122
+ - CHANGELOG.md
122
123
  - Gemfile
123
124
  - Gemfile.lock
124
125
  - LICENSE.txt
@@ -129,13 +130,16 @@ files:
129
130
  - lib/processors/none.rb
130
131
  - lib/processors/rmagick.rb
131
132
  - lib/rtesseract.rb
133
+ - lib/rtesseract/blob.rb
132
134
  - lib/rtesseract/box.rb
133
135
  - lib/rtesseract/box_char.rb
134
136
  - lib/rtesseract/configuration.rb
135
137
  - lib/rtesseract/errors.rb
136
138
  - lib/rtesseract/mixed.rb
137
- - lib/utils.rb
139
+ - lib/rtesseract/processor.rb
140
+ - lib/rtesseract/utils.rb
138
141
  - rtesseract.gemspec
142
+ - spec/configs/eng.user-words.txt
139
143
  - spec/images/README.pdf
140
144
  - spec/images/blank.tif
141
145
  - spec/images/mixed.tif
data/lib/utils.rb DELETED
@@ -1,5 +0,0 @@
1
- class Hash
2
- def option(attr_name, default)
3
- delete(attr_name.to_s) || delete(attr_name) || default
4
- end
5
- end