rtesseract 2.2.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +5 -5
  2. data/.document +1 -2
  3. data/.gitignore +12 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +13 -10
  6. data/CODE_OF_CONDUCT.md +74 -0
  7. data/Gemfile +4 -17
  8. data/Gemfile.lock +40 -85
  9. data/LICENSE.txt +18 -17
  10. data/README.md +137 -0
  11. data/Rakefile +4 -48
  12. data/bin/console +14 -0
  13. data/bin/setup +8 -0
  14. data/lib/rtesseract.rb +22 -220
  15. data/lib/rtesseract/box.rb +15 -60
  16. data/lib/rtesseract/check.rb +14 -0
  17. data/lib/rtesseract/command.rb +41 -0
  18. data/lib/rtesseract/configuration.rb +15 -64
  19. data/lib/rtesseract/pdf.rb +18 -0
  20. data/lib/rtesseract/text.rb +9 -0
  21. data/lib/rtesseract/tsv.rb +18 -0
  22. data/lib/rtesseract/version.rb +3 -0
  23. data/rtesseract.gemspec +27 -98
  24. metadata +36 -85
  25. data/README.rdoc +0 -156
  26. data/VERSION +0 -1
  27. data/lib/processors/mini_magick.rb +0 -43
  28. data/lib/processors/none.rb +0 -34
  29. data/lib/processors/rmagick.rb +0 -46
  30. data/lib/rtesseract/blob.rb +0 -34
  31. data/lib/rtesseract/box_char.rb +0 -31
  32. data/lib/rtesseract/errors.rb +0 -21
  33. data/lib/rtesseract/mixed.rb +0 -54
  34. data/lib/rtesseract/processor.rb +0 -19
  35. data/lib/rtesseract/utils.rb +0 -44
  36. data/lib/rtesseract/uzn.rb +0 -47
  37. data/spec/configs/eng.user-words.txt +0 -13
  38. data/spec/images/README.pdf +0 -0
  39. data/spec/images/blank.tif +0 -0
  40. data/spec/images/mixed.tif +0 -0
  41. data/spec/images/orientation_reverse.png +0 -0
  42. data/spec/images/test with spaces.tif +0 -0
  43. data/spec/images/test-pdf.png +0 -0
  44. data/spec/images/test.bmp +0 -0
  45. data/spec/images/test.jpg +0 -0
  46. data/spec/images/test.png +0 -0
  47. data/spec/images/test.tif +0 -0
  48. data/spec/images/test1.tif +0 -0
  49. data/spec/images/test_words.png +0 -0
  50. data/spec/rtesseract_box_char_spec.rb +0 -82
  51. data/spec/rtesseract_box_spec.rb +0 -36
  52. data/spec/rtesseract_mixed_spec.rb +0 -49
  53. data/spec/rtesseract_spec.rb +0 -282
  54. data/spec/rtesseract_uzn_spec.rb +0 -56
  55. data/spec/spec_helper.rb +0 -21
data/Rakefile CHANGED
@@ -1,50 +1,6 @@
1
- # encoding: utf-8
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
2
3
 
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts 'Run `bundle install` to install missing gems'
10
- exit e.status_code
11
- end
12
- require 'rake'
4
+ RSpec::Core::RakeTask.new(:spec)
13
5
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
17
- gem.name = 'rtesseract'
18
- gem.homepage = 'http://github.com/dannnylo/rtesseract'
19
- gem.license = 'MIT'
20
- gem.summary = 'Ruby library for working with the Tesseract OCR.'
21
- gem.description = 'Ruby library for working with the Tesseract OCR.'
22
- gem.email = 'dannnylo@gmail.com'
23
- gem.authors = ['Danilo Jeremias da Silva']
24
- # dependencies defined in Gemfile
25
- end
26
- Jeweler::RubygemsDotOrgTasks.new
27
-
28
- require 'rspec/core'
29
- require 'rspec/core/rake_task'
30
- RSpec::Core::RakeTask.new(:spec) do |spec|
31
- spec.pattern = FileList['spec/**/*_spec.rb']
32
- end
33
-
34
- desc 'Code coverage detail'
35
- task :simplecov do
36
- ENV['COVERAGE'] = 'true'
37
- Rake::Task['spec'].execute
38
- end
39
-
40
- task default: :spec
41
-
42
- require 'rdoc/task'
43
- Rake::RDocTask.new do |rdoc|
44
- version = File.exist?('VERSION') ? File.read('VERSION') : ''
45
-
46
- rdoc.rdoc_dir = 'rdoc'
47
- rdoc.title = "rtesseract #{version}"
48
- rdoc.rdoc_files.include('README*')
49
- rdoc.rdoc_files.include('lib/**/*.rb')
50
- end
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "rtesseract"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/lib/rtesseract.rb CHANGED
@@ -1,244 +1,46 @@
1
- # encoding: UTF-8
2
- require 'pathname'
3
- require 'tempfile'
1
+ require "rtesseract/check"
2
+ require "rtesseract/configuration"
3
+ require "rtesseract/command"
4
+ require "rtesseract/text"
5
+ require "rtesseract/pdf"
6
+ require "rtesseract/box"
7
+ require "rtesseract/tsv"
4
8
 
5
- require 'rtesseract/utils'
6
- require 'rtesseract/configuration'
7
- require 'rtesseract/errors'
8
-
9
- # Ruby wrapper for Tesseract OCR
10
9
  class RTesseract
11
- attr_accessor :configuration
12
- attr_reader :processor
13
- attr_reader :source
14
-
15
- def initialize(src = '', options = {})
16
- self.configuration = RTesseract.local_config(options)
17
- @options = options || {}
18
- @points = {}
19
- @processor = RTesseract::Processor.choose_processor!(configuration.processor)
20
- self.source = src
21
- initialize_hook
22
- end
23
-
24
- # Hook to end of initialize method
25
- def initialize_hook
26
- end
27
-
28
- # Define the source
29
- def source=(src)
30
- @value = nil
31
- @pdf_path = nil
32
- @source = @processor.image?(src) ? src : Pathname.new(src)
33
- end
34
-
35
- # Crop image to convert
36
- def crop!(points = {})
37
- @value = nil
38
- @points = points
39
- self
40
- end
41
-
42
- # Select the language
43
- # ===Languages
44
- ## * eng - English
45
- ## * deu - German
46
- ## * deu-f - German fraktur
47
- ## * fra - French
48
- ## * ita - Italian
49
- ## * nld - Dutch
50
- ## * por - Portuguese
51
- ## * spa - Spanish
52
- ## * vie - Vietnamese
53
- ## Note: Make sure you have installed the language to tesseract
54
- def lang
55
- language = (configuration.lang || 'eng').to_s.strip.downcase
56
- " -l #{LANGUAGES[language] || language} "
57
- rescue
58
- ''
59
- end
60
-
61
- # Convert option to command
62
- def option_to_string(prefix, value = nil)
63
- (value.nil? ? '' : " #{prefix} #{value} ")
64
- rescue
65
- ''
66
- end
67
-
68
- # Page Segment Mode
69
- def psm
70
- option_to_string('-psm', configuration.psm)
71
- end
72
-
73
- # Engine Mode
74
- def oem
75
- option_to_string '--oem', configuration.oem
76
- end
77
-
78
- # Tessdata Dir
79
- def tessdata_dir
80
- option_to_string('--tessdata-dir', configuration.tessdata_dir)
81
- end
82
-
83
- # User Words
84
- def user_words
85
- option_to_string('--user-words', configuration.user_words)
86
- end
87
-
88
- # User Patterns
89
- def user_patterns
90
- option_to_string('--user-patterns', configuration.user_patterns)
91
- end
92
-
93
- # Options on line
94
- def options_cmd
95
- configuration.options_cmd
96
- end
97
-
98
- # Hook to before config
99
- def config_hook
100
- end
101
-
102
- # Convert configurations
103
- def config
104
- @options ||= {}
105
- config_hook
106
- @options.map { |k, v| "#{k} #{v}" }.join("\n")
107
- end
10
+ class Error < StandardError; end
108
11
 
109
- # Write config to file
110
- def config_file
111
- config_hook
112
- return '' if @options == {}
113
- conf = Tempfile.new('config')
114
- conf.write(config)
115
- conf.flush
116
- conf.path
117
- end
118
-
119
- # TODO: Clear console for MacOS or Windows
120
- def clear_console_output
121
- return '' if configuration.debug
122
- return '2>/dev/null' if File.exist?('/dev/null') # Linux console clear
123
- end
124
-
125
- # Get image
126
- def image
127
- (@image = @processor.image_to_tif(@source, @points)).path
128
- end
129
-
130
- # Extension of file
131
- def file_ext
132
- '.txt'
133
- end
134
-
135
- # Detect version number
136
- def tesseract_version
137
- RTesseract::Utils.version_number
138
- end
139
-
140
-
141
- # Rand file path
142
- def file_dest
143
- @file_dest = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
144
- end
145
-
146
- # Full path of file with txt extension
147
- def file_with_ext(ext = nil)
148
- [@file_dest, ext || file_ext].join('')
149
- end
150
-
151
- # Run command
152
- def convert_command
153
- `#{configuration.command} "#{image}" "#{file_dest}" #{lang} #{oem} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}`
154
- end
12
+ check_version!
155
13
 
156
- # Is pdf output?
157
- def pdf?
158
- options_cmd.include? 'pdf'
159
- end
14
+ attr_reader :config, :source
160
15
 
161
- # Read result file
162
- def convert_text
163
- @value = File.read(file_with_ext).to_s
16
+ def initialize(src = '', options = {})
17
+ @source = src
18
+ @config = RTesseract.config.merge(options)
164
19
  end
165
20
 
166
- # Store pdf result path
167
- def convert_pdf
168
- @pdf_path = file_with_ext('.pdf')
21
+ def to_box
22
+ Box.run(@source, config)
169
23
  end
170
24
 
171
- # Convert result to proper type
172
- def convert_result
173
- if pdf?
174
- convert_pdf
175
- else
176
- convert_text
177
- RTesseract::Utils.remove_files([@image, file_with_ext])
178
- end
25
+ def words
26
+ to_box.map { |word| word[:word] }
179
27
  end
180
28
 
181
- # Hook to convert
182
- def after_convert_hook
29
+ def to_pdf
30
+ Pdf.run(@source, config)
183
31
  end
184
32
 
185
- # Convert image to string
186
- def convert
187
- convert_command
188
- after_convert_hook
189
- convert_result
190
- rescue => error
191
- raise RTesseract::ConversionError.new(error), error, caller
33
+ def to_tsv
34
+ Tsv.run(@source, config)
192
35
  end
193
36
 
194
37
  # Output value
195
38
  def to_s
196
- return @value if @value
197
-
198
- if @processor.image?(@source) || @source.file?
199
- convert
200
- @value
201
- else
202
- fail RTesseract::ImageNotSelectedError.new(@source)
203
- end
39
+ Text.run(@source, config)
204
40
  end
205
41
 
206
42
  # Remove spaces and break-lines
207
43
  def to_s_without_spaces
208
44
  to_s.gsub(/\s/, '')
209
45
  end
210
-
211
- # Output pdf path
212
- def to_pdf
213
- return @pdf_path if @pdf_path
214
-
215
- fail TesseractVersionError.new if tesseract_version.nil? || tesseract_version < 3.03
216
-
217
- if @processor.image?(@source) || @source.file?
218
- options_cmd << 'pdf'
219
- convert
220
- options_cmd.delete('pdf')
221
- @pdf_path
222
- else
223
- fail RTesseract::ImageNotSelectedError.new(@source)
224
- end
225
- end
226
-
227
- # Destroy pdf file
228
- def clean
229
- RTesseract::Utils.remove_files([@pdf_path])
230
- end
231
-
232
46
  end
233
-
234
- require 'rtesseract/mixed'
235
- require 'rtesseract/uzn'
236
- require 'rtesseract/box'
237
- require 'rtesseract/box_char'
238
- require 'rtesseract/blob'
239
- require 'rtesseract/processor'
240
-
241
- # Processors
242
- require 'processors/rmagick.rb'
243
- require 'processors/mini_magick.rb'
244
- require 'processors/none.rb'
data/lib/rtesseract/box.rb CHANGED
@@ -1,73 +1,28 @@
1
- # encoding: UTF-8
2
1
  require 'nokogiri'
3
- require 'fileutils'
2
+ require 'tmpdir'
4
3
 
5
- # RTesseract
6
4
  class RTesseract
7
- # Class to read char positions from an image
8
- class Box < RTesseract
9
- # Setting value as blank array
10
- def initialize_hook
11
- @value = []
5
+ module Box
6
+ def self.temp_dir
7
+ @file_path = Pathname.new(Dir.tmpdir)
12
8
  end
13
9
 
14
- # Aditional options to config file
15
- def config_hook
16
- @options['tessedit_create_hocr'] = 1 # Split Words configuration
17
- end
18
-
19
- # Words converted
20
- def words
21
- convert if @value == []
22
- @value
23
- end
24
-
25
- # Extension of file
26
- def file_ext
27
- '.hocr'
28
- end
29
-
30
- # Read the result file
31
- def parse_file
32
- html = Nokogiri::HTML(File.read(file_with_ext))
33
- html.css('span.ocrx_word, span.ocr_word')
34
- end
35
-
36
- # Return words to value
37
- def convert_text
38
- text_objects = []
39
- parse_file.each { |word| text_objects << BoxParser.new(word).to_h }
40
- @value = text_objects
41
- end
10
+ def self.run(source, options)
11
+ name = "rtesseract_#{SecureRandom.uuid}"
12
+ options.tessedit_create_hocr = 1
42
13
 
43
- # Move file html to hocr
44
- def after_convert_hook
45
- FileUtils.mv(file_with_ext('.html'), file_with_ext) rescue nil
46
- end
14
+ RTesseract::Command.new(source, temp_dir.join(name).to_s, options).run
47
15
 
48
- # Output value
49
- def to_s
50
- return @value.map { |word| word[:word] } if @value != []
51
- if @processor.image?(@source) || @source.file?
52
- convert
53
- @value.map { |word| word[:word] }.join(' ')
54
- else
55
- fail RTesseract::ImageNotSelectedError.new(@source)
56
- end
16
+ parse(temp_dir.join("#{name}.hocr").read)
57
17
  end
58
18
 
59
- # Parse word data from html.
60
- class BoxParser
61
- def initialize(word_html)
62
- @word = word_html
63
- title = @word.attributes['title'].value.to_s
64
- @attributes = title.gsub(';', '').split(' ')
65
- end
19
+ def self.parse(content)
20
+ html = Nokogiri::HTML(content)
21
+ html.css('span.ocrx_word, span.ocr_word').map do |word|
22
+ @attributes = word.attributes['title'].value.to_s.gsub(';', '').split(' ')
66
23
 
67
- # Hash of word and position
68
- def to_h
69
24
  {
70
- word: @word.text,
25
+ word: word.text,
71
26
  x_start: @attributes[1].to_i,
72
27
  y_start: @attributes[2].to_i,
73
28
  x_end: @attributes[3].to_i,
@@ -76,4 +31,4 @@ class RTesseract
76
31
  end
77
32
  end
78
33
  end
79
- end
34
+ end
data/lib/rtesseract/check.rb ADDED
@@ -0,0 +1,14 @@
1
+
2
+ class RTesseract
3
+ class << self
4
+ def tesseract_version
5
+ Open3.capture2e(RTesseract.config.command, "--version").first.to_s.match(/\d+.\d+/)[0].to_f
6
+ rescue Errno::ENOENT
7
+ 0
8
+ end
9
+
10
+ def check_version!
11
+ raise RTesseract::Error.new('Tesseract OCR 3.5 or later not installed') if RTesseract.tesseract_version < 3.05
12
+ end
13
+ end
14
+ end