rtesseract 2.2.0 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +5 -5
  2. data/.document +1 -2
  3. data/.gitignore +12 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +13 -10
  6. data/CODE_OF_CONDUCT.md +74 -0
  7. data/Gemfile +4 -17
  8. data/Gemfile.lock +40 -85
  9. data/LICENSE.txt +18 -17
  10. data/README.md +137 -0
  11. data/Rakefile +4 -48
  12. data/bin/console +14 -0
  13. data/bin/setup +8 -0
  14. data/lib/rtesseract.rb +22 -220
  15. data/lib/rtesseract/box.rb +15 -60
  16. data/lib/rtesseract/check.rb +14 -0
  17. data/lib/rtesseract/command.rb +41 -0
  18. data/lib/rtesseract/configuration.rb +15 -64
  19. data/lib/rtesseract/pdf.rb +18 -0
  20. data/lib/rtesseract/text.rb +9 -0
  21. data/lib/rtesseract/tsv.rb +18 -0
  22. data/lib/rtesseract/version.rb +3 -0
  23. data/rtesseract.gemspec +27 -98
  24. metadata +36 -85
  25. data/README.rdoc +0 -156
  26. data/VERSION +0 -1
  27. data/lib/processors/mini_magick.rb +0 -43
  28. data/lib/processors/none.rb +0 -34
  29. data/lib/processors/rmagick.rb +0 -46
  30. data/lib/rtesseract/blob.rb +0 -34
  31. data/lib/rtesseract/box_char.rb +0 -31
  32. data/lib/rtesseract/errors.rb +0 -21
  33. data/lib/rtesseract/mixed.rb +0 -54
  34. data/lib/rtesseract/processor.rb +0 -19
  35. data/lib/rtesseract/utils.rb +0 -44
  36. data/lib/rtesseract/uzn.rb +0 -47
  37. data/spec/configs/eng.user-words.txt +0 -13
  38. data/spec/images/README.pdf +0 -0
  39. data/spec/images/blank.tif +0 -0
  40. data/spec/images/mixed.tif +0 -0
  41. data/spec/images/orientation_reverse.png +0 -0
  42. data/spec/images/test with spaces.tif +0 -0
  43. data/spec/images/test-pdf.png +0 -0
  44. data/spec/images/test.bmp +0 -0
  45. data/spec/images/test.jpg +0 -0
  46. data/spec/images/test.png +0 -0
  47. data/spec/images/test.tif +0 -0
  48. data/spec/images/test1.tif +0 -0
  49. data/spec/images/test_words.png +0 -0
  50. data/spec/rtesseract_box_char_spec.rb +0 -82
  51. data/spec/rtesseract_box_spec.rb +0 -36
  52. data/spec/rtesseract_mixed_spec.rb +0 -49
  53. data/spec/rtesseract_spec.rb +0 -282
  54. data/spec/rtesseract_uzn_spec.rb +0 -56
  55. data/spec/spec_helper.rb +0 -21
data/Rakefile CHANGED
@@ -1,50 +1,6 @@
1
- # encoding: utf-8
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
2
3
 
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts 'Run `bundle install` to install missing gems'
10
- exit e.status_code
11
- end
12
- require 'rake'
4
+ RSpec::Core::RakeTask.new(:spec)
13
5
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
17
- gem.name = 'rtesseract'
18
- gem.homepage = 'http://github.com/dannnylo/rtesseract'
19
- gem.license = 'MIT'
20
- gem.summary = 'Ruby library for working with the Tesseract OCR.'
21
- gem.description = 'Ruby library for working with the Tesseract OCR.'
22
- gem.email = 'dannnylo@gmail.com'
23
- gem.authors = ['Danilo Jeremias da Silva']
24
- # dependencies defined in Gemfile
25
- end
26
- Jeweler::RubygemsDotOrgTasks.new
27
-
28
- require 'rspec/core'
29
- require 'rspec/core/rake_task'
30
- RSpec::Core::RakeTask.new(:spec) do |spec|
31
- spec.pattern = FileList['spec/**/*_spec.rb']
32
- end
33
-
34
- desc 'Code coverage detail'
35
- task :simplecov do
36
- ENV['COVERAGE'] = 'true'
37
- Rake::Task['spec'].execute
38
- end
39
-
40
- task default: :spec
41
-
42
- require 'rdoc/task'
43
- Rake::RDocTask.new do |rdoc|
44
- version = File.exist?('VERSION') ? File.read('VERSION') : ''
45
-
46
- rdoc.rdoc_dir = 'rdoc'
47
- rdoc.title = "rtesseract #{version}"
48
- rdoc.rdoc_files.include('README*')
49
- rdoc.rdoc_files.include('lib/**/*.rb')
50
- end
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "rtesseract"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/lib/rtesseract.rb CHANGED
@@ -1,244 +1,46 @@
1
- # encoding: UTF-8
2
- require 'pathname'
3
- require 'tempfile'
1
+ require "rtesseract/check"
2
+ require "rtesseract/configuration"
3
+ require "rtesseract/command"
4
+ require "rtesseract/text"
5
+ require "rtesseract/pdf"
6
+ require "rtesseract/box"
7
+ require "rtesseract/tsv"
4
8
 
5
- require 'rtesseract/utils'
6
- require 'rtesseract/configuration'
7
- require 'rtesseract/errors'
8
-
9
- # Ruby wrapper for Tesseract OCR
10
9
  class RTesseract
11
- attr_accessor :configuration
12
- attr_reader :processor
13
- attr_reader :source
14
-
15
- def initialize(src = '', options = {})
16
- self.configuration = RTesseract.local_config(options)
17
- @options = options || {}
18
- @points = {}
19
- @processor = RTesseract::Processor.choose_processor!(configuration.processor)
20
- self.source = src
21
- initialize_hook
22
- end
23
-
24
- # Hook to end of initialize method
25
- def initialize_hook
26
- end
27
-
28
- # Define the source
29
- def source=(src)
30
- @value = nil
31
- @pdf_path = nil
32
- @source = @processor.image?(src) ? src : Pathname.new(src)
33
- end
34
-
35
- # Crop image to convert
36
- def crop!(points = {})
37
- @value = nil
38
- @points = points
39
- self
40
- end
41
-
42
- # Select the language
43
- # ===Languages
44
- ## * eng - English
45
- ## * deu - German
46
- ## * deu-f - German fraktur
47
- ## * fra - French
48
- ## * ita - Italian
49
- ## * nld - Dutch
50
- ## * por - Portuguese
51
- ## * spa - Spanish
52
- ## * vie - Vietnamese
53
- ## Note: Make sure you have installed the language to tesseract
54
- def lang
55
- language = (configuration.lang || 'eng').to_s.strip.downcase
56
- " -l #{LANGUAGES[language] || language} "
57
- rescue
58
- ''
59
- end
60
-
61
- # Convert option to command
62
- def option_to_string(prefix, value = nil)
63
- (value.nil? ? '' : " #{prefix} #{value} ")
64
- rescue
65
- ''
66
- end
67
-
68
- # Page Segment Mode
69
- def psm
70
- option_to_string('-psm', configuration.psm)
71
- end
72
-
73
- # Engine Mode
74
- def oem
75
- option_to_string '--oem', configuration.oem
76
- end
77
-
78
- # Tessdata Dir
79
- def tessdata_dir
80
- option_to_string('--tessdata-dir', configuration.tessdata_dir)
81
- end
82
-
83
- # User Words
84
- def user_words
85
- option_to_string('--user-words', configuration.user_words)
86
- end
87
-
88
- # User Patterns
89
- def user_patterns
90
- option_to_string('--user-patterns', configuration.user_patterns)
91
- end
92
-
93
- # Options on line
94
- def options_cmd
95
- configuration.options_cmd
96
- end
97
-
98
- # Hook to before config
99
- def config_hook
100
- end
101
-
102
- # Convert configurations
103
- def config
104
- @options ||= {}
105
- config_hook
106
- @options.map { |k, v| "#{k} #{v}" }.join("\n")
107
- end
10
+ class Error < StandardError; end
108
11
 
109
- # Write config to file
110
- def config_file
111
- config_hook
112
- return '' if @options == {}
113
- conf = Tempfile.new('config')
114
- conf.write(config)
115
- conf.flush
116
- conf.path
117
- end
118
-
119
- # TODO: Clear console for MacOS or Windows
120
- def clear_console_output
121
- return '' if configuration.debug
122
- return '2>/dev/null' if File.exist?('/dev/null') # Linux console clear
123
- end
124
-
125
- # Get image
126
- def image
127
- (@image = @processor.image_to_tif(@source, @points)).path
128
- end
129
-
130
- # Extension of file
131
- def file_ext
132
- '.txt'
133
- end
134
-
135
- # Detect version number
136
- def tesseract_version
137
- RTesseract::Utils.version_number
138
- end
139
-
140
-
141
- # Rand file path
142
- def file_dest
143
- @file_dest = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
144
- end
145
-
146
- # Full path of file with txt extension
147
- def file_with_ext(ext = nil)
148
- [@file_dest, ext || file_ext].join('')
149
- end
150
-
151
- # Run command
152
- def convert_command
153
- `#{configuration.command} "#{image}" "#{file_dest}" #{lang} #{oem} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}`
154
- end
12
+ check_version!
155
13
 
156
- # Is pdf output?
157
- def pdf?
158
- options_cmd.include? 'pdf'
159
- end
14
+ attr_reader :config, :source
160
15
 
161
- # Read result file
162
- def convert_text
163
- @value = File.read(file_with_ext).to_s
16
+ def initialize(src = '', options = {})
17
+ @source = src
18
+ @config = RTesseract.config.merge(options)
164
19
  end
165
20
 
166
- # Store pdf result path
167
- def convert_pdf
168
- @pdf_path = file_with_ext('.pdf')
21
+ def to_box
22
+ Box.run(@source, config)
169
23
  end
170
24
 
171
- # Convert result to proper type
172
- def convert_result
173
- if pdf?
174
- convert_pdf
175
- else
176
- convert_text
177
- RTesseract::Utils.remove_files([@image, file_with_ext])
178
- end
25
+ def words
26
+ to_box.map { |word| word[:word] }
179
27
  end
180
28
 
181
- # Hook to convert
182
- def after_convert_hook
29
+ def to_pdf
30
+ Pdf.run(@source, config)
183
31
  end
184
32
 
185
- # Convert image to string
186
- def convert
187
- convert_command
188
- after_convert_hook
189
- convert_result
190
- rescue => error
191
- raise RTesseract::ConversionError.new(error), error, caller
33
+ def to_tsv
34
+ Tsv.run(@source, config)
192
35
  end
193
36
 
194
37
  # Output value
195
38
  def to_s
196
- return @value if @value
197
-
198
- if @processor.image?(@source) || @source.file?
199
- convert
200
- @value
201
- else
202
- fail RTesseract::ImageNotSelectedError.new(@source)
203
- end
39
+ Text.run(@source, config)
204
40
  end
205
41
 
206
42
  # Remove spaces and break-lines
207
43
  def to_s_without_spaces
208
44
  to_s.gsub(/\s/, '')
209
45
  end
210
-
211
- # Output pdf path
212
- def to_pdf
213
- return @pdf_path if @pdf_path
214
-
215
- fail TesseractVersionError.new if tesseract_version.nil? || tesseract_version < 3.03
216
-
217
- if @processor.image?(@source) || @source.file?
218
- options_cmd << 'pdf'
219
- convert
220
- options_cmd.delete('pdf')
221
- @pdf_path
222
- else
223
- fail RTesseract::ImageNotSelectedError.new(@source)
224
- end
225
- end
226
-
227
- # Destroy pdf file
228
- def clean
229
- RTesseract::Utils.remove_files([@pdf_path])
230
- end
231
-
232
46
  end
233
-
234
- require 'rtesseract/mixed'
235
- require 'rtesseract/uzn'
236
- require 'rtesseract/box'
237
- require 'rtesseract/box_char'
238
- require 'rtesseract/blob'
239
- require 'rtesseract/processor'
240
-
241
- # Processors
242
- require 'processors/rmagick.rb'
243
- require 'processors/mini_magick.rb'
244
- require 'processors/none.rb'
data/lib/rtesseract/box.rb CHANGED
@@ -1,73 +1,28 @@
1
- # encoding: UTF-8
2
1
  require 'nokogiri'
3
- require 'fileutils'
2
+ require 'tmpdir'
4
3
 
5
- # RTesseract
6
4
  class RTesseract
7
- # Class to read char positions from an image
8
- class Box < RTesseract
9
- # Setting value as blank array
10
- def initialize_hook
11
- @value = []
5
+ module Box
6
+ def self.temp_dir
7
+ @file_path = Pathname.new(Dir.tmpdir)
12
8
  end
13
9
 
14
- # Aditional options to config file
15
- def config_hook
16
- @options['tessedit_create_hocr'] = 1 # Split Words configuration
17
- end
18
-
19
- # Words converted
20
- def words
21
- convert if @value == []
22
- @value
23
- end
24
-
25
- # Extension of file
26
- def file_ext
27
- '.hocr'
28
- end
29
-
30
- # Read the result file
31
- def parse_file
32
- html = Nokogiri::HTML(File.read(file_with_ext))
33
- html.css('span.ocrx_word, span.ocr_word')
34
- end
35
-
36
- # Return words to value
37
- def convert_text
38
- text_objects = []
39
- parse_file.each { |word| text_objects << BoxParser.new(word).to_h }
40
- @value = text_objects
41
- end
10
+ def self.run(source, options)
11
+ name = "rtesseract_#{SecureRandom.uuid}"
12
+ options.tessedit_create_hocr = 1
42
13
 
43
- # Move file html to hocr
44
- def after_convert_hook
45
- FileUtils.mv(file_with_ext('.html'), file_with_ext) rescue nil
46
- end
14
+ RTesseract::Command.new(source, temp_dir.join(name).to_s, options).run
47
15
 
48
- # Output value
49
- def to_s
50
- return @value.map { |word| word[:word] } if @value != []
51
- if @processor.image?(@source) || @source.file?
52
- convert
53
- @value.map { |word| word[:word] }.join(' ')
54
- else
55
- fail RTesseract::ImageNotSelectedError.new(@source)
56
- end
16
+ parse(temp_dir.join("#{name}.hocr").read)
57
17
  end
58
18
 
59
- # Parse word data from html.
60
- class BoxParser
61
- def initialize(word_html)
62
- @word = word_html
63
- title = @word.attributes['title'].value.to_s
64
- @attributes = title.gsub(';', '').split(' ')
65
- end
19
+ def self.parse(content)
20
+ html = Nokogiri::HTML(content)
21
+ html.css('span.ocrx_word, span.ocr_word').map do |word|
22
+ @attributes = word.attributes['title'].value.to_s.gsub(';', '').split(' ')
66
23
 
67
- # Hash of word and position
68
- def to_h
69
24
  {
70
- word: @word.text,
25
+ word: word.text,
71
26
  x_start: @attributes[1].to_i,
72
27
  y_start: @attributes[2].to_i,
73
28
  x_end: @attributes[3].to_i,
@@ -76,4 +31,4 @@ class RTesseract
76
31
  end
77
32
  end
78
33
  end
79
- end
34
+ end
data/lib/rtesseract/check.rb ADDED
@@ -0,0 +1,14 @@
1
+
2
+ class RTesseract
3
+ class << self
4
+ def tesseract_version
5
+ Open3.capture2e(RTesseract.config.command, "--version").first.to_s.match(/\d+.\d+/)[0].to_f
6
+ rescue Errno::ENOENT
7
+ 0
8
+ end
9
+
10
+ def check_version!
11
+ raise RTesseract::Error.new('Tesseract OCR 3.5 or later not installed') if RTesseract.tesseract_version < 3.05
12
+ end
13
+ end
14
+ end