rtesseract 2.2.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.document +1 -2
- data/.gitignore +12 -0
- data/.rspec +2 -0
- data/.travis.yml +13 -10
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -17
- data/Gemfile.lock +40 -85
- data/LICENSE.txt +18 -17
- data/README.md +137 -0
- data/Rakefile +4 -48
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/rtesseract.rb +22 -220
- data/lib/rtesseract/box.rb +15 -60
- data/lib/rtesseract/check.rb +14 -0
- data/lib/rtesseract/command.rb +41 -0
- data/lib/rtesseract/configuration.rb +15 -64
- data/lib/rtesseract/pdf.rb +18 -0
- data/lib/rtesseract/text.rb +9 -0
- data/lib/rtesseract/tsv.rb +18 -0
- data/lib/rtesseract/version.rb +3 -0
- data/rtesseract.gemspec +27 -98
- metadata +36 -85
- data/README.rdoc +0 -156
- data/VERSION +0 -1
- data/lib/processors/mini_magick.rb +0 -43
- data/lib/processors/none.rb +0 -34
- data/lib/processors/rmagick.rb +0 -46
- data/lib/rtesseract/blob.rb +0 -34
- data/lib/rtesseract/box_char.rb +0 -31
- data/lib/rtesseract/errors.rb +0 -21
- data/lib/rtesseract/mixed.rb +0 -54
- data/lib/rtesseract/processor.rb +0 -19
- data/lib/rtesseract/utils.rb +0 -44
- data/lib/rtesseract/uzn.rb +0 -47
- data/spec/configs/eng.user-words.txt +0 -13
- data/spec/images/README.pdf +0 -0
- data/spec/images/blank.tif +0 -0
- data/spec/images/mixed.tif +0 -0
- data/spec/images/orientation_reverse.png +0 -0
- data/spec/images/test with spaces.tif +0 -0
- data/spec/images/test-pdf.png +0 -0
- data/spec/images/test.bmp +0 -0
- data/spec/images/test.jpg +0 -0
- data/spec/images/test.png +0 -0
- data/spec/images/test.tif +0 -0
- data/spec/images/test1.tif +0 -0
- data/spec/images/test_words.png +0 -0
- data/spec/rtesseract_box_char_spec.rb +0 -82
- data/spec/rtesseract_box_spec.rb +0 -36
- data/spec/rtesseract_mixed_spec.rb +0 -49
- data/spec/rtesseract_spec.rb +0 -282
- data/spec/rtesseract_uzn_spec.rb +0 -56
- data/spec/spec_helper.rb +0 -21
data/Rakefile
CHANGED
@@ -1,50 +1,6 @@
|
|
1
|
-
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
2
3
|
|
3
|
-
|
4
|
-
require 'bundler'
|
5
|
-
begin
|
6
|
-
Bundler.setup(:default, :development)
|
7
|
-
rescue Bundler::BundlerError => e
|
8
|
-
$stderr.puts e.message
|
9
|
-
$stderr.puts 'Run `bundle install` to install missing gems'
|
10
|
-
exit e.status_code
|
11
|
-
end
|
12
|
-
require 'rake'
|
4
|
+
RSpec::Core::RakeTask.new(:spec)
|
13
5
|
|
14
|
-
|
15
|
-
Jeweler::Tasks.new do |gem|
|
16
|
-
# gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
|
17
|
-
gem.name = 'rtesseract'
|
18
|
-
gem.homepage = 'http://github.com/dannnylo/rtesseract'
|
19
|
-
gem.license = 'MIT'
|
20
|
-
gem.summary = 'Ruby library for working with the Tesseract OCR.'
|
21
|
-
gem.description = 'Ruby library for working with the Tesseract OCR.'
|
22
|
-
gem.email = 'dannnylo@gmail.com'
|
23
|
-
gem.authors = ['Danilo Jeremias da Silva']
|
24
|
-
# dependencies defined in Gemfile
|
25
|
-
end
|
26
|
-
Jeweler::RubygemsDotOrgTasks.new
|
27
|
-
|
28
|
-
require 'rspec/core'
|
29
|
-
require 'rspec/core/rake_task'
|
30
|
-
RSpec::Core::RakeTask.new(:spec) do |spec|
|
31
|
-
spec.pattern = FileList['spec/**/*_spec.rb']
|
32
|
-
end
|
33
|
-
|
34
|
-
desc 'Code coverage detail'
|
35
|
-
task :simplecov do
|
36
|
-
ENV['COVERAGE'] = 'true'
|
37
|
-
Rake::Task['spec'].execute
|
38
|
-
end
|
39
|
-
|
40
|
-
task default: :spec
|
41
|
-
|
42
|
-
require 'rdoc/task'
|
43
|
-
Rake::RDocTask.new do |rdoc|
|
44
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ''
|
45
|
-
|
46
|
-
rdoc.rdoc_dir = 'rdoc'
|
47
|
-
rdoc.title = "rtesseract #{version}"
|
48
|
-
rdoc.rdoc_files.include('README*')
|
49
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
50
|
-
end
|
6
|
+
task :default => :spec
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "rtesseract"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/lib/rtesseract.rb
CHANGED
@@ -1,244 +1,46 @@
|
|
1
|
-
|
2
|
-
require
|
3
|
-
require
|
1
|
+
require "rtesseract/check"
|
2
|
+
require "rtesseract/configuration"
|
3
|
+
require "rtesseract/command"
|
4
|
+
require "rtesseract/text"
|
5
|
+
require "rtesseract/pdf"
|
6
|
+
require "rtesseract/box"
|
7
|
+
require "rtesseract/tsv"
|
4
8
|
|
5
|
-
require 'rtesseract/utils'
|
6
|
-
require 'rtesseract/configuration'
|
7
|
-
require 'rtesseract/errors'
|
8
|
-
|
9
|
-
# Ruby wrapper for Tesseract OCR
|
10
9
|
class RTesseract
|
11
|
-
|
12
|
-
attr_reader :processor
|
13
|
-
attr_reader :source
|
14
|
-
|
15
|
-
def initialize(src = '', options = {})
|
16
|
-
self.configuration = RTesseract.local_config(options)
|
17
|
-
@options = options || {}
|
18
|
-
@points = {}
|
19
|
-
@processor = RTesseract::Processor.choose_processor!(configuration.processor)
|
20
|
-
self.source = src
|
21
|
-
initialize_hook
|
22
|
-
end
|
23
|
-
|
24
|
-
# Hook to end of initialize method
|
25
|
-
def initialize_hook
|
26
|
-
end
|
27
|
-
|
28
|
-
# Define the source
|
29
|
-
def source=(src)
|
30
|
-
@value = nil
|
31
|
-
@pdf_path = nil
|
32
|
-
@source = @processor.image?(src) ? src : Pathname.new(src)
|
33
|
-
end
|
34
|
-
|
35
|
-
# Crop image to convert
|
36
|
-
def crop!(points = {})
|
37
|
-
@value = nil
|
38
|
-
@points = points
|
39
|
-
self
|
40
|
-
end
|
41
|
-
|
42
|
-
# Select the language
|
43
|
-
# ===Languages
|
44
|
-
## * eng - English
|
45
|
-
## * deu - German
|
46
|
-
## * deu-f - German fraktur
|
47
|
-
## * fra - French
|
48
|
-
## * ita - Italian
|
49
|
-
## * nld - Dutch
|
50
|
-
## * por - Portuguese
|
51
|
-
## * spa - Spanish
|
52
|
-
## * vie - Vietnamese
|
53
|
-
## Note: Make sure you have installed the language to tesseract
|
54
|
-
def lang
|
55
|
-
language = (configuration.lang || 'eng').to_s.strip.downcase
|
56
|
-
" -l #{LANGUAGES[language] || language} "
|
57
|
-
rescue
|
58
|
-
''
|
59
|
-
end
|
60
|
-
|
61
|
-
# Convert option to command
|
62
|
-
def option_to_string(prefix, value = nil)
|
63
|
-
(value.nil? ? '' : " #{prefix} #{value} ")
|
64
|
-
rescue
|
65
|
-
''
|
66
|
-
end
|
67
|
-
|
68
|
-
# Page Segment Mode
|
69
|
-
def psm
|
70
|
-
option_to_string('-psm', configuration.psm)
|
71
|
-
end
|
72
|
-
|
73
|
-
# Engine Mode
|
74
|
-
def oem
|
75
|
-
option_to_string '--oem', configuration.oem
|
76
|
-
end
|
77
|
-
|
78
|
-
# Tessdata Dir
|
79
|
-
def tessdata_dir
|
80
|
-
option_to_string('--tessdata-dir', configuration.tessdata_dir)
|
81
|
-
end
|
82
|
-
|
83
|
-
# User Words
|
84
|
-
def user_words
|
85
|
-
option_to_string('--user-words', configuration.user_words)
|
86
|
-
end
|
87
|
-
|
88
|
-
# User Patterns
|
89
|
-
def user_patterns
|
90
|
-
option_to_string('--user-patterns', configuration.user_patterns)
|
91
|
-
end
|
92
|
-
|
93
|
-
# Options on line
|
94
|
-
def options_cmd
|
95
|
-
configuration.options_cmd
|
96
|
-
end
|
97
|
-
|
98
|
-
# Hook to before config
|
99
|
-
def config_hook
|
100
|
-
end
|
101
|
-
|
102
|
-
# Convert configurations
|
103
|
-
def config
|
104
|
-
@options ||= {}
|
105
|
-
config_hook
|
106
|
-
@options.map { |k, v| "#{k} #{v}" }.join("\n")
|
107
|
-
end
|
10
|
+
class Error < StandardError; end
|
108
11
|
|
109
|
-
|
110
|
-
def config_file
|
111
|
-
config_hook
|
112
|
-
return '' if @options == {}
|
113
|
-
conf = Tempfile.new('config')
|
114
|
-
conf.write(config)
|
115
|
-
conf.flush
|
116
|
-
conf.path
|
117
|
-
end
|
118
|
-
|
119
|
-
# TODO: Clear console for MacOS or Windows
|
120
|
-
def clear_console_output
|
121
|
-
return '' if configuration.debug
|
122
|
-
return '2>/dev/null' if File.exist?('/dev/null') # Linux console clear
|
123
|
-
end
|
124
|
-
|
125
|
-
# Get image
|
126
|
-
def image
|
127
|
-
(@image = @processor.image_to_tif(@source, @points)).path
|
128
|
-
end
|
129
|
-
|
130
|
-
# Extension of file
|
131
|
-
def file_ext
|
132
|
-
'.txt'
|
133
|
-
end
|
134
|
-
|
135
|
-
# Detect version number
|
136
|
-
def tesseract_version
|
137
|
-
RTesseract::Utils.version_number
|
138
|
-
end
|
139
|
-
|
140
|
-
|
141
|
-
# Rand file path
|
142
|
-
def file_dest
|
143
|
-
@file_dest = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
|
144
|
-
end
|
145
|
-
|
146
|
-
# Full path of file with txt extension
|
147
|
-
def file_with_ext(ext = nil)
|
148
|
-
[@file_dest, ext || file_ext].join('')
|
149
|
-
end
|
150
|
-
|
151
|
-
# Run command
|
152
|
-
def convert_command
|
153
|
-
`#{configuration.command} "#{image}" "#{file_dest}" #{lang} #{oem} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}`
|
154
|
-
end
|
12
|
+
check_version!
|
155
13
|
|
156
|
-
|
157
|
-
def pdf?
|
158
|
-
options_cmd.include? 'pdf'
|
159
|
-
end
|
14
|
+
attr_reader :config, :source
|
160
15
|
|
161
|
-
|
162
|
-
|
163
|
-
@
|
16
|
+
def initialize(src = '', options = {})
|
17
|
+
@source = src
|
18
|
+
@config = RTesseract.config.merge(options)
|
164
19
|
end
|
165
20
|
|
166
|
-
|
167
|
-
|
168
|
-
@pdf_path = file_with_ext('.pdf')
|
21
|
+
def to_box
|
22
|
+
Box.run(@source, config)
|
169
23
|
end
|
170
24
|
|
171
|
-
|
172
|
-
|
173
|
-
if pdf?
|
174
|
-
convert_pdf
|
175
|
-
else
|
176
|
-
convert_text
|
177
|
-
RTesseract::Utils.remove_files([@image, file_with_ext])
|
178
|
-
end
|
25
|
+
def words
|
26
|
+
to_box.map { |word| word[:word] }
|
179
27
|
end
|
180
28
|
|
181
|
-
|
182
|
-
|
29
|
+
def to_pdf
|
30
|
+
Pdf.run(@source, config)
|
183
31
|
end
|
184
32
|
|
185
|
-
|
186
|
-
|
187
|
-
convert_command
|
188
|
-
after_convert_hook
|
189
|
-
convert_result
|
190
|
-
rescue => error
|
191
|
-
raise RTesseract::ConversionError.new(error), error, caller
|
33
|
+
def to_tsv
|
34
|
+
Tsv.run(@source, config)
|
192
35
|
end
|
193
36
|
|
194
37
|
# Output value
|
195
38
|
def to_s
|
196
|
-
|
197
|
-
|
198
|
-
if @processor.image?(@source) || @source.file?
|
199
|
-
convert
|
200
|
-
@value
|
201
|
-
else
|
202
|
-
fail RTesseract::ImageNotSelectedError.new(@source)
|
203
|
-
end
|
39
|
+
Text.run(@source, config)
|
204
40
|
end
|
205
41
|
|
206
42
|
# Remove spaces and break-lines
|
207
43
|
def to_s_without_spaces
|
208
44
|
to_s.gsub(/\s/, '')
|
209
45
|
end
|
210
|
-
|
211
|
-
# Output pdf path
|
212
|
-
def to_pdf
|
213
|
-
return @pdf_path if @pdf_path
|
214
|
-
|
215
|
-
fail TesseractVersionError.new if tesseract_version.nil? || tesseract_version < 3.03
|
216
|
-
|
217
|
-
if @processor.image?(@source) || @source.file?
|
218
|
-
options_cmd << 'pdf'
|
219
|
-
convert
|
220
|
-
options_cmd.delete('pdf')
|
221
|
-
@pdf_path
|
222
|
-
else
|
223
|
-
fail RTesseract::ImageNotSelectedError.new(@source)
|
224
|
-
end
|
225
|
-
end
|
226
|
-
|
227
|
-
# Destroy pdf file
|
228
|
-
def clean
|
229
|
-
RTesseract::Utils.remove_files([@pdf_path])
|
230
|
-
end
|
231
|
-
|
232
46
|
end
|
233
|
-
|
234
|
-
require 'rtesseract/mixed'
|
235
|
-
require 'rtesseract/uzn'
|
236
|
-
require 'rtesseract/box'
|
237
|
-
require 'rtesseract/box_char'
|
238
|
-
require 'rtesseract/blob'
|
239
|
-
require 'rtesseract/processor'
|
240
|
-
|
241
|
-
# Processors
|
242
|
-
require 'processors/rmagick.rb'
|
243
|
-
require 'processors/mini_magick.rb'
|
244
|
-
require 'processors/none.rb'
|
data/lib/rtesseract/box.rb
CHANGED
@@ -1,73 +1,28 @@
|
|
1
|
-
# encoding: UTF-8
|
2
1
|
require 'nokogiri'
|
3
|
-
require '
|
2
|
+
require 'tmpdir'
|
4
3
|
|
5
|
-
# RTesseract
|
6
4
|
class RTesseract
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
def initialize_hook
|
11
|
-
@value = []
|
5
|
+
module Box
|
6
|
+
def self.temp_dir
|
7
|
+
@file_path = Pathname.new(Dir.tmpdir)
|
12
8
|
end
|
13
9
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
# Words converted
|
20
|
-
def words
|
21
|
-
convert if @value == []
|
22
|
-
@value
|
23
|
-
end
|
24
|
-
|
25
|
-
# Extension of file
|
26
|
-
def file_ext
|
27
|
-
'.hocr'
|
28
|
-
end
|
29
|
-
|
30
|
-
# Read the result file
|
31
|
-
def parse_file
|
32
|
-
html = Nokogiri::HTML(File.read(file_with_ext))
|
33
|
-
html.css('span.ocrx_word, span.ocr_word')
|
34
|
-
end
|
35
|
-
|
36
|
-
# Return words to value
|
37
|
-
def convert_text
|
38
|
-
text_objects = []
|
39
|
-
parse_file.each { |word| text_objects << BoxParser.new(word).to_h }
|
40
|
-
@value = text_objects
|
41
|
-
end
|
10
|
+
def self.run(source, options)
|
11
|
+
name = "rtesseract_#{SecureRandom.uuid}"
|
12
|
+
options.tessedit_create_hocr = 1
|
42
13
|
|
43
|
-
|
44
|
-
def after_convert_hook
|
45
|
-
FileUtils.mv(file_with_ext('.html'), file_with_ext) rescue nil
|
46
|
-
end
|
14
|
+
RTesseract::Command.new(source, temp_dir.join(name).to_s, options).run
|
47
15
|
|
48
|
-
|
49
|
-
def to_s
|
50
|
-
return @value.map { |word| word[:word] } if @value != []
|
51
|
-
if @processor.image?(@source) || @source.file?
|
52
|
-
convert
|
53
|
-
@value.map { |word| word[:word] }.join(' ')
|
54
|
-
else
|
55
|
-
fail RTesseract::ImageNotSelectedError.new(@source)
|
56
|
-
end
|
16
|
+
parse(temp_dir.join("#{name}.hocr").read)
|
57
17
|
end
|
58
18
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
@
|
63
|
-
title = @word.attributes['title'].value.to_s
|
64
|
-
@attributes = title.gsub(';', '').split(' ')
|
65
|
-
end
|
19
|
+
def self.parse(content)
|
20
|
+
html = Nokogiri::HTML(content)
|
21
|
+
html.css('span.ocrx_word, span.ocr_word').map do |word|
|
22
|
+
@attributes = word.attributes['title'].value.to_s.gsub(';', '').split(' ')
|
66
23
|
|
67
|
-
# Hash of word and position
|
68
|
-
def to_h
|
69
24
|
{
|
70
|
-
word:
|
25
|
+
word: word.text,
|
71
26
|
x_start: @attributes[1].to_i,
|
72
27
|
y_start: @attributes[2].to_i,
|
73
28
|
x_end: @attributes[3].to_i,
|
@@ -76,4 +31,4 @@ class RTesseract
|
|
76
31
|
end
|
77
32
|
end
|
78
33
|
end
|
79
|
-
end
|
34
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
|
2
|
+
class RTesseract
|
3
|
+
class << self
|
4
|
+
def tesseract_version
|
5
|
+
Open3.capture2e(RTesseract.config.command, "--version").first.to_s.match(/\d+.\d+/)[0].to_f
|
6
|
+
rescue Errno::ENOENT
|
7
|
+
0
|
8
|
+
end
|
9
|
+
|
10
|
+
def check_version!
|
11
|
+
raise RTesseract::Error.new('Tesseract OCR 3.5 or later not installed') if RTesseract.tesseract_version < 3.05
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|