anjlab-tesseract 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/tesseract.rb +11 -0
- data/lib/tesseract/dependency_checker.rb +40 -0
- data/lib/tesseract/file_handler.rb +17 -0
- data/lib/tesseract/process.rb +127 -0
- data/lib/tesseract/version.rb +3 -0
- metadata +62 -0
data/lib/tesseract.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Entry point of the gem: loads the three component files that live in the
# "tesseract" directory next to this file, then the standard libraries.
component_dir = File.expand_path('tesseract', File.dirname(__FILE__))
%w[dependency_checker file_handler process].each do |component|
  require File.join(component_dir, component)
end
require 'pathname'
require 'digest/md5'
require 'shellwords'

# Namespace shared by the component files required above.
module Tesseract

end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'rbconfig'

module Tesseract
  # Verifies that the host can run the gem: a supported operating system and
  # the "tesseract" and "convert" executables reachable through `which`.
  class DependencyChecker
    # The messages are constants so that tests can compare against them.
    IMAGE_MAGICK_ERROR = "ImageMagick \"convert\" command not found! Make sure ImageMagick is installed and in the system path"
    TESSERACT_ERROR = "\"tesseract\" command not found! Make sure tesseract is installed and in the system path"
    OS_ERROR = "Only Unix Based enviroments are supported Mac, Linux, etc."

    # Platform-name patterns that are accepted by check_os!.
    SUPPORTED_PLATFORMS = [/darwin/, /linux/, /unix/].freeze

    # Runs every check in turn.
    #
    # Returns true when all checks pass.
    # Raises Exception carrying OS_ERROR, TESSERACT_ERROR or
    # IMAGE_MAGICK_ERROR for the first check that fails.
    def self.check!
      check_os!
      check_for_tesseract!
      check_for_imagemagick!
      true
    end

    # The helpers below stay publicly callable. A bare `private` keyword has no
    # effect on `def self.` methods, so none is used here.

    # Executes +cmd+ with backticks and returns whatever it wrote to stdout.
    # Kept as a separate method so that it is easy to mock.
    def self.run_cmd(cmd)
      `#{cmd}`
    end

    # Returns true when the platform name matches SUPPORTED_PLATFORMS.
    #
    # Both RUBY_PLATFORM and RbConfig's host_os are inspected: JRuby reports
    # "java" as RUBY_PLATFORM, so the host operating system is only visible
    # through host_os there.
    #
    # Raises Exception with OS_ERROR otherwise.
    # NOTE(review): raising Exception (rather than a StandardError subclass)
    # is kept so that existing callers rescuing Exception keep working.
    def self.check_os!
      platform_names = [::RUBY_PLATFORM, ::RbConfig::CONFIG['host_os']].compact
      supported = platform_names.any? do |name|
        SUPPORTED_PLATFORMS.any? { |pattern| name =~ pattern }
      end
      return true if supported
      raise Exception, OS_ERROR
    end

    # Raises Exception with IMAGE_MAGICK_ERROR when `which convert` prints
    # nothing (nil or whitespace-only output counts as nothing).
    def self.check_for_imagemagick!
      raise Exception, IMAGE_MAGICK_ERROR if run_cmd('which convert').to_s.strip.empty?
    end

    # Raises Exception with TESSERACT_ERROR when `which tesseract` prints
    # nothing (nil or whitespace-only output counts as nothing).
    def self.check_for_tesseract!
      raise Exception, TESSERACT_ERROR if run_cmd('which tesseract').to_s.strip.empty?
    end

  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
require 'tempfile'
|
3
|
+
module Tesseract
  # Class-level registry of temporary file paths. Paths are handed out by
  # create_temp_file and the files are removed together by cleanup!.
  class FileHandler
    # Every path handed out since the last cleanup!.
    @tempfiles = []

    # Builds a path for +filename+ inside the system temp directory and
    # remembers it for cleanup!. The file itself is not created here.
    #
    # Returns the path as a Pathname.
    def self.create_temp_file(filename)
      file = Pathname.new(Dir.tmpdir).join(filename)
      @tempfiles << file
      file
    end

    # Deletes every remembered file that still exists, then empties the
    # registry so that it does not grow for the lifetime of the process.
    def self.cleanup!
      @tempfiles.each do |file|
        begin
          # File.exist? is used because File.exists? was removed in Ruby 3.2.
          File.unlink(file.to_s) if File.exist?(file.to_s)
        rescue Errno::ENOENT
          # The file vanished between the check and the unlink; nothing to do.
        end
      end
      @tempfiles.clear
    end
  end
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'shellwords'
require 'pathname'
require 'digest/md5'

module Tesseract
  # Turns one image into text: the image is converted to a TIFF with the
  # convert command, the TIFF is passed to the tesseract command, and the
  # text file tesseract writes is read back.
  class Process

    attr_reader :image

    CONVERT_COMMAND = 'convert'
    TESSERACT_COMMAND = 'tesseract'

    # Option keys whose values are merged with the defaults entry by entry
    # instead of being replaced wholesale.
    NESTED_OPTION_KEYS = [:tesseract_options, :convert_options].freeze

    # Longest time, in seconds, to wait for tesseract's output file.
    OUTPUT_WAIT_SECONDS = 30

    # Initialize a Tesseract translation process
    # image_name is the file to translate
    # options can be of the following:
    # * tesseract_options  Hash of options for tesseract
    # * convert_options    Hash with :input and :output Arrays of options for convert
    # * lang               Image input language (eng, fra, etc. )
    # * convert_command    Convert binary name/path
    # * tesseract_command  Tesseract binary name/path
    # * check_deps         Boolean value. If true, verifies dependencies. Defaults to false
    def initialize(image_name, options = {})
      defaults = {
        :tesseract_options => {},
        :convert_options => {:input => [], :output => []},
        :lang => :eng,
        :convert_command => CONVERT_COMMAND,
        :tesseract_command => TESSERACT_COMMAND,
        :check_deps => false
      }
      @out = nil
      @image = Pathname.new(image_name)
      # The hash names the temp files. The pid and a random number are mixed
      # in so that two instances created for the same image within the same
      # second do not end up sharing temp files.
      @hash = Digest::MD5.hexdigest("#{@image}-#{Time.now.to_f}-#{$$}-#{rand}")

      merge_options! defaults, options
      DependencyChecker.check! if @options[:check_deps]
    end

    # Combines +options+ with +defaults+ and stores the result in @options.
    #
    # * :tesseract_options is merged key by key over the default Hash.
    # * :convert_options keeps only the keys present in the defaults
    #   (:input and :output); the caller's arguments are appended to the
    #   default ones in order, so repeated values are preserved.
    # * every other key simply overrides its default.
    #
    # Neither +defaults+ nor +options+ is modified.
    # Returns the merged Hash.
    def merge_options!(defaults, options)
      tesseract_opts = defaults[:tesseract_options].merge(options[:tesseract_options] || {})

      requested_convert = options[:convert_options] || {}
      convert_opts = {}
      defaults[:convert_options].each do |key, default_args|
        convert_opts[key] = default_args + Array(requested_convert[key])
      end

      plain = options.reject { |key, _| NESTED_OPTION_KEYS.include?(key) }
      @options = defaults.merge(plain).merge(
        :tesseract_options => tesseract_opts,
        :convert_options => convert_opts
      )
    end

    # Sets the language passed to tesseract with -l.
    def lang=(lang)
      @options[:lang] = lang
    end

    # Returns the language passed to tesseract with -l.
    def lang
      @options[:lang]
    end

    # Returns the recognised text; the image is processed on the first call
    # and the result is reused afterwards.
    def to_s
      @out ||= process!
    end

    # Process the image into text.
    #
    # The temp files are cleaned up even when the conversion or the
    # translation raises. A "/" at the start of any line is removed from the
    # returned text.
    def process!
      begin
        temp_image = to_tiff
        text = tesseract_translation(temp_image)
      ensure
        FileHandler.cleanup!
      end
      text.gsub(/^\//, '')
    end

    # Generates the convert command.
    #
    # The order is: command, input options, source image, output options,
    # target file. The two file names are shell-escaped; the options are
    # inserted as given.
    def generate_convert_command(temp_file)
      cmd = [@options[:convert_command]]
      input_opt = @options[:convert_options][:input]
      output_opt = @options[:convert_options][:output]

      cmd += input_opt unless input_opt.empty?
      cmd << Shellwords.shellescape(@image.to_s)
      cmd += output_opt unless output_opt.empty?
      cmd << Shellwords.shellescape(temp_file.to_s)
      cmd.join(" ")
    end

    # Generates the tesseract command.
    #
    # image_file  is the TIFF to read.
    # output_base is the output path without ".txt".
    # config_file is a config path, or '' / nil when there is none.
    #
    # Output is discarded with "> /dev/null 2>&1". The bash-only "&>" form
    # must not be used: Kernel#system runs the line through /bin/sh, and a
    # POSIX sh reads "cmd &> file" as "cmd &" followed by "> file", which
    # starts tesseract in the background.
    def generate_tesseract_command(image_file, output_base, config_file)
      cmd = [
        @options[:tesseract_command],
        Shellwords.shellescape(image_file.to_s),
        Shellwords.shellescape(output_base.to_s),
        '-l',
        Shellwords.shellescape(@options[:lang].to_s)
      ]
      cmd << Shellwords.shellescape(config_file.to_s) unless config_file.to_s.empty?
      cmd << '> /dev/null 2>&1'
      cmd.join(' ')
    end

    # Converts the source image to a tiff file.
    #
    # Returns the path of the TIFF.
    # Raises RuntimeError when the command cannot be started, or when it
    # fails without leaving an output file behind.
    def to_tiff
      temp_file = FileHandler.create_temp_file("#{@hash}.tif")
      executed = system generate_convert_command(temp_file)
      failed = executed.nil? || (executed == false && !File.exist?(temp_file.to_s))
      raise RuntimeError, "`#{@options[:convert_command]}` could not be executed." if failed
      temp_file
    end

    # Translate a tiff file into text
    #
    # Returns the content of the text file written by tesseract; the file is
    # removed afterwards.
    # Raises RuntimeError when the command fails or when no output file
    # shows up within OUTPUT_WAIT_SECONDS.
    def tesseract_translation(image_file)
      temp_text_file = FileHandler.create_temp_file(@hash.to_s)
      config_file = write_configs
      txt_file = "#{temp_text_file}.txt"
      executed = system generate_tesseract_command(image_file, temp_text_file, config_file)
      raise RuntimeError, "`#{@options[:tesseract_command]}` could not be executed." unless executed
      wait_for_output(txt_file)
      begin
        File.read(txt_file)
      ensure
        File.unlink(txt_file) if File.exist?(txt_file)
      end
    end

    # Waits until +path+ exists, polling for at most +timeout+ seconds.
    # The wait was introduced for an "amazon cloud bug"; it is bounded so
    # that a missing output file raises instead of blocking forever.
    def wait_for_output(path, timeout = OUTPUT_WAIT_SECONDS)
      deadline = Time.now + timeout
      until File.exist?(path)
        if Time.now > deadline
          raise RuntimeError, "`#{@options[:tesseract_command]}` did not produce #{path}."
        end
        sleep 0.1
      end
    end

    # Writes Tesseract configuration for the current source file
    #
    # Each tesseract option becomes one "key value" line.
    # Returns the config path, or '' when there are no tesseract options.
    def write_configs
      return '' if @options[:tesseract_options].empty?
      path = FileHandler.create_temp_file("#{@hash}.config")
      File.open(path.to_s, "w+") do |f|
        @options[:tesseract_options].each { |k,v| f << "#{k} #{v}\n" }
      end
      path
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: anjlab-tesseract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Scott Davis
|
9
|
+
- Martin Samson
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2012-04-18 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: bundler
|
17
|
+
requirement: &70257151089280 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.0.0
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *70257151089280
|
26
|
+
description: Ruby wrapper for google tesseract
|
27
|
+
email: jetviper21@gmail.com
|
28
|
+
executables: []
|
29
|
+
extensions: []
|
30
|
+
extra_rdoc_files: []
|
31
|
+
files:
|
32
|
+
- lib/tesseract.rb
|
33
|
+
- lib/tesseract/process.rb
|
34
|
+
- lib/tesseract/file_handler.rb
|
35
|
+
- lib/tesseract/dependency_checker.rb
|
36
|
+
- lib/tesseract/version.rb
|
37
|
+
homepage: http://github.com/scottdavis/ruby-tesseract
|
38
|
+
licenses: []
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options:
|
41
|
+
- --charset=UTF-8
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ! '>='
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 1.8.6
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: 1.3.6
|
56
|
+
requirements: []
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 1.8.15
|
59
|
+
signing_key:
|
60
|
+
specification_version: 3
|
61
|
+
summary: Ruby wrapper for google tesseract
|
62
|
+
test_files: []
|