anjlab-tesseract 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/tesseract.rb +11 -0
- data/lib/tesseract/dependency_checker.rb +40 -0
- data/lib/tesseract/file_handler.rb +17 -0
- data/lib/tesseract/process.rb +127 -0
- data/lib/tesseract/version.rb +3 -0
- metadata +62 -0
data/lib/tesseract.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Entry point of the gem: loads the library's own source files from the
# sibling "tesseract" directory, then the standard-library pieces they use.
lib_dir = File.join(File.dirname(__FILE__), 'tesseract')

%w[dependency_checker file_handler process].each do |component|
  require File.expand_path(File.join(lib_dir, component))
end

require 'pathname'
require 'digest/md5'
require 'shellwords'

# Namespace for the wrapper; its classes are defined in the files loaded above.
module Tesseract

end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Tesseract
  # Verifies that the platform and the external command-line tools the
  # wrapper shells out to (`tesseract` and ImageMagick's `convert`) are usable.
  class DependencyChecker
    # Raised when a dependency check fails. It derives from StandardError so a
    # plain `rescue` catches it; code that rescues Exception still catches it.
    class Error < StandardError; end

    # Messages are kept as constants so that it is easier to test against them.
    IMAGE_MAGICK_ERROR = "ImageMagick \"convert\" command not found! Make sure ImageMagick is installed and in the system path"
    TESSERACT_ERROR = "\"tesseract\" command not found! Make sure tesseract is installed and in the system path"
    OS_ERROR = "Only Unix Based enviroments are supported Mac, Linux, etc."

    # Runs every check in order: platform, tesseract, ImageMagick.
    #
    # Returns true when all checks pass.
    # Raises DependencyChecker::Error with one of the messages above otherwise.
    def self.check!
      check_os!
      check_for_tesseract!
      check_for_imagemagick!
      true
    end

    # NOTE: the methods below were preceded by a bare `private`, which has no
    # effect on `def self.` methods. They have always been publicly callable
    # and stay that way so existing callers and test stubs keep working.

    # Executes +cmd+ through the shell and returns its standard output.
    # Isolated in its own method for easy mocking.
    def self.run_cmd(cmd)
      `#{cmd}`
    end

    # Returns true when RUBY_PLATFORM mentions darwin, linux or unix.
    # Raises DependencyChecker::Error (OS_ERROR) on any other platform.
    def self.check_os!
      return true if ::RUBY_PLATFORM =~ /darwin|linux|unix/

      raise Error, OS_ERROR
    end

    # Raises DependencyChecker::Error (IMAGE_MAGICK_ERROR) when `which convert`
    # prints nothing.
    def self.check_for_imagemagick!
      raise Error, IMAGE_MAGICK_ERROR if run_cmd('which convert').empty?
    end

    # Raises DependencyChecker::Error (TESSERACT_ERROR) when `which tesseract`
    # prints nothing.
    def self.check_for_tesseract!
      raise Error, TESSERACT_ERROR if run_cmd('which tesseract').empty?
    end

  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
require 'tempfile'
|
3
|
+
module Tesseract
  # Keeps a class-wide list of scratch-file paths handed out under the system
  # temporary directory so that they can all be removed in one call.
  class FileHandler
    # Paths handed out by create_temp_file and not yet cleaned up.
    @tempfiles = []

    # Builds a path for +filename+ inside Dir.tmpdir and records it for later
    # cleanup. No file is created on disk here.
    #
    # Returns the path as a Pathname.
    def self.create_temp_file(filename)
      file = Pathname.new(Dir.tmpdir).join(filename)
      @tempfiles << file
      file
    end

    # Deletes every recorded path that exists on disk, then forgets them all so
    # the list does not grow without bound across calls.
    def self.cleanup!
      @tempfiles.each do |file|
        # File.exist? is used because File.exists? was removed in Ruby 3.2.
        File.unlink(file.to_s) if File.exist?(file.to_s)
      end
      @tempfiles.clear
    end
  end
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'shellwords'
|
2
|
+
module Tesseract
  # Converts an image to TIFF with ImageMagick's `convert`, runs the
  # `tesseract` command-line tool on it and exposes the resulting text.
  class Process

    attr_reader :image

    CONVERT_COMMAND = 'convert'
    TESSERACT_COMMAND = 'tesseract'

    # Option keys whose values are merged with the defaults key by key
    # instead of replacing them wholesale.
    NESTED_OPTION_KEYS = [:tesseract_options, :convert_options]

    # Longest time, in seconds, to wait for tesseract's output file to show up
    # after the command has returned, and how often to look for it.
    OUTPUT_WAIT_TIMEOUT = 30
    OUTPUT_POLL_INTERVAL = 0.1

    # Initialize a Tesseract translation process
    # image_name is the file to translate
    # options can be of the following:
    # * tesseract_options  Hash of options for tesseract (written to a config file)
    # * convert_options    Hash with :input and :output Arrays of options for convert
    # * lang               Image input language (eng, fra, etc. )
    # * convert_command    Convert binary name/path
    # * tesseract_command  Tesseract binary name/path
    # * check_deps         Boolean value. If true, verifies dependencies. Defaults to false
    def initialize(image_name, options = {})
      defaults = {
        :tesseract_options => {},
        :convert_options => {:input => [], :output => []},
        :lang => :eng,
        :convert_command => CONVERT_COMMAND,
        :tesseract_command => TESSERACT_COMMAND,
        :check_deps => false
      }
      @out = nil
      @image = Pathname.new(image_name)
      # The digest names this instance's temp files. Time.now alone only has
      # second resolution, so sub-second time, the pid and a random number are
      # mixed in to keep two instances for the same image from colliding.
      @hash = Digest::MD5.hexdigest("#{@image}-#{Time.now.to_f}-#{$$}-#{rand}")

      merge_options! defaults, options
      DependencyChecker.check! if @options[:check_deps]
    end

    # Combines +options+ with +defaults+ and stores the result in @options.
    #
    # :tesseract_options is merged key by key. For :convert_options, each list
    # named in the defaults (:input, :output) gets the caller's list appended;
    # other keys inside :convert_options are ignored. Every other key in
    # +options+ overrides the default. The caller's hash is left untouched.
    #
    # Returns the merged options Hash.
    def merge_options!(defaults, options)
      flat_overrides = options.reject { |key, _value| NESTED_OPTION_KEYS.include?(key) }
      merged = defaults.merge(flat_overrides)

      merged[:tesseract_options] =
        defaults[:tesseract_options].merge(options[:tesseract_options] || {})

      user_convert = options[:convert_options] || {}
      merged[:convert_options] = {}
      defaults[:convert_options].each do |key, default_args|
        # Concatenate rather than union (Array#|): a union drops repeated
        # tokens, which corrupts argument lists such as
        # ['-density', '300', '-depth', '300'].
        merged[:convert_options][key] = default_args + Array(user_convert[key])
      end

      @options = merged
    end

    # Sets the input language and discards any cached text so that the next
    # to_s call runs tesseract again with the new language.
    def lang=(lang)
      @out = nil
      @options[:lang] = lang
    end

    # Returns the currently configured input language.
    def lang
      @options[:lang]
    end

    # Returns the recognised text, running the conversion on first use and
    # reusing the cached result afterwards.
    def to_s
      @out ||= process!
    end

    # Process the image into text.
    #
    # Temp files registered with FileHandler are removed whether or not the
    # conversion or the recognition step raises. A leading "/" is stripped
    # from every line of the returned text.
    def process!
      begin
        temp_image = to_tiff
        text = tesseract_translation(temp_image)
      ensure
        FileHandler.cleanup!
      end
      text.gsub(/^\//, '')
    end

    # Generates the convert command.
    #
    # Layout: <convert_command> <input options> <image> <output options> <temp_file>.
    # The image and temp file paths are shell-escaped; the option lists are
    # passed through verbatim.
    def generate_convert_command(temp_file)
      input_opt = @options[:convert_options][:input]
      output_opt = @options[:convert_options][:output]

      cmd = [@options[:convert_command]]
      cmd += input_opt
      cmd << Shellwords.shellescape(@image.to_s)
      cmd += output_opt
      cmd << Shellwords.shellescape(temp_file.to_s)
      cmd.join(" ")
    end

    # Converts the source image to a tiff file.
    #
    # Returns the path of the tiff file.
    # Raises RuntimeError when the convert command could not be started. A
    # non-zero exit status is not treated as an error at this point.
    def to_tiff
      temp_file = FileHandler.create_temp_file("#{@hash}.tif")
      executed = system generate_convert_command(temp_file)
      raise RuntimeError, "`#{@options[:convert_command]}` could not be executed." if executed.nil?
      temp_file
    end

    # Translate a tiff file into text
    #
    # Returns the contents of the text file tesseract wrote.
    # Raises RuntimeError when tesseract cannot be started, exits with a
    # failure status, or its output file never appears.
    def tesseract_translation(image_file)
      temp_text_file = FileHandler.create_temp_file(@hash.to_s)
      config_file = write_configs.to_s
      txt_file = "#{temp_text_file}.txt"

      cmd = [
        @options[:tesseract_command],
        Shellwords.shellescape(image_file.to_s),
        Shellwords.shellescape(temp_text_file.to_s),
        "-l #{Shellwords.shellescape(@options[:lang].to_s)}"
      ]
      cmd << Shellwords.shellescape(config_file) unless config_file.empty?
      # Portable redirection. `&> /dev/null` is a bashism: a POSIX sh such as
      # dash reads it as "run in background, then truncate /dev/null", so
      # system would return before tesseract had written anything.
      cmd << "> /dev/null 2>&1"

      executed = system cmd.join(' ')
      raise RuntimeError, "`#{@options[:tesseract_command]}` could not be executed." unless executed

      wait_for_output(txt_file)
      begin
        File.read(txt_file)
      ensure
        # The .txt file is not tracked by FileHandler, so remove it here even
        # when reading fails.
        File.unlink(txt_file) if File.exist?(txt_file)
      end
    end

    # Writes Tesseract configuration for the current source file
    #
    # Returns '' when there are no tesseract options, otherwise the path of a
    # temp file holding one "key value" line per option.
    def write_configs
      return '' if @options[:tesseract_options].empty?
      path = FileHandler.create_temp_file("#{@hash}.config")
      File.open(path.to_s, "w+") do |f|
        @options[:tesseract_options].each { |k,v| f << "#{k} #{v}\n" }
      end
      path
    end

    private

    # Polls until +txt_file+ exists, giving up after OUTPUT_WAIT_TIMEOUT
    # seconds instead of looping forever.
    def wait_for_output(txt_file)
      waited = 0.0
      until File.exist?(txt_file)
        if waited >= OUTPUT_WAIT_TIMEOUT
          raise RuntimeError, "`#{@options[:tesseract_command]}` did not produce #{txt_file}."
        end
        sleep OUTPUT_POLL_INTERVAL
        waited += OUTPUT_POLL_INTERVAL
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: anjlab-tesseract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Scott Davis
|
9
|
+
- Martin Samson
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2012-04-18 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: bundler
|
17
|
+
requirement: &70257151089280 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.0.0
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *70257151089280
|
26
|
+
description: Ruby wrapper for google tesseract
|
27
|
+
email: jetviper21@gmail.com
|
28
|
+
executables: []
|
29
|
+
extensions: []
|
30
|
+
extra_rdoc_files: []
|
31
|
+
files:
|
32
|
+
- lib/tesseract.rb
|
33
|
+
- lib/tesseract/process.rb
|
34
|
+
- lib/tesseract/file_handler.rb
|
35
|
+
- lib/tesseract/dependency_checker.rb
|
36
|
+
- lib/tesseract/version.rb
|
37
|
+
homepage: http://github.com/scottdavis/ruby-tesseract
|
38
|
+
licenses: []
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options:
|
41
|
+
- --charset=UTF-8
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ! '>='
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 1.8.6
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: 1.3.6
|
56
|
+
requirements: []
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 1.8.15
|
59
|
+
signing_key:
|
60
|
+
specification_version: 3
|
61
|
+
summary: Ruby wrapper for google tesseract
|
62
|
+
test_files: []
|