anjlab-tesseract 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/tesseract.rb ADDED
@@ -0,0 +1,11 @@
1
# Entry point of the library: loads each component that lives in the
# sibling "tesseract" directory, then the standard libraries they rely on.
base_dir = File.join(File.dirname(__FILE__), 'tesseract')
%w[dependency_checker file_handler process].each do |component|
  require File.expand_path(File.join(base_dir, component))
end
require 'pathname'
require 'digest/md5'
require 'shellwords'

# Namespace for the Tesseract wrapper; the classes themselves are defined in
# the component files required above.
module Tesseract

end
@@ -0,0 +1,40 @@
1
module Tesseract
  # Verifies that the current platform is supported and that the external
  # `tesseract` and ImageMagick `convert` binaries can be found with `which`.
  class DependencyChecker
    # Raised when a check fails. It derives from StandardError so that a plain
    # `rescue` (and `rescue Exception`, as before) can handle it.
    class Error < StandardError; end

    # Kept as constants so tests can compare against the exact messages.
    IMAGE_MAGICK_ERROR = "ImageMagick \"convert\" command not found! Make sure ImageMagick is installed and in the system path"
    TESSERACT_ERROR = "\"tesseract\" command not found! Make sure tesseract is installed and in the system path"
    OS_ERROR = "Only Unix Based enviroments are supported Mac, Linux, etc."

    # Runs every check in order: platform, tesseract, ImageMagick.
    #
    # Returns true when all checks pass.
    # Raises Error carrying one of the message constants on the first failure.
    def self.check!
      check_os!
      check_for_tesseract!
      check_for_imagemagick!
      true
    end

    # The helpers below are public singleton methods. A bare `private` does not
    # apply to methods defined with `def self.`, so they were always callable
    # from outside; that is kept so existing callers and mocks keep working.

    # Runs a shell command and returns its standard output.
    # Isolated in its own method so it is easy to replace in tests.
    def self.run_cmd(cmd)
      `#{cmd}`
    end

    # Returns true on darwin, linux and unix platforms; raises Error otherwise.
    def self.check_os!
      return true if ::RUBY_PLATFORM =~ /darwin|linux|unix/
      raise Error, OS_ERROR
    end

    # Raises Error when `which convert` prints nothing.
    def self.check_for_imagemagick!
      raise Error, IMAGE_MAGICK_ERROR if run_cmd('which convert').empty?
    end

    # Raises Error when `which tesseract` prints nothing.
    def self.check_for_tesseract!
      raise Error, TESSERACT_ERROR if run_cmd('which tesseract').empty?
    end

  end
end
@@ -0,0 +1,17 @@
1
+ require 'pathname'
2
+ require 'tempfile'
3
module Tesseract
  # Hands out paths inside the system temp directory and remembers them so
  # they can all be deleted in one call.
  class FileHandler
    # Paths handed out since the last cleanup (class-level state).
    @tempfiles = []

    # Builds a path for +filename+ inside Dir.tmpdir and registers it for
    # later cleanup. The file itself is not created here.
    #
    # Returns the path as a Pathname.
    def self.create_temp_file(filename)
      file = Pathname.new(Dir.tmpdir).join(filename)
      @tempfiles << file
      file
    end

    # Deletes every registered path that exists on disk, then forgets all
    # registered paths so the list does not grow across repeated runs.
    def self.cleanup!
      @tempfiles.each do |file|
        # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
        File.unlink(file.to_s) if File.exist?(file.to_s)
      end
      @tempfiles.clear
    end
  end
end
@@ -0,0 +1,127 @@
1
+ require 'shellwords'
2
module Tesseract
  # Runs OCR on a single image: the image is converted to a TIFF with the
  # `convert` command, the TIFF is passed to the `tesseract` command, and the
  # text file tesseract writes is read back.
  class Process

    attr_reader :image

    CONVERT_COMMAND = 'convert'
    TESSERACT_COMMAND = 'tesseract'
    # Maximum number of one-second waits for tesseract's output file to appear
    # before giving up.
    OUTPUT_WAIT_ATTEMPTS = 30

    # Initialize a Tesseract translation process
    # image_name is the file to translate
    # options can be of the following:
    # * tesseract_options  Hash of options for tesseract (written to a config file)
    # * convert_options    Hash with :input and :output Arrays of options for convert
    # * lang               Image input language (eng, fra, etc. )
    # * convert_command    Convert binary name/path
    # * tesseract_command  Tesseract binary name/path
    # * check_deps         Boolean value. If true, verifies dependencies. Defaults to false
    def initialize(image_name, options = {})
      defaults = {
        :tesseract_options => {},
        :convert_options => {:input => [], :output => []},
        :lang => :eng,
        :convert_command => CONVERT_COMMAND,
        :tesseract_command => TESSERACT_COMMAND,
        :check_deps => false
      }
      @out = nil
      @image = Pathname.new(image_name)
      # Used as the base name of every temporary file of this process.
      @hash = Digest::MD5.hexdigest("#{@image}-#{Time.now}")

      merge_options! defaults, options
      DependencyChecker.check! if @options[:check_deps]
    end

    # Combines +options+ with +defaults+ and stores the result in @options.
    #
    # * :tesseract_options is merged key by key over the default hash.
    # * :convert_options keeps only the keys present in the defaults
    #   (:input, :output); the given values are appended in order, so repeated
    #   tokens such as "-level 10% -threshold 10%" are preserved.
    # * every other key simply overrides its default.
    #
    # Neither +options+ nor +defaults+ is modified.
    # Returns the merged options hash.
    def merge_options!(defaults, options)
      overrides = options.dup
      tesseract_overrides = overrides.delete(:tesseract_options) || {}
      convert_overrides = overrides.delete(:convert_options) || {}

      convert_options = {}
      defaults[:convert_options].each do |key, default_value|
        convert_options[key] = default_value + Array(convert_overrides[key])
      end

      @options = defaults.merge(overrides)
      @options[:tesseract_options] = defaults[:tesseract_options].merge(tesseract_overrides)
      @options[:convert_options] = convert_options
      @options
    end

    # Sets the input language and drops any cached text so the next #to_s
    # runs the OCR again with the new language.
    def lang=(lang)
      @out = nil
      @options[:lang] = lang
    end

    # Returns the input language.
    def lang
      @options[:lang]
    end

    # Returns the recognised text, running the OCR on first use and caching
    # the result.
    def to_s
      @out ||= process!
    end

    # Process the image into text.
    # Temporary files registered with FileHandler are removed whether the
    # conversion or the translation succeeds or raises.
    def process!
      begin
        text = tesseract_translation(to_tiff)
      ensure
        FileHandler.cleanup!
      end
      # Strips a "/" found at the start of any line of the output.
      text.gsub(/^\//, '')
    end

    # Generates the convert command:
    #   <convert> <input options> <image> <output options> <temp_file>
    # The image and temp file paths are shell-escaped; the option arrays are
    # inserted as given.
    def generate_convert_command(temp_file)
      cmd = [@options[:convert_command]]
      cmd.concat(@options[:convert_options][:input])
      cmd << Shellwords.shellescape(@image.to_s)
      cmd.concat(@options[:convert_options][:output])
      cmd << Shellwords.shellescape(temp_file.to_s)
      cmd.join(" ")
    end

    # Generates the tesseract command:
    #   <tesseract> <image_file> <output_base> -l <lang> [config_file]
    # with all output discarded. Paths and the language are shell-escaped.
    #
    # The redirection is spelled "> /dev/null 2>&1" because "&>" is a bash
    # extension: a POSIX sh reads "cmd &> file" as "cmd &" followed by
    # "> file", which would start tesseract in the background.
    def generate_tesseract_command(image_file, output_base, config_file)
      cmd = [
        @options[:tesseract_command],
        Shellwords.shellescape(image_file.to_s),
        Shellwords.shellescape(output_base.to_s),
        '-l',
        Shellwords.shellescape(@options[:lang].to_s)
      ]
      cmd << Shellwords.shellescape(config_file.to_s) unless config_file.to_s.empty?
      cmd << '> /dev/null 2>&1'
      cmd.join(' ')
    end

    # Converts the source image to a tiff file.
    # Returns the path of the tiff file.
    # Raises RuntimeError when the convert command could not be started.
    # NOTE(review): a non-zero exit status of convert is not treated as an
    # error here; confirm whether it should be.
    def to_tiff
      temp_file = FileHandler.create_temp_file("#{@hash}.tif")
      executed = system generate_convert_command(temp_file)
      raise RuntimeError, "`#{@options[:convert_command]}` could not be executed." if executed.nil?
      temp_file
    end

    # Translate a tiff file into text
    # Returns the content of the text file written by tesseract.
    # Raises RuntimeError when tesseract cannot be started, exits with an
    # error, or its output file does not appear in time.
    def tesseract_translation(image_file)
      temp_text_file = FileHandler.create_temp_file(@hash.to_s)
      config_file = write_configs
      # tesseract appends ".txt" to the output base name it is given.
      txt_file = "#{temp_text_file}.txt"
      executed = system generate_tesseract_command(image_file, temp_text_file, config_file)
      raise RuntimeError, "`#{@options[:tesseract_command]}` could not be executed." unless executed

      # The output file may show up late (seen on amazon cloud); wait for it,
      # but not forever.
      attempts = 0
      until File.exist?(txt_file)
        attempts += 1
        if attempts > OUTPUT_WAIT_ATTEMPTS
          raise RuntimeError, "`#{@options[:tesseract_command]}` did not produce #{txt_file}."
        end
        sleep 1
      end
      out = File.read(txt_file)
      File.unlink txt_file
      out
    end

    # Writes Tesseract configuration for the current source file
    # One "key value" line is written per entry of :tesseract_options.
    # Returns the config file path, or '' when there are no options.
    def write_configs
      return '' if @options[:tesseract_options].empty?
      path = FileHandler.create_temp_file("#{@hash}.config")
      File.open(path.to_s, "w+") do |f|
        @options[:tesseract_options].each { |k,v| f << "#{k} #{v}\n" }
      end
      path
    end
  end
end
@@ -0,0 +1,3 @@
1
module Tesseract
  # Version of the gem. Frozen so the shared constant cannot be modified in
  # place by accident.
  VERSION = '0.1.1'.freeze
end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: anjlab-tesseract
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Scott Davis
9
+ - Martin Samson
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-04-18 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: bundler
17
+ requirement: &70257151089280 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: 1.0.0
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *70257151089280
26
+ description: Ruby wrapper for google tesseract
27
+ email: jetviper21@gmail.com
28
+ executables: []
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - lib/tesseract.rb
33
+ - lib/tesseract/process.rb
34
+ - lib/tesseract/file_handler.rb
35
+ - lib/tesseract/dependency_checker.rb
36
+ - lib/tesseract/version.rb
37
+ homepage: http://github.com/scottdavis/ruby-tesseract
38
+ licenses: []
39
+ post_install_message:
40
+ rdoc_options:
41
+ - --charset=UTF-8
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ none: false
46
+ requirements:
47
+ - - ! '>='
48
+ - !ruby/object:Gem::Version
49
+ version: 1.8.6
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: 1.3.6
56
+ requirements: []
57
+ rubyforge_project:
58
+ rubygems_version: 1.8.15
59
+ signing_key:
60
+ specification_version: 3
61
+ summary: Ruby wrapper for google tesseract
62
+ test_files: []