anjlab-tesseract 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/tesseract.rb ADDED
@@ -0,0 +1,11 @@
1
# Gem entry point: loads each component from the sibling "tesseract"
# directory, then the standard libraries used by those components.
lib_dir = File.join(File.dirname(__FILE__), 'tesseract')
%w[dependency_checker file_handler process].each do |component|
  require File.expand_path(File.join(lib_dir, component))
end

%w[pathname digest/md5 shellwords].each { |stdlib| require stdlib }

# Top-level namespace; the classes are defined in the files required above.
module Tesseract
end
@@ -0,0 +1,40 @@
1
module Tesseract
  # Checks that the host can run the OCR pipeline: a supported platform and
  # the `tesseract` and ImageMagick `convert` executables on the PATH.
  class DependencyChecker
    # Raised when the platform is unsupported or a required executable is
    # missing. Subclasses StandardError so that an ordinary `rescue` catches
    # it; code that rescues Exception keeps working as before.
    class DependencyError < StandardError; end

    # Kept as constants so tests can compare against them.
    IMAGE_MAGICK_ERROR = "ImageMagick \"convert\" command not found! Make sure ImageMagick is installed and in the system path"
    TESSERACT_ERROR = "\"tesseract\" command not found! Make sure tesseract is installed and in the system path"
    OS_ERROR = "Only Unix Based enviroments are supported Mac, Linux, etc."

    # Runs every check in order (platform, tesseract, ImageMagick).
    # Returns true when all pass; raises DependencyError on the first failure.
    def self.check!
      check_os!
      check_for_tesseract!
      check_for_imagemagick!
      true
    end

    # NOTE: a bare `private` does not apply to `def self.` methods, so the
    # helpers below have always been publicly callable. They are deliberately
    # left that way so existing callers and test doubles keep working.

    # Executes +cmd+ in a subshell and returns its standard output.
    # Isolated in its own method so tests can stub it.
    def self.run_cmd(cmd)
      `#{cmd}`
    end

    # Returns true when RUBY_PLATFORM mentions darwin, linux or unix;
    # raises DependencyError with OS_ERROR otherwise.
    def self.check_os!
      return true if ::RUBY_PLATFORM =~ /darwin|linux|unix/

      raise DependencyError, OS_ERROR
    end

    # Raises DependencyError when `which convert` prints nothing.
    def self.check_for_imagemagick!
      raise DependencyError, IMAGE_MAGICK_ERROR if run_cmd('which convert').to_s.strip.empty?
    end

    # Raises DependencyError when `which tesseract` prints nothing.
    def self.check_for_tesseract!
      raise DependencyError, TESSERACT_ERROR if run_cmd('which tesseract').to_s.strip.empty?
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'pathname'
2
+ require 'tempfile'
3
module Tesseract
  # Hands out paths inside the system temp directory and remembers them so
  # they can all be removed later with a single call.
  class FileHandler
    # Paths handed out since the last cleanup (class-level state shared by
    # every caller).
    @tempfiles = []

    # Builds a path named +filename+ inside Dir.tmpdir and records it for a
    # later cleanup!. The file itself is not created here.
    # Returns the path as a Pathname.
    def self.create_temp_file(filename)
      path = Pathname.new(Dir.tmpdir).join(filename)
      @tempfiles << path
      path
    end

    # Deletes every recorded path that still exists on disk, then forgets
    # them so the list does not grow for the lifetime of the process.
    def self.cleanup!
      @tempfiles.each do |path|
        # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
        File.unlink(path.to_s) if File.exist?(path.to_s)
      end
      @tempfiles.clear
    end
  end
end
@@ -0,0 +1,127 @@
1
+ require 'shellwords'
2
module Tesseract
  # Converts an image to TIFF with ImageMagick `convert`, runs `tesseract`
  # on the result, and returns the recognized text.
  class Process
    attr_reader :image

    CONVERT_COMMAND = 'convert'
    TESSERACT_COMMAND = 'tesseract'
    # How many one-second polls to wait for tesseract's output file before
    # giving up.
    OUTPUT_WAIT_ATTEMPTS = 10

    # Initialize a Tesseract translation process.
    # image_name is the file to translate.
    # options can be of the following:
    # * tesseract_options  Hash of options written to a tesseract config file
    # * convert_options    Hash with :input and :output Arrays of convert
    #                      arguments (placed before / after the source image)
    # * lang               Image input language (eng, fra, etc.)
    # * convert_command    Convert binary name/path
    # * tesseract_command  Tesseract binary name/path
    # * check_deps         Boolean. If true, verifies dependencies. Defaults to false
    def initialize(image_name, options = {})
      defaults = {
        :tesseract_options => {},
        :convert_options => { :input => [], :output => [] },
        :lang => :eng,
        :convert_command => CONVERT_COMMAND,
        :tesseract_command => TESSERACT_COMMAND,
        :check_deps => false
      }
      @out = nil
      @image = Pathname.new(image_name)
      # Sub-second time, pid and a random component keep temp file names
      # distinct when several instances are created for the same image.
      @hash = Digest::MD5.hexdigest("#{@image}-#{Time.now.to_f}-#{$$}-#{rand}")

      merge_options! defaults, options
      DependencyChecker.check! if @options[:check_deps]
    end

    # Combines +defaults+ with the caller's +options+ and stores the result
    # in @options. Neither argument is modified.
    # * :tesseract_options are merged key by key (caller wins).
    # * :convert_options are concatenated per default key (:input, :output),
    #   preserving order and repeated tokens; unknown keys are ignored.
    # * every other key is taken from +options+ when present.
    # Returns the merged options hash.
    def merge_options!(defaults, options)
      merged = defaults.merge(options)

      default_tesseract = defaults[:tesseract_options] || {}
      merged[:tesseract_options] = default_tesseract.merge(options[:tesseract_options] || {})

      requested_convert = options[:convert_options] || {}
      convert = {}
      (defaults[:convert_options] || {}).each do |key, default_args|
        # `+` rather than `|`: a set union would drop repeated tokens such
        # as the second "300" in %w[-density 300 -depth 300].
        convert[key] = default_args + Array(requested_convert[key])
      end
      merged[:convert_options] = convert

      @options = merged
    end

    # Sets the input language and discards any cached text so the next
    # to_s call runs OCR again with the new language.
    def lang=(lang)
      @out = nil
      @options[:lang] = lang
    end

    # Returns the configured input language.
    def lang
      @options[:lang]
    end

    # Returns the recognized text, running the OCR pipeline on first use and
    # caching the result.
    def to_s
      @out ||= process!
    end

    # Process the image into text.
    # Temp files registered with FileHandler are removed whether or not the
    # conversion or OCR step raises. A "/" at the start of any output line
    # is stripped from the returned text.
    def process!
      temp_image = to_tiff
      text = tesseract_translation(temp_image)
      text.gsub(%r{^/}, '')
    ensure
      FileHandler.cleanup!
    end

    # Generates the convert command line for writing the source image to
    # +temp_file+. The image and output paths are shell-escaped; the
    # configured convert arguments are inserted as given.
    def generate_convert_command(temp_file)
      parts = [@options[:convert_command]]
      parts.concat(@options[:convert_options][:input])
      parts << Shellwords.shellescape(@image.to_s)
      parts.concat(@options[:convert_options][:output])
      parts << Shellwords.shellescape(temp_file.to_s)
      parts.join(' ')
    end

    # Converts the source image to a tiff file.
    # Returns the temp file path. Raises RuntimeError when the convert
    # command could not be started at all (system returned nil).
    def to_tiff
      temp_file = FileHandler.create_temp_file("#{@hash}.tif")
      executed = system generate_convert_command(temp_file)
      raise RuntimeError, "`#{@options[:convert_command]}` could not be executed." if executed.nil?
      temp_file
    end

    # Translate a tiff file into text.
    # Runs tesseract with the image, an output base name, the language and
    # the optional config file, then reads and deletes the "<base>.txt"
    # output. Raises RuntimeError when the command fails or the output file
    # never appears.
    def tesseract_translation(image_file)
      temp_text_file = FileHandler.create_temp_file(@hash.to_s)
      config_file = write_configs.to_s
      txt_file = "#{temp_text_file}.txt"

      command = [
        @options[:tesseract_command],
        Shellwords.shellescape(image_file.to_s),
        Shellwords.shellescape(temp_text_file.to_s),
        '-l',
        Shellwords.shellescape(@options[:lang].to_s),
        # shellescape('') would yield a literal '' argument, so skip it.
        config_file.empty? ? nil : Shellwords.shellescape(config_file),
        # POSIX redirection: `&>` is a bashism that plain sh treats as
        # "run in background", which made system return before tesseract
        # had finished.
        '> /dev/null 2>&1'
      ].compact.join(' ')

      executed = system command
      raise RuntimeError, "`#{@options[:tesseract_command]}` could not be executed." unless executed

      # Tolerate a short delay before the output becomes visible, but never
      # wait forever.
      attempts = 0
      until File.exist?(txt_file)
        attempts += 1
        if attempts > OUTPUT_WAIT_ATTEMPTS
          raise RuntimeError, "`#{@options[:tesseract_command]}` did not produce #{txt_file}."
        end
        sleep 1
      end

      out = File.read(txt_file)
      File.unlink txt_file
      out
    end

    # Writes Tesseract configuration for the current source file, one
    # "key value" pair per line.
    # Returns the config file path, or '' when there are no options.
    def write_configs
      return '' if @options[:tesseract_options].empty?
      path = FileHandler.create_temp_file("#{@hash}.config")
      File.open(path.to_s, 'w+') do |f|
        @options[:tesseract_options].each { |k, v| f << "#{k} #{v}\n" }
      end
      path
    end
  end
end
@@ -0,0 +1,3 @@
1
# Release number of this gem, exposed as Tesseract::VERSION.
module Tesseract
  VERSION = "0.1.1"
end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: anjlab-tesseract
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Scott Davis
9
+ - Martin Samson
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-04-18 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: bundler
17
+ requirement: &70257151089280 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: 1.0.0
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *70257151089280
26
+ description: Ruby wrapper for google tesseract
27
+ email: jetviper21@gmail.com
28
+ executables: []
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - lib/tesseract.rb
33
+ - lib/tesseract/process.rb
34
+ - lib/tesseract/file_handler.rb
35
+ - lib/tesseract/dependency_checker.rb
36
+ - lib/tesseract/version.rb
37
+ homepage: http://github.com/scottdavis/ruby-tesseract
38
+ licenses: []
39
+ post_install_message:
40
+ rdoc_options:
41
+ - --charset=UTF-8
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ none: false
46
+ requirements:
47
+ - - ! '>='
48
+ - !ruby/object:Gem::Version
49
+ version: 1.8.6
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: 1.3.6
56
+ requirements: []
57
+ rubyforge_project:
58
+ rubygems_version: 1.8.15
59
+ signing_key:
60
+ specification_version: 3
61
+ summary: Ruby wrapper for google tesseract
62
+ test_files: []