tesseract 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,3 @@
# Bundler configuration for developing the gem.
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source 'https://rubygems.org'

# Pull runtime and development dependencies from tesseract.gemspec.
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,15 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tesseract (0.0.1)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+
10
+ PLATFORMS
11
+ ruby
12
+
13
+ DEPENDENCIES
14
+ bundler (>= 1.0.0)
15
+ tesseract!
data/Rakefile ADDED
@@ -0,0 +1,9 @@
require 'rake'
require 'rake/testtask'

# 'rake/rdoctask' is not shipped by newer Rake releases; the task now lives in
# RDoc as 'rdoc/task'. Try the current location first and fall back to the
# legacy one so the Rakefile loads on both old and new toolchains.
begin
  require 'rdoc/task'
rescue LoadError
  require 'rake/rdoctask'
end

# Defines the `test` task: runs every test/*_test.rb file with the
# "tesseract" and "test" directories added to the load path.
Rake::TestTask.new do |t|
  t.libs << "tesseract"
  t.libs << "test"
  t.test_files = FileList['test/*_test.rb']
  t.verbose = true
end
data/Readme.md ADDED
@@ -0,0 +1,22 @@
1
+ # Ruby Tesseract
2
+
3
+ This is a library for using the Tesseract OCR engine in Ruby applications.
4
+
5
+ ## Dependencies
6
+
7
+ 1. [Tesseract](http://code.google.com/p/tesseract-ocr/)
8
+ 2. [ImageMagick](http://www.imagemagick.org/script/index.php) - Note the command line program `convert` needs to be accessible to ruby
9
+ 3. *nix based operating system
10
+
11
+ ## Usage
12
+
13
+ *Please note: the default language is English.*
14
+
15
+ tess = Tesseract::Process.new("photo.jpg")
16
+ tess.to_s
17
+
18
+ Config options are also supported
19
+
20
+ tess = Tesseract::Process.new("photo.jpg", {:lang => 'some language', :chop_enable => 0})
21
+ tess.to_s
22
+
data/tesseract.gemspec ADDED
@@ -0,0 +1,20 @@
# Date is not guaranteed to be loaded when RubyGems evaluates a gemspec.
require 'date'

# Gem specification for the tesseract gem (Ruby wrapper for google tesseract).
Gem::Specification.new do |s|
  s.name        = %q{tesseract}
  s.version     = '0.0.1'
  s.platform    = Gem::Platform::RUBY

  s.authors     = ["Scott Davis"]
  s.description = %q{Ruby wrapper for google tesseract}
  s.summary     = %q{Ruby wrapper for google tesseract}
  s.email       = %q{jetviper21@gmail.com}
  s.date        = Date.today.to_s

  # Ask git for the tracked files once and derive both lists from the result.
  tracked_files = `git ls-files`.split("\n")
  s.files       = tracked_files
  # Executables are the names of tracked files that live under bin/.
  s.executables = tracked_files.map { |f| f =~ /^bin\/(.*)/ ? $1 : nil }.compact

  # NOTE(review): tesseract.rb sits at the gem root while the require path is
  # the tesseract/ directory — confirm `require 'tesseract'` resolves as intended.
  s.require_path = 'tesseract'
  s.homepage     = %q{http://github.com/jetviper21/ruby-tesseract}
  s.rdoc_options = ["--charset=UTF-8"]

  # Set once; the earlier ">= 0" assignment was always overwritten by this one.
  s.required_rubygems_version = ">= 1.3.6"
  s.add_development_dependency "bundler", ">= 1.0.0"
end
data/tesseract.rb ADDED
@@ -0,0 +1,54 @@
# Load the helper classes that live in the tesseract/ directory next to this
# file. Resolving the glob from __FILE__ (instead of the current working
# directory) makes the library loadable from anywhere; sorting keeps the load
# order deterministic.
Dir[File.expand_path('../tesseract/*.rb', __FILE__)].sort.each { |file| require file }
require 'pathname'
require 'digest/md5'

module Tesseract
  # Runs OCR on a single image: converts it to TIFF with ImageMagick's
  # `convert`, feeds the TIFF to the `tesseract` command line program and
  # returns the recognised text.
  class Process
    attr_reader :image
    attr_accessor :lang

    CONVERT_COMMAND = 'convert'
    TESSERACT_COMMAND = 'tesseract'

    # image_name - path of the image to read.
    # options    - Hash of tesseract config variables (e.g. :chop_enable => 0).
    #              The special key :lang selects the OCR language and defaults
    #              to 'eng'. The caller's hash is never modified.
    #
    # Raises whatever DependencyChecker.check! raises when a prerequisite is
    # missing.
    def initialize(image_name, options = {})
      DependencyChecker.check!
      @image = Pathname.new(image_name)
      # Sub-second time plus object_id keeps the temp file names of two
      # instances created for the same image in the same second apart.
      @hash = Digest::MD5.hexdigest("#{@image}-#{Time.now.to_f}-#{object_id}")
      # Work on a copy so deleting :lang does not alter the caller's hash, and
      # so a nil :lang never ends up as a line in the config file.
      @options = options.dup
      @lang = @options.delete(:lang) || 'eng'
    end

    # Returns the recognised text, running the OCR only on the first call.
    def to_s
      @out ||= process!
    end

    # Converts the image, runs tesseract and returns the text with a leading
    # "/" removed from each line. Temp files are removed even when a step fails.
    def process!
      temp_image = to_tiff
      text = tesseract_translation(temp_image)
      text.gsub(/^\//, '')
    ensure
      FileHandler.cleanup!
    end

    # Converts the source image to a temporary TIFF and returns its path.
    def to_tiff
      temp_file = FileHandler.create_temp_file("#{@hash}.tif")
      # Separate arguments bypass the shell, so paths containing spaces or
      # shell metacharacters are passed through untouched.
      system(CONVERT_COMMAND, image.to_s, temp_file.to_s)
      temp_file
    end

    # Runs tesseract on image_file and returns the contents of the text file
    # it produced. File.read raises Errno::ENOENT when tesseract wrote nothing.
    def tesseract_translation(image_file)
      output_base = FileHandler.create_temp_file("#{@hash}")
      # tesseract appends ".txt" to the output base; register that file as
      # well so cleanup! removes it instead of leaving it in the temp dir.
      output_file = FileHandler.create_temp_file("#{@hash}.txt")

      command = [TESSERACT_COMMAND, image_file.to_s, output_base.to_s, '-l', @lang.to_s]
      config_file = write_configs.to_s
      command << config_file unless config_file.empty?

      # Silence tesseract through spawn redirection rather than the bash-only
      # "&>" operator, which a POSIX sh reads as "run in background".
      system(*command, [:out, :err] => File::NULL)
      File.read(output_file.to_s)
    end

    # Writes the remaining options as "key value" lines to a temporary config
    # file and returns its path, or '' when there are no options.
    def write_configs
      return '' if @options.empty?
      path = FileHandler.create_temp_file("#{@hash}.config")
      File.open(path.to_s, "w+") do |f|
        @options.each { |k, v| f << "#{k} #{v}\n" }
      end
      path
    end

  end

end
@@ -0,0 +1,40 @@
module Tesseract
  # Verifies that the host can run the OCR pipeline: a Unix-like operating
  # system with the `tesseract` and ImageMagick `convert` programs on the PATH.
  class DependencyChecker
    # Messages are constants so tests can compare against them.
    IMAGE_MAGICK_ERROR = "ImageMagick \"convert\" command not found! Make sure ImageMagick is installed and in the system path"
    TESSERACT_ERROR = "\"tesseract\" command not found! Make sure tesseract is installed and in the system path"
    OS_ERROR = "Only Unix Based enviroments are supported Mac, Linux, etc."

    # RUBY_PLATFORM fragments accepted as Unix-like. BSD and Solaris are
    # included because their platform strings contain neither "linux" nor
    # "unix" and would otherwise be rejected.
    SUPPORTED_PLATFORMS = [/darwin/, /linux/, /unix/, /bsd/, /solaris/].freeze

    # Runs every check in order (OS, tesseract, ImageMagick).
    #
    # Returns true when all pass; raises Exception with one of the messages
    # above at the first failure.
    def self.check!
      check_os!
      check_for_tesseract!
      check_for_imagemagick!
      true
    end

    # The helpers below are public class methods: a bare `private` does not
    # apply to `def self.` definitions, so the original keyword had no effect.
    # They stay public so existing callers and mocks keep working.

    # Runs cmd in a subshell and returns its standard output (kept separate
    # for easy mocking).
    def self.run_cmd(cmd)
      `#{cmd}`
    end

    # Returns true on a supported platform, otherwise raises with OS_ERROR.
    # RUBY_PLATFORM is read at call time so tests can replace the constant.
    def self.check_os!
      return true if SUPPORTED_PLATFORMS.any? { |pattern| pattern =~ ::RUBY_PLATFORM }
      raise Exception, OS_ERROR
    end

    # Raises with IMAGE_MAGICK_ERROR when `which convert` prints nothing.
    def self.check_for_imagemagick!
      raise Exception, IMAGE_MAGICK_ERROR if run_cmd('which convert').strip.empty?
    end

    # Raises with TESSERACT_ERROR when `which tesseract` prints nothing.
    def self.check_for_tesseract!
      raise Exception, TESSERACT_ERROR if run_cmd('which tesseract').strip.empty?
    end

  end
end
@@ -0,0 +1,20 @@
require 'pathname'
require 'tempfile'
require 'tmpdir' # provides Dir.tmpdir

module Tesseract
  # Hands out paths inside the system temp directory and remembers them so
  # they can all be deleted in one call.
  # NOTE(review): the list is shared by the whole class, so cleanup! removes
  # files registered by every caller — confirm that is acceptable when several
  # OCR runs overlap.
  class FileHandler
    @tempfiles = []

    # Builds a Pathname for filename inside Dir.tmpdir, registers it for later
    # cleanup and returns it. The file itself is not created here.
    def self.create_temp_file(filename)
      file = Pathname.new(Dir.tmpdir).join(filename)
      @tempfiles << file
      file
    end

    # Deletes every registered path that exists on disk, then forgets them so
    # the list does not grow without bound and later calls do not re-check
    # files that are already gone.
    def self.cleanup!
      @tempfiles.each do |file|
        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        File.unlink(file.to_s) if File.exist?(file.to_s)
      end
      @tempfiles.clear
    end

  end
end
data/test/photo.jpeg ADDED
Binary file
@@ -0,0 +1,82 @@
require 'test/unit'
require 'test/unit/assertions'
require 'rubygems'
require 'shoulda'
require 'test_helper'
require 'mocha'
# Resolve the library relative to this file so the suite does not depend on
# the current working directory or on the name of the checkout directory.
require File.expand_path('../../tesseract', __FILE__)

# Tests for Tesseract::Process and the dependency checks it triggers. The
# "tesseract" contexts run the real tesseract and ImageMagick programs.
class TesseractTest < Test::Unit::TestCase
  TEST_FILE = File.join(File.dirname(__FILE__), 'photo.jpeg')

  context "dependency os check fails windows" do
    setup do
      # Swap RUBY_PLATFORM for the duration of the test; silence_stream hides
      # the "already initialized constant" warning.
      @old_val = RUBY_PLATFORM
      silence_stream(STDERR) { Object.const_set("RUBY_PLATFORM", 'windows') }
    end
    should "throw exception" do
      assert_raises Exception, Tesseract::DependencyChecker::OS_ERROR do
        Tesseract::Process.new(TEST_FILE)
      end
    end
    teardown do
      silence_stream(STDERR) { Object.const_set("RUBY_PLATFORM", @old_val) }
    end
  end

  context "dependency imagemagic fails" do
    setup do
      Tesseract::DependencyChecker.expects(:run_cmd).with("which tesseract").returns('foo').once
      Tesseract::DependencyChecker.expects(:run_cmd).with("which convert").returns('').once
    end
    should "throw exception" do
      assert_raises Exception, Tesseract::DependencyChecker::IMAGE_MAGICK_ERROR do
        Tesseract::Process.new(TEST_FILE)
      end
    end
  end

  context "dependency tesseract fails" do
    setup do
      Tesseract::DependencyChecker.expects(:run_cmd).with("which tesseract").returns('').once
    end
    should "throw exception" do
      assert_raises Exception, Tesseract::DependencyChecker::TESSERACT_ERROR do
        Tesseract::Process.new(TEST_FILE)
      end
    end
  end

  context "tesseract" do
    setup do
      @tess = Tesseract::Process.new(TEST_FILE)
    end
    should "return text" do
      assert !@tess.to_s.empty?
    end
    should "have lang of eng" do
      assert_equal 'eng', @tess.lang
    end
  end

  context "tesseract diff lang" do
    setup do
      @tess = Tesseract::Process.new(TEST_FILE, {:lang => 'butts'})
    end
    should "have lang of butts" do
      assert_equal 'butts', @tess.lang
    end
  end

  context "tesseract configs" do
    setup do
      @tess = Tesseract::Process.new(TEST_FILE, {:chop_enable=>0})
    end
    should "return text" do
      assert !@tess.to_s.empty?
    end
    should "have lang of eng" do
      assert_equal 'eng', @tess.lang
    end
  end

end
@@ -0,0 +1,10 @@
class Test::Unit::TestCase
  # Runs the given block with stream (e.g. STDERR) redirected to the null
  # device, then points the stream back at its original target.
  #
  # stream - an IO object to silence while the block runs.
  #
  # Returns the value of the block.
  def silence_stream(stream)
    old_stream = stream.dup
    stream.reopen(File::NULL)
    stream.sync = true
    yield
  ensure
    # old_stream is nil when dup itself failed; in that case there is nothing
    # to restore. Otherwise restore first, then close the duplicate so each
    # call does not leak a file descriptor.
    if old_stream
      stream.reopen(old_stream)
      old_stream.close
    end
  end
end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tesseract
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Scott Davis
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-02-22 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: bundler
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 1
32
+ - 0
33
+ - 0
34
+ version: 1.0.0
35
+ type: :development
36
+ version_requirements: *id001
37
+ description: Ruby wrapper for google tesseract
38
+ email: jetviper21@gmail.com
39
+ executables: []
40
+
41
+ extensions: []
42
+
43
+ extra_rdoc_files: []
44
+
45
+ files:
46
+ - Gemfile
47
+ - Gemfile.lock
48
+ - Rakefile
49
+ - Readme.md
50
+ - tesseract.gemspec
51
+ - tesseract.rb
52
+ - tesseract/dependency_checker.rb
53
+ - tesseract/file_handler.rb
54
+ - test/photo.jpeg
55
+ - test/tesseract_test.rb
56
+ - test/test_helper.rb
57
+ has_rdoc: true
58
+ homepage: http://github.com/jetviper21/ruby-tesseract
59
+ licenses: []
60
+
61
+ post_install_message:
62
+ rdoc_options:
63
+ - --charset=UTF-8
64
+ require_paths:
65
+ - tesseract
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ hash: 3
72
+ segments:
73
+ - 0
74
+ version: "0"
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ hash: 23
81
+ segments:
82
+ - 1
83
+ - 3
84
+ - 6
85
+ version: 1.3.6
86
+ requirements: []
87
+
88
+ rubyforge_project:
89
+ rubygems_version: 1.4.2
90
+ signing_key:
91
+ specification_version: 3
92
+ summary: Ruby wrapper for google tesseract
93
+ test_files: []
94
+