tesseract 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Fetch gems over TLS so downloads cannot be tampered with in transit.
source 'https://rubygems.org'

# Dependencies are declared in tesseract.gemspec.
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,15 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tesseract (0.0.1)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+
10
+ PLATFORMS
11
+ ruby
12
+
13
+ DEPENDENCIES
14
+ bundler (>= 1.0.0)
15
+ tesseract!
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
require 'rake'
require 'rake/testtask'

# `rake/rdoctask` is not shipped with current Rake releases; RDoc provides the
# replacement as `rdoc/task`. Fall back to the legacy file so old toolchains
# that lack `rdoc/task` keep working.
begin
  require 'rdoc/task'
rescue LoadError
  require 'rake/rdoctask'
end

# `rake test` runs every test/*_test.rb file with the library directory and
# the test directory on the load path.
Rake::TestTask.new do |t|
  t.libs << "tesseract"
  t.libs << "test"
  t.test_files = FileList['test/*_test.rb']
  t.verbose = true
end
data/Readme.md ADDED
@@ -0,0 +1,22 @@
1
+ # Ruby Tesseract
2
+
3
+ This is a library for using the Tesseract OCR engine in Ruby applications.
4
+
5
+ ## Dependencies
6
+
7
+ 1. [Tesseract](http://code.google.com/p/tesseract-ocr/)
8
+ 2. [ImageMagick](http://www.imagemagick.org/script/index.php) - Note the command line program `convert` needs to be accessible to ruby
9
+ 3. *nix based operating system
10
+
11
+ ## Usage
12
+
13
+ *Please note: the default language is English (`eng`).*
14
+
15
+ tess = Tesseract::Process.new("photo.jpg")
16
+ tess.to_s
17
+
18
+ Config options are also supported
19
+
20
+ tess = Tesseract::Process.new("photo.jpg", {:lang => 'some language', :chop_enable => 0})
21
+ tess.to_s
22
+
data/tesseract.gemspec ADDED
@@ -0,0 +1,20 @@
1
# Gem specification for the tesseract wrapper.
Gem::Specification.new do |s|
  s.name        = 'tesseract'
  s.version     = '0.0.1'
  s.platform    = Gem::Platform::RUBY

  s.authors     = ['Scott Davis']
  s.description = 'Ruby wrapper for google tesseract'
  s.summary     = 'Ruby wrapper for google tesseract'
  s.email       = 'jetviper21@gmail.com'
  s.homepage    = 'http://github.com/jetviper21/ruby-tesseract'

  # Build date as YYYY-MM-DD. Time is always loaded, whereas Date needs an
  # explicit `require 'date'` that this file never performed.
  s.date = Time.now.strftime('%Y-%m-%d')

  # Ask git once for the tracked files and derive both lists from the result.
  tracked_files = `git ls-files`.split("\n")
  s.files       = tracked_files
  # Executables are the tracked files under bin/, listed without the prefix.
  s.executables = tracked_files.map { |f| f[%r{^bin/(.*)}, 1] }.compact

  s.require_path = 'tesseract'
  s.rdoc_options = ['--charset=UTF-8']

  # Single authoritative requirement (an earlier ">= 0" assignment was always
  # overwritten by this one).
  s.required_rubygems_version = '>= 1.3.6'
  s.add_development_dependency 'bundler', '>= 1.0.0'
end
data/tesseract.rb ADDED
@@ -0,0 +1,54 @@
1
# Load the helper classes that sit in the tesseract/ directory next to this
# file. The glob is anchored to this file's location so it works regardless of
# the process's current working directory.
Dir[File.expand_path('../tesseract/*.rb', __FILE__)].sort.each { |file| require file }
require 'pathname'
require 'digest/md5'

module Tesseract
  # Converts an image to TIFF with ImageMagick's `convert`, runs the
  # `tesseract` binary on it and exposes the recognised text.
  #
  #   Tesseract::Process.new("photo.jpg").to_s
  #   Tesseract::Process.new("photo.jpg", :lang => 'deu', :chop_enable => 0).to_s
  class Process
    attr_reader :image
    attr_accessor :lang
    CONVERT_COMMAND = 'convert'
    TESSERACT_COMMAND = 'tesseract'

    # image_name - path of the image to recognise.
    # options    - Hash; :lang selects the tesseract language (default 'eng'),
    #              every other pair is written to a tesseract config file as
    #              "key value".
    #
    # Raises whatever DependencyChecker.check! raises when a prerequisite is
    # missing.
    def initialize(image_name, options = {})
      DependencyChecker.check!
      @image = Pathname.new(image_name)
      # Unique-ish stem for all temporary files of this run.
      @hash = Digest::MD5.hexdigest("#{@image}-#{Time.now}")
      # Work on a copy so the caller's hash is left untouched, and always drop
      # :lang (even when nil) so it never ends up in the config file.
      settings = options.dup
      @lang = settings.delete(:lang) || 'eng'
      @options = settings
    end

    # Returns the recognised text, running the OCR pipeline on first use and
    # caching the result afterwards.
    def to_s
      @out ||= process!
    end

    # Runs the full pipeline and returns the text with a leading "/" removed
    # from every line. Temporary files are removed even when a step fails.
    def process!
      temp_image = to_tiff
      text = tesseract_translation(temp_image)
      text.gsub(/^\//, '')
    ensure
      FileHandler.cleanup!
    end

    # Converts the source image to a temporary TIFF and returns its path.
    def to_tiff
      temp_file = FileHandler.create_temp_file("#{@hash}.tif")
      # Argument-list form bypasses the shell, so paths containing spaces or
      # shell metacharacters are passed through literally.
      system(CONVERT_COMMAND, image.to_s, temp_file.to_s)
      temp_file
    end

    # Runs tesseract on image_file and returns the contents of the text file
    # it produced. Raises Errno::ENOENT (from File.read) when tesseract did
    # not produce any output.
    def tesseract_translation(image_file)
      output_base = FileHandler.create_temp_file(@hash.to_s)
      # tesseract appends ".txt" to the output base; register that file too so
      # cleanup actually removes it.
      output_file = FileHandler.create_temp_file("#{@hash}.txt")

      command = [TESSERACT_COMMAND, image_file.to_s, output_base.to_s, '-l', @lang.to_s]
      config_file = write_configs.to_s
      command << config_file unless config_file.empty?

      # Output is discarded through spawn options rather than the bash-only
      # `&>` redirect, which POSIX `sh` does not understand.
      system(*command, :out => File::NULL, :err => File::NULL)
      File.read(output_file.to_s)
    end

    # Writes the extra options to a temporary config file, one "key value"
    # pair per line, and returns its path. Returns '' when there are no
    # options.
    def write_configs
      return '' if @options.empty?
      path = FileHandler.create_temp_file("#{@hash}.config")
      File.open(path.to_s, "w+") do |f|
        @options.each { |k, v| f << "#{k} #{v}\n" }
      end
      path
    end

  end

end
@@ -0,0 +1,40 @@
1
module Tesseract
  # Verifies that the platform and the external command line tools needed for
  # OCR are available before any work is attempted.
  class DependencyChecker
    # Error messages are exposed as constants so tests can refer to them.
    IMAGE_MAGICK_ERROR = 'ImageMagick "convert" command not found! Make sure ImageMagick is installed and in the system path'
    TESSERACT_ERROR = '"tesseract" command not found! Make sure tesseract is installed and in the system path'
    OS_ERROR = 'Only Unix Based enviroments are supported Mac, Linux, etc.'

    # Platform patterns accepted by check_os!.
    SUPPORTED_PLATFORMS = [/darwin/, /linux/, /unix/]

    class << self
      # Runs the OS, tesseract and ImageMagick checks in that order.
      # Returns true when all pass; raises Exception with the matching
      # message constant otherwise.
      def check!
        check_os!
        check_for_tesseract!
        check_for_imagemagick!
        true
      end

      # Thin wrapper around backticks; kept as its own method so tests can
      # stub command output.
      def run_cmd(cmd)
        `#{cmd}`
      end

      # Returns true when RUBY_PLATFORM looks like macOS, Linux or Unix;
      # raises Exception with OS_ERROR otherwise. The constant is read at call
      # time.
      def check_os!
        supported = SUPPORTED_PLATFORMS.any? { |pattern| pattern === ::RUBY_PLATFORM }
        raise Exception, OS_ERROR unless supported
        true
      end

      # Raises Exception with IMAGE_MAGICK_ERROR when `which convert` prints
      # nothing.
      def check_for_imagemagick!
        located = run_cmd('which convert')
        raise Exception, IMAGE_MAGICK_ERROR if located.empty?
      end

      # Raises Exception with TESSERACT_ERROR when `which tesseract` prints
      # nothing.
      def check_for_tesseract!
        located = run_cmd('which tesseract')
        raise Exception, TESSERACT_ERROR if located.empty?
      end
    end

  end
end
@@ -0,0 +1,20 @@
1
+ require 'pathname'
2
+ require 'tempfile'
3
module Tesseract
  # Hands out paths inside the system temp directory and remembers them so
  # they can all be removed in one call.
  class FileHandler
    # Paths handed out since the last cleanup!.
    @tempfiles = []

    # Builds a path for filename inside Dir.tmpdir, records it for later
    # cleanup and returns it as a Pathname. The file itself is not created.
    def self.create_temp_file(filename)
      file = Pathname.new(Dir.tmpdir).join(filename)
      @tempfiles << file
      file
    end

    # Deletes every recorded path that exists on disk, then forgets all
    # recorded paths so later calls do not re-scan stale entries.
    def self.cleanup!
      @tempfiles.each do |file|
        # File.exist? is used because File.exists? was removed in Ruby 3.2.
        File.unlink(file.to_s) if File.exist?(file.to_s)
      end
      @tempfiles.clear
    end

  end
end
data/test/photo.jpeg ADDED
Binary file
@@ -0,0 +1,82 @@
1
+ require 'test/unit'
2
+ require 'test/unit/assertions'
3
+ require 'rubygems'
4
+ require 'shoulda'
5
+ require 'test_helper'
6
+ require 'mocha'
7
+ require '../tesseract/tesseract'
8
+
9
# End-to-end and dependency-check tests for Tesseract::Process.
class TesseractTest < Test::Unit::TestCase
  TEST_FILE = File.join(File.dirname(__FILE__), 'photo.jpeg')

  # Replaces the global RUBY_PLATFORM constant, with STDERR silenced while the
  # constant is reassigned.
  def override_platform(value)
    silence_stream(STDERR) { Object.const_set("RUBY_PLATFORM", value) }
  end

  # Expects exactly one `which <tool>` lookup and makes it return output.
  def expect_which(tool, output)
    Tesseract::DependencyChecker.expects(:run_cmd).with("which #{tool}").returns(output).once
  end

  context "dependency os check fails windows" do
    setup do
      @original_platform = RUBY_PLATFORM
      override_platform('windows')
    end

    should "throw exception" do
      assert_raises(Exception, Tesseract::DependencyChecker::OS_ERROR) { Tesseract::Process.new(TEST_FILE) }
    end

    teardown { override_platform(@original_platform) }
  end

  context "dependency imagemagic fails" do
    setup do
      expect_which('tesseract', 'foo')
      expect_which('convert', '')
    end

    should "throw exception" do
      assert_raises(Exception, Tesseract::DependencyChecker::IMAGE_MAGICK_ERROR) { Tesseract::Process.new(TEST_FILE) }
    end
  end

  context "dependency tesseract fails" do
    setup { expect_which('tesseract', '') }

    should "throw exception" do
      assert_raises(Exception, Tesseract::DependencyChecker::TESSERACT_ERROR) { Tesseract::Process.new(TEST_FILE) }
    end
  end

  context "tesseract" do
    setup { @ocr = Tesseract::Process.new(TEST_FILE) }

    should "return text" do
      assert(!@ocr.to_s.empty?)
    end

    should "hanve lang of eng" do
      assert_equal('eng', @ocr.lang)
    end
  end

  context "tesseract diff lang" do
    setup { @ocr = Tesseract::Process.new(TEST_FILE, {:lang => 'butts'}) }

    should "have lang of butts" do
      assert_equal('butts', @ocr.lang)
    end
  end

  context "tesseract configs" do
    setup { @ocr = Tesseract::Process.new(TEST_FILE, {:chop_enable => 0}) }

    should "return text" do
      assert(!@ocr.to_s.empty?)
    end

    should "hanve lang of eng" do
      assert_equal('eng', @ocr.lang)
    end
  end

end
@@ -0,0 +1,10 @@
1
class Test::Unit::TestCase
  # Redirects stream (e.g. STDERR) to /dev/null while the block runs, then
  # points it back at its original target.
  #
  #   silence_stream(STDERR) { noisy_call }
  def silence_stream(stream)
    old_stream = stream.dup
    stream.reopen('/dev/null')
    stream.sync = true
    yield
  ensure
    # old_stream is nil if the dup itself failed; nothing to restore then.
    if old_stream
      stream.reopen(old_stream)
      # Release the duplicate descriptor so repeated calls do not leak one
      # file descriptor each.
      old_stream.close
    end
  end
end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tesseract
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Scott Davis
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-02-22 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: bundler
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 1
32
+ - 0
33
+ - 0
34
+ version: 1.0.0
35
+ type: :development
36
+ version_requirements: *id001
37
+ description: Ruby wrapper for google tesseract
38
+ email: jetviper21@gmail.com
39
+ executables: []
40
+
41
+ extensions: []
42
+
43
+ extra_rdoc_files: []
44
+
45
+ files:
46
+ - Gemfile
47
+ - Gemfile.lock
48
+ - Rakefile
49
+ - Readme.md
50
+ - tesseract.gemspec
51
+ - tesseract.rb
52
+ - tesseract/dependency_checker.rb
53
+ - tesseract/file_handler.rb
54
+ - test/photo.jpeg
55
+ - test/tesseract_test.rb
56
+ - test/test_helper.rb
57
+ has_rdoc: true
58
+ homepage: http://github.com/jetviper21/ruby-tesseract
59
+ licenses: []
60
+
61
+ post_install_message:
62
+ rdoc_options:
63
+ - --charset=UTF-8
64
+ require_paths:
65
+ - tesseract
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ hash: 3
72
+ segments:
73
+ - 0
74
+ version: "0"
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ hash: 23
81
+ segments:
82
+ - 1
83
+ - 3
84
+ - 6
85
+ version: 1.3.6
86
+ requirements: []
87
+
88
+ rubyforge_project:
89
+ rubygems_version: 1.4.2
90
+ signing_key:
91
+ specification_version: 3
92
+ summary: Ruby wrapper for google tesseract
93
+ test_files: []
94
+