tesseract 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +3 -0
- data/Gemfile.lock +15 -0
- data/Rakefile +9 -0
- data/Readme.md +22 -0
- data/tesseract.gemspec +20 -0
- data/tesseract.rb +54 -0
- data/tesseract/dependency_checker.rb +40 -0
- data/tesseract/file_handler.rb +20 -0
- data/test/photo.jpeg +0 -0
- data/test/tesseract_test.rb +82 -0
- data/test/test_helper.rb +10 -0
- metadata +94 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/Rakefile
ADDED
data/Readme.md
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Ruby Tesseract
|
2
|
+
|
3
|
+
This is a library for using the Tesseract OCR engine in Ruby applications.
|
4
|
+
|
5
|
+
## Dependencies
|
6
|
+
|
7
|
+
1. [Tesseract](http://code.google.com/p/tesseract-ocr/)
|
8
|
+
2. [ImageMagick](http://www.imagemagick.org/script/index.php) - Note: the command-line program `convert` must be on the system path so that Ruby can run it
|
9
|
+
3. *nix based operating system
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
*Please note: the default language is English.*
|
14
|
+
|
15
|
+
tess = Tesseract::Process.new("photo.jpg")
|
16
|
+
tess.to_s
|
17
|
+
|
18
|
+
Config options are also supported
|
19
|
+
|
20
|
+
tess = Tesseract::Process.new("photo.jpg", {:lang => 'some language', :chop_enable => 0})
|
21
|
+
tess.to_s
|
22
|
+
|
data/tesseract.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Gem specification for the tesseract gem, a Ruby wrapper around the
# tesseract OCR command-line tool.
Gem::Specification.new do |s|
  # Ask git for the tracked files once; both the packaged file list and the
  # executables list are derived from the same output.
  tracked_files = `git ls-files`.split("\n")

  s.name        = %q{tesseract}
  s.version     = '0.0.1'
  s.platform    = Gem::Platform::RUBY

  s.authors     = ["Scott Davis"]
  s.description = %q{Ruby wrapper for google tesseract}
  s.summary     = %q{Ruby wrapper for google tesseract}
  s.email       = %q{jetviper21@gmail.com}
  # Time is a core class, whereas Date needs `require 'date'`, which this
  # gemspec never performs. The resulting string has the same YYYY-MM-DD form.
  s.date        = Time.now.strftime('%Y-%m-%d')
  s.files       = tracked_files
  # Anything tracked under bin/ is exposed as an executable (by basename).
  s.executables = tracked_files.map { |f| f =~ /^bin\/(.*)/ ? $1 : nil }.compact
  # NOTE(review): the main file tesseract.rb is listed at the gem root, not
  # under this directory - confirm that this require path is intended.
  s.require_path = 'tesseract'
  s.homepage     = %q{http://github.com/jetviper21/ruby-tesseract}
  s.rdoc_options = ["--charset=UTF-8"]
  # Single source of truth for the RubyGems requirement; an earlier ">= 0"
  # assignment was always overwritten by this one.
  s.required_rubygems_version = ">= 1.3.6"
  s.add_development_dependency "bundler", ">= 1.0.0"
end
|
data/tesseract.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
Dir["tesseract/*.rb"].each { |file| require file }
|
2
|
+
require 'pathname'
|
3
|
+
require 'digest/md5'
|
4
|
+
module Tesseract
  # Runs OCR on a single image: the image is first converted to a TIFF with
  # ImageMagick's `convert`, then handed to the `tesseract` command-line tool,
  # and the text file tesseract writes is read back.
  #
  #   tess = Tesseract::Process.new("photo.jpg", {:lang => 'eng', :chop_enable => 0})
  #   tess.to_s
  class Process
    attr_reader :image
    attr_accessor :lang
    CONVERT_COMMAND = 'convert'
    TESSERACT_COMMAND = 'tesseract'

    # image_name - path of the image to recognise.
    # options    - Hash; :lang selects the tesseract language (default 'eng'),
    #              every other pair is written to a tesseract config file.
    #
    # Raises whatever DependencyChecker.check! raises when a dependency is
    # missing.
    def initialize(image_name, options = {})
      DependencyChecker.check!
      @image = Pathname.new(image_name)
      # Sub-second time plus a random component keeps the temp file names of
      # two instances created for the same image in the same second apart.
      @hash = Digest::MD5.hexdigest("#{@image}-#{Time.now.to_f}-#{rand}")
      # Work on a copy so the caller's hash is not modified when :lang is
      # pulled out of the config options.
      options = options.dup
      lang = options.delete(:lang)
      @lang = lang.nil? ? 'eng' : lang
      @options = options
    end

    # Returns the recognised text; the OCR run happens once and is memoised.
    def to_s
      @out ||= process!
    end

    # Converts the image, runs tesseract and returns the recognised text with
    # a leading "/" removed from each line. Temp files are cleaned up even if
    # a step raises.
    def process!
      temp_image = to_tiff
      text = tesseract_translation(temp_image)
      text.gsub(/^\//, '')
    ensure
      FileHandler.cleanup!
    end

    # Converts the source image to a TIFF in the temp directory and returns
    # the TIFF's path.
    def to_tiff
      temp_file = FileHandler.create_temp_file("#{@hash}.tif")
      # Argument-list form bypasses the shell, so paths containing spaces or
      # shell metacharacters are passed through verbatim.
      system(CONVERT_COMMAND, image.to_s, temp_file.to_s)
      temp_file
    end

    # Runs tesseract on image_file and returns the contents of the text file
    # it produces. Raises Errno::ENOENT (from File.read) if no output exists.
    def tesseract_translation(image_file)
      output_base = FileHandler.create_temp_file("#{@hash}")
      # tesseract appends ".txt" to the output base; register that path too
      # so cleanup removes the file that is actually written.
      output_file = FileHandler.create_temp_file("#{@hash}.txt")
      config_file = write_configs

      command = [TESSERACT_COMMAND, image_file.to_s, output_base.to_s, '-l', @lang.to_s]
      command << config_file.to_s unless config_file.to_s.empty?
      # Redirect via spawn options instead of the bash-only "&>" operator,
      # which a POSIX sh parses as "run in background".
      system(*command, :out => File::NULL, :err => File::NULL)
      File.read(output_file.to_s)
    end

    # Writes the remaining options as "key value" lines to a temp config file
    # and returns its path, or '' when there are no options.
    def write_configs
      return '' if @options.empty?
      path = FileHandler.create_temp_file("#{@hash}.config")
      File.open(path, "w+") do |f|
        @options.each { |k,v| f << "#{k} #{v}\n" }
      end
      path
    end

  end

end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Tesseract
  # Verifies that the host can run the OCR pipeline: a supported platform
  # string plus the `tesseract` and `convert` commands being locatable.
  class DependencyChecker
    # Messages are kept as constants so tests can refer to them.
    IMAGE_MAGICK_ERROR = "ImageMagick \"convert\" command not found! Make sure ImageMagick is installed and in the system path"
    TESSERACT_ERROR = "\"tesseract\" command not found! Make sure tesseract is installed and in the system path"
    OS_ERROR = "Only Unix Based enviroments are supported Mac, Linux, etc."

    # All checks are public singleton methods on the class.
    class << self
      # Runs every check in order (platform, tesseract, ImageMagick).
      # Returns true when all pass; raises Exception from the first failure.
      def check!
        check_os!
        check_for_tesseract!
        check_for_imagemagick!
        true
      end

      # Executes cmd through backticks and returns its standard output.
      # Kept as a separate method so tests can mock command execution.
      def run_cmd(cmd)
        `#{cmd}`
      end

      # Returns true when RUBY_PLATFORM mentions darwin, linux or unix;
      # raises Exception with OS_ERROR otherwise. The constant is read on
      # every call.
      def check_os!
        return true if ::RUBY_PLATFORM =~ /darwin|linux|unix/

        raise Exception, OS_ERROR
      end

      # Raises Exception with IMAGE_MAGICK_ERROR when `which convert`
      # prints nothing.
      def check_for_imagemagick!
        located = run_cmd('which convert')
        raise Exception, IMAGE_MAGICK_ERROR if located.empty?
      end

      # Raises Exception with TESSERACT_ERROR when `which tesseract`
      # prints nothing.
      def check_for_tesseract!
        located = run_cmd('which tesseract')
        raise Exception, TESSERACT_ERROR if located.empty?
      end
    end

  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
require 'tempfile'
|
3
|
+
module Tesseract
  # Builds paths inside the system temp directory and remembers them so they
  # can all be deleted later with cleanup!.
  class FileHandler
    # Paths handed out since the last cleanup! (class-level, shared by all
    # callers).
    @tempfiles = []

    # Returns a Pathname for filename inside Dir.tmpdir and records it for
    # later cleanup. No file is created on disk here.
    def self.create_temp_file(filename)
      file = Pathname.new(Dir.tmpdir).join(filename)
      @tempfiles << file
      file
    end

    # Deletes every recorded path that exists on disk, then forgets them all
    # so the list does not grow across runs.
    def self.cleanup!
      @tempfiles.each do |file|
        # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
        File.unlink(file.to_s) if File.exist?(file.to_s)
      end
      @tempfiles.clear
    end

  end
end
|
data/test/photo.jpeg
ADDED
Binary file
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'test/unit/assertions'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'shoulda'
|
5
|
+
require 'test_helper'
|
6
|
+
require 'mocha'
|
7
|
+
require '../tesseract/tesseract'
|
8
|
+
|
9
|
+
# Tests for Tesseract::Process and the dependency checks it runs on
# construction. Uses shoulda contexts and mocha expectations.
class TesseractTest < Test::Unit::TestCase
  TEST_FILE = File.join(File.dirname(__FILE__), 'photo.jpeg')

  context "dependency os check fails windows" do
    setup do
      # Temporarily replace RUBY_PLATFORM; the constant-reassignment warning
      # goes to STDERR, hence silence_stream.
      @old_val = RUBY_PLATFORM
      silence_stream(STDERR) { Object.const_set("RUBY_PLATFORM", 'windows') }
    end
    should "throw exception" do
      # Compare the raised message explicitly: a second argument to
      # assert_raises is only the failure message, not an expectation.
      error = assert_raises(Exception) do
        Tesseract::Process.new(TEST_FILE)
      end
      assert_equal Tesseract::DependencyChecker::OS_ERROR, error.message
    end
    teardown do
      silence_stream(STDERR) { Object.const_set("RUBY_PLATFORM", @old_val) }
    end
  end

  context "dependency imagemagick fails" do
    setup do
      Tesseract::DependencyChecker.expects(:run_cmd).with("which tesseract").returns('foo').once
      Tesseract::DependencyChecker.expects(:run_cmd).with("which convert").returns('').once
    end
    should "throw exception" do
      error = assert_raises(Exception) do
        Tesseract::Process.new(TEST_FILE)
      end
      assert_equal Tesseract::DependencyChecker::IMAGE_MAGICK_ERROR, error.message
    end
  end

  context "dependency tesseract fails" do
    setup do
      Tesseract::DependencyChecker.expects(:run_cmd).with("which tesseract").returns('').once
    end
    should "throw exception" do
      error = assert_raises(Exception) do
        Tesseract::Process.new(TEST_FILE)
      end
      assert_equal Tesseract::DependencyChecker::TESSERACT_ERROR, error.message
    end
  end

  context "tesseract" do
    setup do
      @tess = Tesseract::Process.new(TEST_FILE)
    end
    should "return text" do
      assert !@tess.to_s.empty?
    end
    should "have lang of eng" do
      assert_equal 'eng', @tess.lang
    end
  end

  context "tesseract diff lang" do
    setup do
      @tess = Tesseract::Process.new(TEST_FILE, {:lang => 'butts'})
    end
    should "have lang of butts" do
      assert_equal 'butts', @tess.lang
    end
  end

  context "tesseract configs" do
    setup do
      @tess = Tesseract::Process.new(TEST_FILE, {:chop_enable=>0})
    end
    should "return text" do
      assert !@tess.to_s.empty?
    end
    should "have lang of eng" do
      assert_equal 'eng', @tess.lang
    end
  end

end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tesseract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Scott Davis
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-02-22 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: bundler
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 0
|
33
|
+
- 0
|
34
|
+
version: 1.0.0
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id001
|
37
|
+
description: Ruby wrapper for google tesseract
|
38
|
+
email: jetviper21@gmail.com
|
39
|
+
executables: []
|
40
|
+
|
41
|
+
extensions: []
|
42
|
+
|
43
|
+
extra_rdoc_files: []
|
44
|
+
|
45
|
+
files:
|
46
|
+
- Gemfile
|
47
|
+
- Gemfile.lock
|
48
|
+
- Rakefile
|
49
|
+
- Readme.md
|
50
|
+
- tesseract.gemspec
|
51
|
+
- tesseract.rb
|
52
|
+
- tesseract/dependency_checker.rb
|
53
|
+
- tesseract/file_handler.rb
|
54
|
+
- test/photo.jpeg
|
55
|
+
- test/tesseract_test.rb
|
56
|
+
- test/test_helper.rb
|
57
|
+
has_rdoc: true
|
58
|
+
homepage: http://github.com/jetviper21/ruby-tesseract
|
59
|
+
licenses: []
|
60
|
+
|
61
|
+
post_install_message:
|
62
|
+
rdoc_options:
|
63
|
+
- --charset=UTF-8
|
64
|
+
require_paths:
|
65
|
+
- tesseract
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
hash: 23
|
81
|
+
segments:
|
82
|
+
- 1
|
83
|
+
- 3
|
84
|
+
- 6
|
85
|
+
version: 1.3.6
|
86
|
+
requirements: []
|
87
|
+
|
88
|
+
rubyforge_project:
|
89
|
+
rubygems_version: 1.4.2
|
90
|
+
signing_key:
|
91
|
+
specification_version: 3
|
92
|
+
summary: Ruby wrapper for google tesseract
|
93
|
+
test_files: []
|
94
|
+
|