tesseract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -0
- data/Gemfile.lock +15 -0
- data/Rakefile +9 -0
- data/Readme.md +22 -0
- data/tesseract.gemspec +20 -0
- data/tesseract.rb +54 -0
- data/tesseract/dependency_checker.rb +40 -0
- data/tesseract/file_handler.rb +20 -0
- data/test/photo.jpeg +0 -0
- data/test/tesseract_test.rb +82 -0
- data/test/test_helper.rb +10 -0
- metadata +94 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/Rakefile
ADDED
data/Readme.md
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Ruby Tesseract
|
2
|
+
|
3
|
+
This is a library for using the Tesseract OCR engine in Ruby applications.
|
4
|
+
|
5
|
+
## Dependencies
|
6
|
+
|
7
|
+
1. [Tesseract](http://code.google.com/p/tesseract-ocr/)
|
8
|
+
2. [ImageMagick](http://www.imagemagick.org/script/index.php) - Note the command line program `convert` needs to be accessible to ruby
|
9
|
+
3. *nix based operating system
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
*Please note: the default language is English.*
|
14
|
+
|
15
|
+
tess = Tesseract::Process.new("photo.jpg")
|
16
|
+
tess.to_s
|
17
|
+
|
18
|
+
Config options are also supported
|
19
|
+
|
20
|
+
tess = Tesseract::Process.new("photo.jpg", {:lang => 'some language', :chop_enable => 0})
|
21
|
+
tess.to_s
|
22
|
+
|
data/tesseract.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Date is not guaranteed to be loaded by RubyGems itself, so require it
# explicitly before calling Date.today below.
require 'date'

# Gem specification for the tesseract OCR wrapper.
Gem::Specification.new do |s|
  s.name        = %q{tesseract}
  s.version     = '0.0.1'
  s.platform    = Gem::Platform::RUBY

  s.authors     = ["Scott Davis"]
  s.description = %q{Ruby wrapper for google tesseract}
  s.summary     = %q{Ruby wrapper for google tesseract}
  s.email       = %q{jetviper21@gmail.com}
  s.date        = Date.today.to_s

  # Ask git once and derive both the file list and the executables from it.
  tracked_files = `git ls-files`.split("\n")
  s.files       = tracked_files
  # Anything under bin/ is an executable; keep only the part after "bin/".
  s.executables = tracked_files.map { |f| f[%r{^bin/(.*)}, 1] }.compact

  # NOTE(review): tesseract.rb sits at the package root while the require
  # path is the tesseract/ directory -- confirm `require 'tesseract'`
  # resolves as intended for installed gems.
  s.require_path = 'tesseract'
  s.homepage     = %q{http://github.com/jetviper21/ruby-tesseract}
  s.rdoc_options = ["--charset=UTF-8"]

  # Single authoritative RubyGems requirement (an earlier ">= 0" assignment
  # was always overwritten by this one).
  s.required_rubygems_version = ">= 1.3.6"
  s.add_development_dependency "bundler", ">= 1.0.0"
end
|
data/tesseract.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# Load the helper classes that live in the tesseract/ directory next to this
# file. The path is anchored to this file (not the current working directory)
# and made absolute so `require` does not depend on $LOAD_PATH or on where the
# host program was started from.
Dir[File.join(File.dirname(File.expand_path(__FILE__)), 'tesseract', '*.rb')].sort.each { |file| require file }
require 'pathname'
require 'digest/md5'

module Tesseract
  # Runs OCR on a single image: the image is converted to TIFF with
  # ImageMagick's `convert`, handed to the `tesseract` command line program,
  # and the recognised text is read back from tesseract's output file.
  #
  #   tess = Tesseract::Process.new("photo.jpg", :lang => 'eng', :chop_enable => 0)
  #   tess.to_s
  class Process
    attr_reader :image
    attr_accessor :lang

    CONVERT_COMMAND = 'convert'
    TESSERACT_COMMAND = 'tesseract'

    # image_name - path of the image to recognise.
    # options    - Hash of settings. :lang selects the tesseract language
    #              (default 'eng'); every other pair is written to a
    #              tesseract config file as "key value".
    #
    # Raises whatever DependencyChecker.check! raises when a required
    # external program or platform is missing.
    def initialize(image_name, options = {})
      DependencyChecker.check!
      @image = Pathname.new(image_name)
      # Include sub-second time, pid and object id so two runs on the same
      # image within one second do not share temp file names.
      @hash = Digest::MD5.hexdigest("#{@image}-#{Time.now.to_f}-#{::Process.pid}-#{object_id}")
      # Work on a copy so the caller's hash is not modified by the delete.
      @options = options.dup
      requested_lang = @options.delete(:lang)
      @lang = requested_lang.nil? ? 'eng' : requested_lang
    end

    # Returns the recognised text, running OCR on first use and caching it.
    def to_s
      @out ||= process!
    end

    # Converts the image, runs tesseract and returns the recognised text with
    # a leading "/" removed from the start of every line. Temp files are
    # removed even when one of the steps raises.
    def process!
      temp_image = to_tiff
      text = tesseract_translation(temp_image)
      text.gsub(/^\//, '')
    ensure
      FileHandler.cleanup!
    end

    # Converts the source image to a TIFF temp file and returns its path.
    def to_tiff
      temp_file = FileHandler.create_temp_file("#{@hash}.tif")
      # Argument-list form: no shell is involved, so paths containing spaces
      # or shell metacharacters are passed through verbatim.
      system(CONVERT_COMMAND, image.to_s, temp_file.to_s)
      temp_file
    end

    # Runs tesseract on image_file and returns the text it produced.
    #
    # Raises Errno::ENOENT (from File.read) when tesseract produced no output.
    def tesseract_translation(image_file)
      # tesseract appends ".txt" to the output base name it is given, so
      # register the real output file for cleanup and strip the extension
      # for the command line.
      text_file = FileHandler.create_temp_file("#{@hash}.txt")
      output_base = text_file.to_s.sub(/\.txt\z/, '')
      config_file = write_configs.to_s

      command = [TESSERACT_COMMAND, image_file.to_s, output_base, '-l', @lang.to_s]
      command << config_file unless config_file.empty?
      # Silence the tool through spawn redirections instead of the bash-only
      # "&>" operator, which plain sh treats as "run in background".
      system(*command, :out => File::NULL, :err => File::NULL)
      File.read(text_file.to_s)
    end

    # Writes the non-language options to a temp config file, one "key value"
    # pair per line. Returns the file path, or '' when there are no options.
    def write_configs
      return '' if @options.empty?
      path = FileHandler.create_temp_file("#{@hash}.config")
      File.open(path.to_s, "w+") do |f|
        @options.each { |k, v| f << "#{k} #{v}\n" }
      end
      path
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Tesseract
  # Verifies that the host can run the OCR pipeline: a Unix-like platform
  # plus the `tesseract` and ImageMagick `convert` programs on the PATH.
  class DependencyChecker
    # Error messages are constants so tests can refer to them.
    IMAGE_MAGICK_ERROR = "ImageMagick \"convert\" command not found! Make sure ImageMagick is installed and in the system path"
    TESSERACT_ERROR = "\"tesseract\" command not found! Make sure tesseract is installed and in the system path"
    OS_ERROR = "Only Unix Based enviroments are supported Mac, Linux, etc."

    # RUBY_PLATFORM fragments accepted as Unix-like. The BSDs and Solaris are
    # included alongside macOS and Linux.
    SUPPORTED_PLATFORMS = /darwin|linux|unix|bsd|solaris|sunos/

    # Runs every check in order: platform, tesseract, ImageMagick.
    #
    # Returns true when all checks pass.
    # Raises Exception with OS_ERROR, TESSERACT_ERROR or IMAGE_MAGICK_ERROR
    # for the first check that fails.
    def self.check!
      check_os!
      check_for_tesseract!
      check_for_imagemagick!
      true
    end

    # The helpers below are internal. A bare `private` does not apply to
    # `def self.` methods, so they remain publicly callable.

    # Runs cmd through the shell and returns its standard output.
    # Kept as a separate method so tests can mock it.
    def self.run_cmd(cmd)
      `#{cmd}`
    end

    # Returns true on a supported platform, raises Exception otherwise.
    # RUBY_PLATFORM is read at call time.
    def self.check_os!
      return true if ::RUBY_PLATFORM =~ SUPPORTED_PLATFORMS
      raise Exception, OS_ERROR
    end

    # Raises Exception when `which convert` prints nothing.
    def self.check_for_imagemagick!
      raise Exception, IMAGE_MAGICK_ERROR if run_cmd('which convert').empty?
    end

    # Raises Exception when `which tesseract` prints nothing.
    def self.check_for_tesseract!
      raise Exception, TESSERACT_ERROR if run_cmd('which tesseract').empty?
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
require 'tempfile'
|
3
|
+
module Tesseract
  # Hands out paths inside the system temp directory and remembers them so
  # they can all be removed later with cleanup!.
  #
  # The list of paths is class-level state shared by every caller.
  class FileHandler
    @tempfiles = []

    # Builds a path for filename inside Dir.tmpdir and records it for later
    # cleanup. The file itself is not created.
    #
    # Returns the path as a Pathname.
    def self.create_temp_file(filename)
      file = Pathname.new(Dir.tmpdir).join(filename)
      @tempfiles << file
      file
    end

    # Deletes every recorded path that exists on disk, then forgets them all
    # so the list does not grow without bound across runs.
    def self.cleanup!
      @tempfiles.each do |file|
        # File.exist? -- the File.exists? alias was removed in Ruby 3.2.
        File.unlink(file.to_s) if File.exist?(file.to_s)
      end
      @tempfiles.clear
    end
  end
end
|
data/test/photo.jpeg
ADDED
Binary file
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'test/unit/assertions'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'shoulda'
|
5
|
+
require 'test_helper'
|
6
|
+
require 'mocha'
|
7
|
+
require '../tesseract/tesseract'
|
8
|
+
|
9
|
+
# Tests for Tesseract::Process and the dependency checks it triggers.
# Uses shoulda (context/should) and mocha (expects).
# NOTE(review): silence_stream is presumably provided by test_helper -- confirm.
class TesseractTest < Test::Unit::TestCase
  TEST_FILE = File.join(File.dirname(__FILE__), 'photo.jpeg')

  context "dependency os check fails windows" do
    setup do
      # Swap RUBY_PLATFORM for the duration of the test; restored in teardown.
      @old_val = RUBY_PLATFORM
      silence_stream(STDERR) { Object.const_set("RUBY_PLATFORM", 'windows') }
    end

    should "throw exception" do
      # assert_raises treats a trailing String as the failure message, so the
      # error text is checked explicitly on the returned exception.
      error = assert_raises(Exception) { Tesseract::Process.new(TEST_FILE) }
      assert_equal Tesseract::DependencyChecker::OS_ERROR, error.message
    end

    teardown do
      silence_stream(STDERR) { Object.const_set("RUBY_PLATFORM", @old_val) }
    end
  end

  context "dependency imagemagic fails" do
    setup do
      Tesseract::DependencyChecker.expects(:run_cmd).with("which tesseract").returns('foo').once
      Tesseract::DependencyChecker.expects(:run_cmd).with("which convert").returns('').once
    end

    should "throw exception" do
      error = assert_raises(Exception) { Tesseract::Process.new(TEST_FILE) }
      assert_equal Tesseract::DependencyChecker::IMAGE_MAGICK_ERROR, error.message
    end
  end

  context "dependency tesseract fails" do
    setup do
      Tesseract::DependencyChecker.expects(:run_cmd).with("which tesseract").returns('').once
    end

    should "throw exception" do
      error = assert_raises(Exception) { Tesseract::Process.new(TEST_FILE) }
      assert_equal Tesseract::DependencyChecker::TESSERACT_ERROR, error.message
    end
  end

  context "tesseract" do
    setup do
      @tess = Tesseract::Process.new(TEST_FILE)
    end

    should "return text" do
      assert !@tess.to_s.empty?
    end

    should "have lang of eng" do
      assert_equal 'eng', @tess.lang
    end
  end

  context "tesseract diff lang" do
    setup do
      @tess = Tesseract::Process.new(TEST_FILE, {:lang => 'butts'})
    end

    should "have lang of butts" do
      assert_equal 'butts', @tess.lang
    end
  end

  context "tesseract configs" do
    setup do
      @tess = Tesseract::Process.new(TEST_FILE, {:chop_enable=>0})
    end

    should "return text" do
      assert !@tess.to_s.empty?
    end

    should "have lang of eng" do
      assert_equal 'eng', @tess.lang
    end
  end
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tesseract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Scott Davis
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-02-22 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: bundler
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 0
|
33
|
+
- 0
|
34
|
+
version: 1.0.0
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id001
|
37
|
+
description: Ruby wrapper for google tesseract
|
38
|
+
email: jetviper21@gmail.com
|
39
|
+
executables: []
|
40
|
+
|
41
|
+
extensions: []
|
42
|
+
|
43
|
+
extra_rdoc_files: []
|
44
|
+
|
45
|
+
files:
|
46
|
+
- Gemfile
|
47
|
+
- Gemfile.lock
|
48
|
+
- Rakefile
|
49
|
+
- Readme.md
|
50
|
+
- tesseract.gemspec
|
51
|
+
- tesseract.rb
|
52
|
+
- tesseract/dependency_checker.rb
|
53
|
+
- tesseract/file_handler.rb
|
54
|
+
- test/photo.jpeg
|
55
|
+
- test/tesseract_test.rb
|
56
|
+
- test/test_helper.rb
|
57
|
+
has_rdoc: true
|
58
|
+
homepage: http://github.com/jetviper21/ruby-tesseract
|
59
|
+
licenses: []
|
60
|
+
|
61
|
+
post_install_message:
|
62
|
+
rdoc_options:
|
63
|
+
- --charset=UTF-8
|
64
|
+
require_paths:
|
65
|
+
- tesseract
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
hash: 23
|
81
|
+
segments:
|
82
|
+
- 1
|
83
|
+
- 3
|
84
|
+
- 6
|
85
|
+
version: 1.3.6
|
86
|
+
requirements: []
|
87
|
+
|
88
|
+
rubyforge_project:
|
89
|
+
rubygems_version: 1.4.2
|
90
|
+
signing_key:
|
91
|
+
specification_version: 3
|
92
|
+
summary: Ruby wrapper for google tesseract
|
93
|
+
test_files: []
|
94
|
+
|