tesseract-ocr 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/tesseract-train.rb
CHANGED
@@ -7,12 +7,8 @@ require 'shellwords'
|
|
7
7
|
options = {}
|
8
8
|
|
9
9
|
OptionParser.new do |o|
|
10
|
-
o.on '-
|
11
|
-
options[:
|
12
|
-
end
|
13
|
-
|
14
|
-
o.on '-i', '--image FILE...', Array, 'the image files to use' do |value|
|
15
|
-
options[:image] = value.map { |path| File.realpath(path) }
|
10
|
+
o.on '-d', '--data DATA...', Array, 'the data to use' do |value|
|
11
|
+
options[:data] = Hash[value.map { |e| e.split(?:).map { |p| File.realpath(p) } }]
|
16
12
|
end
|
17
13
|
|
18
14
|
o.on '-o', '--output FILE', 'the path where to output the traineddata' do |value|
|
@@ -32,12 +28,12 @@ Dir.chdir FileUtils.mkpath(File.join(Dir.tmpdir, rand.to_s)).first
|
|
32
28
|
|
33
29
|
language = language.shellescape
|
34
30
|
|
35
|
-
options[:
|
31
|
+
options[:data].each_with_index {|(box, image), index|
|
36
32
|
%x{
|
37
33
|
cp #{box.shellescape} #{language}.#{index}.box
|
38
|
-
cp #{
|
34
|
+
cp #{image.shellescape} #{language}.#{index}#{File.extname(image)}
|
39
35
|
|
40
|
-
tesseract #{language}#{File.extname(
|
36
|
+
tesseract #{language}.#{index}#{File.extname(image)} #{language} nobatch box.train.stderr
|
41
37
|
|
42
38
|
unicharset_extractor #{language}.box
|
43
39
|
|
@@ -60,8 +56,10 @@ options[:box].each_with_index {|box, index|
|
|
60
56
|
mv #{language}.traineddata #{options[:output].shellescape}
|
61
57
|
}
|
62
58
|
|
59
|
+
=begin
|
63
60
|
path = File.realpath(Dir.pwd)
|
64
61
|
|
65
62
|
Dir.chdir '/'
|
66
63
|
|
67
64
|
FileUtils.rm_rf path
|
65
|
+
=end
|
data/bin/tesseract.rb
CHANGED
@@ -44,6 +44,14 @@ OptionParser.new do |o|
|
|
44
44
|
o.on '-w', '--whitelist LIST', 'whitelist the following chars' do |value|
|
45
45
|
options[:whitelist] = value
|
46
46
|
end
|
47
|
+
|
48
|
+
o.on '-s', '--scale VALUE', Float, 'scale the image before analyzing it' do |value|
|
49
|
+
options[:scale] = value
|
50
|
+
end
|
51
|
+
|
52
|
+
o.on '-r', '--resize VALUE', Float, 'resize the image before analyzing it' do |value|
|
53
|
+
options[:resize] = value
|
54
|
+
end
|
47
55
|
end.parse!
|
48
56
|
|
49
57
|
Tesseract::Engine.new(options[:path], options[:language], options[:mode]) {|engine|
|
@@ -53,11 +61,19 @@ Tesseract::Engine.new(options[:path], options[:language], options[:mode]) {|engi
|
|
53
61
|
engine.page_segmentation_mode = options[:psm] if options[:psm]
|
54
62
|
engine.load_config options[:config] if options[:config]
|
55
63
|
}.tap {|engine|
|
64
|
+
image = if options[:scale]
|
65
|
+
require 'RMagick'; Magick::Image.read(ARGV.first).first.scale(options[:scale])
|
66
|
+
elsif options[:resize]
|
67
|
+
require 'RMagick'; Magick::Image.read(ARGV.first).first.resize(options[:resize])
|
68
|
+
else
|
69
|
+
ARGV.first
|
70
|
+
end
|
71
|
+
|
56
72
|
if options[:unlv]
|
57
|
-
puts engine.text_for(
|
73
|
+
puts engine.text_for(image).unlv.strip
|
58
74
|
elsif options[:confidence]
|
59
|
-
puts engine.text_for(
|
75
|
+
puts engine.text_for(image).confidence
|
60
76
|
else
|
61
|
-
puts engine.text_for(
|
77
|
+
puts engine.text_for(image).strip
|
62
78
|
end
|
63
79
|
}
|
Binary file
|
data/lib/tesseract-ocr.rb
CHANGED
data/lib/tesseract/version.rb
CHANGED
data/tesseract-ocr.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tesseract-ocr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-02-15 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: call-me
|
16
|
-
requirement: &
|
16
|
+
requirement: &19410320 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *19410320
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: iso-639
|
27
|
-
requirement: &
|
27
|
+
requirement: &19408960 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *19408960
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ffi-extra
|
38
|
-
requirement: &
|
38
|
+
requirement: &19407840 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *19407840
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
|
-
name: ffi-
|
49
|
-
requirement: &
|
48
|
+
name: ffi-inline
|
49
|
+
requirement: &19407000 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *19407000
|
58
58
|
description:
|
59
59
|
email: meh@paranoici.org
|
60
60
|
executables:
|
@@ -220,7 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
220
220
|
version: '0'
|
221
221
|
requirements: []
|
222
222
|
rubyforge_project:
|
223
|
-
rubygems_version: 1.8.
|
223
|
+
rubygems_version: 1.8.15
|
224
224
|
signing_key:
|
225
225
|
specification_version: 3
|
226
226
|
summary: A wrapper library to the tesseract-ocr API.
|