rtesseract 2.2.0 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.document +1 -2
- data/.gitignore +12 -0
- data/.rspec +2 -0
- data/.travis.yml +13 -10
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -17
- data/Gemfile.lock +40 -85
- data/LICENSE.txt +18 -17
- data/README.md +137 -0
- data/Rakefile +4 -48
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/rtesseract.rb +22 -220
- data/lib/rtesseract/box.rb +15 -60
- data/lib/rtesseract/check.rb +14 -0
- data/lib/rtesseract/command.rb +41 -0
- data/lib/rtesseract/configuration.rb +15 -64
- data/lib/rtesseract/pdf.rb +18 -0
- data/lib/rtesseract/text.rb +9 -0
- data/lib/rtesseract/tsv.rb +18 -0
- data/lib/rtesseract/version.rb +3 -0
- data/rtesseract.gemspec +27 -98
- metadata +36 -85
- data/README.rdoc +0 -156
- data/VERSION +0 -1
- data/lib/processors/mini_magick.rb +0 -43
- data/lib/processors/none.rb +0 -34
- data/lib/processors/rmagick.rb +0 -46
- data/lib/rtesseract/blob.rb +0 -34
- data/lib/rtesseract/box_char.rb +0 -31
- data/lib/rtesseract/errors.rb +0 -21
- data/lib/rtesseract/mixed.rb +0 -54
- data/lib/rtesseract/processor.rb +0 -19
- data/lib/rtesseract/utils.rb +0 -44
- data/lib/rtesseract/uzn.rb +0 -47
- data/spec/configs/eng.user-words.txt +0 -13
- data/spec/images/README.pdf +0 -0
- data/spec/images/blank.tif +0 -0
- data/spec/images/mixed.tif +0 -0
- data/spec/images/orientation_reverse.png +0 -0
- data/spec/images/test with spaces.tif +0 -0
- data/spec/images/test-pdf.png +0 -0
- data/spec/images/test.bmp +0 -0
- data/spec/images/test.jpg +0 -0
- data/spec/images/test.png +0 -0
- data/spec/images/test.tif +0 -0
- data/spec/images/test1.tif +0 -0
- data/spec/images/test_words.png +0 -0
- data/spec/rtesseract_box_char_spec.rb +0 -82
- data/spec/rtesseract_box_spec.rb +0 -36
- data/spec/rtesseract_mixed_spec.rb +0 -49
- data/spec/rtesseract_spec.rb +0 -282
- data/spec/rtesseract_uzn_spec.rb +0 -56
- data/spec/spec_helper.rb +0 -21
data/README.rdoc
DELETED
@@ -1,156 +0,0 @@
|
|
1
|
-
= rtesseract
|
2
|
-
{<img src="https://badge.fury.io/rb/rtesseract.png" alt="Gem Version" />}[http://badge.fury.io/rb/rtesseract]
|
3
|
-
{<img src="https://travis-ci.org/dannnylo/rtesseract.png?branch=master" alt="Build Status" />}[https://travis-ci.org/dannnylo/rtesseract]
|
4
|
-
{<img src="https://coveralls.io/repos/dannnylo/rtesseract/badge.png?branch=master" alt="Coverage Status" />}[https://coveralls.io/r/dannnylo/rtesseract?branch=master]
|
5
|
-
{<img src="https://codeclimate.com/github/dannnylo/rtesseract.png" />}[https://codeclimate.com/github/dannnylo/rtesseract]
|
6
|
-
|
7
|
-
|
8
|
-
Ruby library for working with the Tesseract OCR.
|
9
|
-
|
10
|
-
== REQUIREMENTS:
|
11
|
-
|
12
|
-
For rtesseract to work properly, the following are needed:
|
13
|
-
* Tesseract - Program
|
14
|
-
* ImageMagick - Program
|
15
|
-
* RMagick or mini_magick - Gem
|
16
|
-
|
17
|
-
Attention: Version 1.0.0 works fine with Ruby 2.0 and tesseract 3.0; lower versions of rtesseract work fine with Ruby 1.8 and tesseract 2.0.4.
|
18
|
-
|
19
|
-
PDF support requires a newer version of tesseract, specifically V.3.03 or above.
|
20
|
-
|
21
|
-
== EXAMPLE USAGE
|
22
|
-
|
23
|
-
It's very simple to use rtesseract:
|
24
|
-
|
25
|
-
=== CONVERT IMAGE TO STRING
|
26
|
-
|
27
|
-
image = RTesseract.new("my_image.jpg")
|
28
|
-
image.to_s # Getting the value
|
29
|
-
|
30
|
-
=== CONVERT IMAGE TO SEARCHABLE PDF
|
31
|
-
|
32
|
-
image = RTesseract.new("my_image.jpg")
|
33
|
-
image.to_pdf # Getting the pdf path
|
34
|
-
image.to_s # Still can get the value only.
|
35
|
-
# ...
|
36
|
-
# some stuff
|
37
|
-
# ...
|
38
|
-
image.clean # to delete file once finished
|
39
|
-
|
40
|
-
This will preserve the image colors, pictures and structure in the generated pdf.
|
41
|
-
|
42
|
-
=== CHANGE THE IMAGE
|
43
|
-
|
44
|
-
image = RTesseract.new("my_image.jpg")
|
45
|
-
image.source = "new_image.png"
|
46
|
-
image.to_s
|
47
|
-
|
48
|
-
=== TRANSFORM THE IMAGE
|
49
|
-
|
50
|
-
image = RTesseract.read("my_image.jpg") do |img|
|
51
|
-
img = img.white_threshold(245)
|
52
|
-
img = img.quantize(256,Magick::GRAYColorspace)
|
53
|
-
end
|
54
|
-
image.to_s
|
55
|
-
|
56
|
-
=== CONVERT PARTS OF IMAGE TO STRING
|
57
|
-
|
58
|
-
mix_block = RTesseract::Mixed.new("test.jpg") do |image|
|
59
|
-
image.area(:x => 28, :y => 19, :w => 25, :h => 25)
|
60
|
-
image.area(:x => 180, :y => 22, :w => 20, :h => 28)
|
61
|
-
image.area(:x => 218, :y => 22, :w => 24, :h => 28)
|
62
|
-
image.area(:x => 248, :y => 24, :w => 22, :h => 22)
|
63
|
-
end
|
64
|
-
mix_block.to_s
|
65
|
-
|
66
|
-
OR
|
67
|
-
|
68
|
-
mix_block = RTesseract::Mixed.new("test.jpg",{:areas => [
|
69
|
-
{:x => 28, :y=>19, :w=>25, :h=>25 },
|
70
|
-
{:x => 180, :y=>22, :w=>20, :h=>28},
|
71
|
-
{:x => 218, :y=>22, :w=>24, :h=>28},
|
72
|
-
{:x => 248, :y=>24, :w=>22, :h=>22}
|
73
|
-
]})
|
74
|
-
mix_block.to_s
|
75
|
-
|
76
|
-
=== OPTIONS
|
77
|
-
|
78
|
-
Processors Options (_Rmagick_ is default)
|
79
|
-
|
80
|
-
RTesseract.new("test.jpg", :processor => "mini_magick")
|
81
|
-
|
82
|
-
Note: To skip image processing, use NoneProcessor
|
83
|
-
|
84
|
-
RTesseract.new("test.jpg", :processor => "none")
|
85
|
-
|
86
|
-
Or you can configure the default processor first:
|
87
|
-
|
88
|
-
RTesseract.configure do |config|
|
89
|
-
config.processor = "mini_magick"
|
90
|
-
end
|
91
|
-
|
92
|
-
RTesseract.new("test.jpg") # It will use mini_magick by default
|
93
|
-
|
94
|
-
Language Options
|
95
|
-
|
96
|
-
RTesseract.new("test.jpg", :lang => "deu")
|
97
|
-
* eng - English
|
98
|
-
* deu - German
|
99
|
-
* deu-f - German fraktur
|
100
|
-
* fra - French
|
101
|
-
* ita - Italian
|
102
|
-
* nld - Dutch
|
103
|
-
* por - Portuguese
|
104
|
-
* spa - Spanish
|
105
|
-
* vie - Vietnamese
|
106
|
-
* or any other supported by tesseract.
|
107
|
-
Note: Make sure you have installed the language to tesseract
|
108
|
-
|
109
|
-
Other Options
|
110
|
-
|
111
|
-
RTesseract.new("test.jpg", options: :digits) # Only digit recognition
|
112
|
-
|
113
|
-
OR
|
114
|
-
|
115
|
-
RTesseract.new("test.jpg", options: [:digits, :quiet])
|
116
|
-
|
117
|
-
=== BOUNDING BOX: TO GET WORDS WITH THEIR POSITIONS
|
118
|
-
|
119
|
-
RTesseract::Box.new('test_words.png').words
|
120
|
-
# => [
|
121
|
-
# {:word => 'If', :x_start=>52, :y_start=>13, :x_end=>63, :y_end=>27},
|
122
|
-
# {:word => 'you', :x_start=>69, :y_start=>17, :x_end=>100, :y_end=>31},
|
123
|
-
# {:word => 'are', :x_start=>108, :y_start=>17, :x_end=>136, :y_end=>27},
|
124
|
-
# {:word => 'a', :x_start=>143, :y_start=>17, :x_end=>151, :y_end=>27},
|
125
|
-
# {:word => 'friend,', :x_start=>158, :y_start=>13, :x_end=>214, :y_end=>29},
|
126
|
-
# {:word => 'you', :x_start=>51, :y_start=>39, :x_end=>82, :y_end=>53},
|
127
|
-
# {:word => 'speak', :x_start=>90, :y_start=>35, :x_end=>140, :y_end=>53},
|
128
|
-
# {:word => 'the', :x_start=>146, :y_start=>35, :x_end=>174, :y_end=>49},
|
129
|
-
# {:word => 'password,', :x_start=>182, :y_start=>35, :x_end=>267, :y_end=>53},
|
130
|
-
# {:word => 'and', :x_start=>51, :y_start=>57, :x_end=>81, :y_end=>71},
|
131
|
-
# {:word => 'the', :x_start=>89, :y_start=>57, :x_end=>117, :y_end=>71},
|
132
|
-
# {:word => 'doors', :x_start=>124, :y_start=>57, :x_end=>172, :y_end=>71},
|
133
|
-
# {:word => 'will', :x_start=>180, :y_start=>57, :x_end=>208, :y_end=>71},
|
134
|
-
# {:word => 'open.', :x_start=>216, :y_start=>61, :x_end=>263, :y_end=>75}
|
135
|
-
# ]
|
136
|
-
|
137
|
-
== Contributing to rtesseract
|
138
|
-
|
139
|
-
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
140
|
-
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
|
141
|
-
* Fork the project.
|
142
|
-
* Start a feature/bugfix branch.
|
143
|
-
* Commit and push until you are happy with your contribution.
|
144
|
-
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
145
|
-
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
|
146
|
-
|
147
|
-
|
148
|
-
== Links
|
149
|
-
|
150
|
-
* Github - http://github.com/dannnylo/rtesseract
|
151
|
-
* Rubygems - http://rubygems.org/gems/rtesseract
|
152
|
-
|
153
|
-
== Copyright
|
154
|
-
|
155
|
-
Copyright (c) 2014 Danilo Jeremias da Silva. See LICENSE.txt for
|
156
|
-
further details.
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
2.2.0
|
data/lib/processors/mini_magick.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
# RTesseract class
|
3
|
-
class RTesseract
|
4
|
-
# Processor Module
|
5
|
-
module Processor
|
6
|
-
# Add to rtesseract a image manipulation with MiniMagick
|
7
|
-
module MiniMagickProcessor
|
8
|
-
# Setup Processor
|
9
|
-
def self.setup
|
10
|
-
require 'mini_magick'
|
11
|
-
end
|
12
|
-
|
13
|
-
# Check if is this Processor
|
14
|
-
def self.a_name?(name)
|
15
|
-
%w(mini_magick MiniMagickProcessor).include?(name.to_s)
|
16
|
-
end
|
17
|
-
|
18
|
-
# Convert Image to Tiff
|
19
|
-
def self.image_to_tif(source, points = {})
|
20
|
-
tmp_file = Tempfile.new(['', '.tif'])
|
21
|
-
cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
|
22
|
-
cat.format('tif') do |c|
|
23
|
-
c.compress 'None'
|
24
|
-
c.alpha 'off' if MiniMagick.cli != :graphicsmagick
|
25
|
-
end
|
26
|
-
cat.crop("#{points[:w]}x#{points[:h]}+#{points[:x]}+#{points[:y]}") if points.is_a?(Hash) && points.values.compact != []
|
27
|
-
cat.alpha 'off' if MiniMagick.cli != :graphicsmagick
|
28
|
-
cat.write tmp_file.path.to_s
|
29
|
-
tmp_file
|
30
|
-
end
|
31
|
-
|
32
|
-
# Cast instance of image
|
33
|
-
def self.read_with_processor(path)
|
34
|
-
MiniMagick::Image.open(path.to_s)
|
35
|
-
end
|
36
|
-
|
37
|
-
# Check if is a MiniMagick image
|
38
|
-
def self.image?(object)
|
39
|
-
object.class == MiniMagick::Image
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
data/lib/processors/none.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
# RTesseract class
|
3
|
-
class RTesseract
|
4
|
-
# Processor Module
|
5
|
-
module Processor
|
6
|
-
# Add to rtesseract a image without manipulation
|
7
|
-
module NoneProcessor
|
8
|
-
# Setup Processor
|
9
|
-
def self.setup
|
10
|
-
end
|
11
|
-
|
12
|
-
# Check if is this Processor
|
13
|
-
def self.a_name?(name)
|
14
|
-
%w(none NoneProcessor).include?(name.to_s)
|
15
|
-
end
|
16
|
-
|
17
|
-
# Convert Image to Tiff
|
18
|
-
def self.image_to_tif(source, _points = {})
|
19
|
-
tmp_file = Tempfile.new(['', '.tif'])
|
20
|
-
tmp_file.write(read_with_processor(source))
|
21
|
-
tmp_file
|
22
|
-
end
|
23
|
-
|
24
|
-
# Cast instance of image
|
25
|
-
def self.read_with_processor(path)
|
26
|
-
File.read(path)
|
27
|
-
end
|
28
|
-
|
29
|
-
# Check if is a image
|
30
|
-
def self.image?(*)
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
data/lib/processors/rmagick.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
# RTesseract class
|
3
|
-
class RTesseract
|
4
|
-
# Processor Module
|
5
|
-
module Processor
|
6
|
-
# Add to rtesseract a image manipulation with RMagick
|
7
|
-
module RMagickProcessor
|
8
|
-
# Setup Processor
|
9
|
-
def self.setup
|
10
|
-
require 'rmagick'
|
11
|
-
rescue LoadError
|
12
|
-
# :nocov:
|
13
|
-
require 'RMagick'
|
14
|
-
# :nocov:
|
15
|
-
end
|
16
|
-
|
17
|
-
# Check if is this Processor
|
18
|
-
def self.a_name?(name)
|
19
|
-
%w(rmagick RMagickProcessor).include?(name.to_s)
|
20
|
-
end
|
21
|
-
|
22
|
-
# Convert Image to Tiff
|
23
|
-
def self.image_to_tif(source, points = {})
|
24
|
-
tmp_file = Tempfile.new(['', '.tif'])
|
25
|
-
cat = source.is_a?(Pathname) ? read_with_processor(source.to_s) : source
|
26
|
-
cat.crop!(points[:x], points[:y], points[:w], points[:h]) if points.is_a?(Hash) && points.values.compact != []
|
27
|
-
cat.alpha Magick::DeactivateAlphaChannel
|
28
|
-
cat.write(tmp_file.path.to_s) do
|
29
|
-
# self.depth = 16
|
30
|
-
self.compression = Magick::NoCompression
|
31
|
-
end
|
32
|
-
tmp_file
|
33
|
-
end
|
34
|
-
|
35
|
-
# Cast instance of image
|
36
|
-
def self.read_with_processor(path)
|
37
|
-
Magick::Image.read(path.to_s).first
|
38
|
-
end
|
39
|
-
|
40
|
-
# Check if is a RMagick image
|
41
|
-
def self.image?(object)
|
42
|
-
object.class == Magick::Image
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
data/lib/rtesseract/blob.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
# Blob methods
|
2
|
-
class RTesseract
|
3
|
-
# Read image from memory blob
|
4
|
-
def self.read(src = nil, options = {})
|
5
|
-
fail RTesseract::ImageNotSelectedError if src.nil?
|
6
|
-
processor = RTesseract::Processor.choose_processor!(options[:processor])
|
7
|
-
image = processor.read_with_processor(src.to_s)
|
8
|
-
yield(image)
|
9
|
-
object = RTesseract.new('', options).from_blob(image.to_blob)
|
10
|
-
object
|
11
|
-
end
|
12
|
-
|
13
|
-
# Read image from memory blob
|
14
|
-
def read
|
15
|
-
image = @processor.read_with_processor(@source.to_s)
|
16
|
-
new_image = yield(image)
|
17
|
-
from_blob(new_image.to_blob, File.extname(@source.to_s))
|
18
|
-
self
|
19
|
-
end
|
20
|
-
|
21
|
-
# Read image from memory blob
|
22
|
-
def from_blob(blob, ext = '')
|
23
|
-
blob_file = Tempfile.new(['blob', ext], encoding: 'ascii-8bit')
|
24
|
-
blob_file.binmode.write(blob)
|
25
|
-
blob_file.rewind
|
26
|
-
blob_file.flush
|
27
|
-
self.source = blob_file.path
|
28
|
-
convert
|
29
|
-
RTesseract::Utils.remove_files([blob_file])
|
30
|
-
self
|
31
|
-
rescue => error
|
32
|
-
raise RTesseract::ConversionError.new(error), error, caller
|
33
|
-
end
|
34
|
-
end
|
data/lib/rtesseract/box_char.rb
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
# RTesseract
|
3
|
-
class RTesseract
|
4
|
-
# Class to read char positions from an image
|
5
|
-
class BoxChar < Box
|
6
|
-
def config_hook
|
7
|
-
@options['tessedit_create_boxfile'] = 1 # Split chars
|
8
|
-
end
|
9
|
-
|
10
|
-
alias_method :characters, :words
|
11
|
-
|
12
|
-
# Extension of file
|
13
|
-
def file_ext
|
14
|
-
'.box'
|
15
|
-
end
|
16
|
-
|
17
|
-
# Read the result file
|
18
|
-
def parse_file
|
19
|
-
File.read(file_with_ext).to_s
|
20
|
-
end
|
21
|
-
|
22
|
-
def convert_text
|
23
|
-
text_objects = []
|
24
|
-
parse_file.each_line do |line|
|
25
|
-
char, x_start, y_start, x_end, y_end, _word = line.split(' ')
|
26
|
-
text_objects << { char: char, x_start: x_start.to_i, y_start: y_start.to_i, x_end: x_end.to_i, y_end: y_end.to_i }
|
27
|
-
end
|
28
|
-
@value = text_objects
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
data/lib/rtesseract/errors.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
# RTesseract
|
2
|
-
class RTesseract
|
3
|
-
# Class of error with storage of normal errors
|
4
|
-
class ErrorWithMemory < StandardError
|
5
|
-
attr_accessor :old_error
|
6
|
-
|
7
|
-
def initialize(stored_error = nil)
|
8
|
-
@old_error = stored_error
|
9
|
-
end
|
10
|
-
end
|
11
|
-
|
12
|
-
class ConversionError < ErrorWithMemory; end
|
13
|
-
class ImageNotSelectedError < ErrorWithMemory; end
|
14
|
-
class TempFilesNotRemovedError < ErrorWithMemory; end
|
15
|
-
|
16
|
-
class TesseractVersionError < StandardError
|
17
|
-
def initialize
|
18
|
-
super "Tesseract version is unknown or below 3.03 which is required for pdf output."
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
data/lib/rtesseract/mixed.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
# RTesseract
|
3
|
-
class RTesseract
|
4
|
-
# Class to read an image from specified areas
|
5
|
-
class Mixed
|
6
|
-
attr_reader :areas
|
7
|
-
|
8
|
-
def initialize(src = '', options = {})
|
9
|
-
@source = Pathname.new src
|
10
|
-
@options = options
|
11
|
-
@value = ''
|
12
|
-
@areas = options.delete(:areas) || []
|
13
|
-
yield self if block_given?
|
14
|
-
end
|
15
|
-
|
16
|
-
# Add areas
|
17
|
-
def area(points)
|
18
|
-
@value = ''
|
19
|
-
@areas << points
|
20
|
-
end
|
21
|
-
|
22
|
-
# Clear areas
|
23
|
-
def clear_areas
|
24
|
-
@areas = []
|
25
|
-
end
|
26
|
-
|
27
|
-
# Convert parts of image to string
|
28
|
-
def convert
|
29
|
-
@value = []
|
30
|
-
@areas.each_with_object(RTesseract.new(@source.to_s, @options.dup)) do |area, image|
|
31
|
-
image.crop!(area)
|
32
|
-
@value << image.to_s
|
33
|
-
end
|
34
|
-
rescue => error
|
35
|
-
raise RTesseract::ConversionError.new(error), error, caller
|
36
|
-
end
|
37
|
-
|
38
|
-
# Output value
|
39
|
-
def to_s
|
40
|
-
return @value if @value != ''
|
41
|
-
if @source.file?
|
42
|
-
convert
|
43
|
-
@value.join
|
44
|
-
else
|
45
|
-
fail RTesseract::ImageNotSelectedError.new(@source)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
# Remove spaces and break-lines
|
50
|
-
def to_s_without_spaces
|
51
|
-
to_s.delete(' ').delete("\n").delete("\r")
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
data/lib/rtesseract/processor.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# RTesseract
|
2
|
-
class RTesseract
|
3
|
-
# Processor managment
|
4
|
-
module Processor
|
5
|
-
# Return the processor
|
6
|
-
def self.choose_processor!(processor)
|
7
|
-
processor =
|
8
|
-
if RTesseract::Processor::MiniMagickProcessor.a_name?(processor.to_s)
|
9
|
-
MiniMagickProcessor
|
10
|
-
elsif RTesseract::Processor::NoneProcessor.a_name?(processor.to_s)
|
11
|
-
NoneProcessor
|
12
|
-
else
|
13
|
-
RMagickProcessor
|
14
|
-
end
|
15
|
-
processor.setup
|
16
|
-
processor
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|