rtesseract 2.2.0 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.document +1 -2
- data/.gitignore +12 -0
- data/.rspec +2 -0
- data/.travis.yml +13 -10
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -17
- data/Gemfile.lock +40 -85
- data/LICENSE.txt +18 -17
- data/README.md +137 -0
- data/Rakefile +4 -48
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/rtesseract.rb +22 -220
- data/lib/rtesseract/box.rb +15 -60
- data/lib/rtesseract/check.rb +14 -0
- data/lib/rtesseract/command.rb +41 -0
- data/lib/rtesseract/configuration.rb +15 -64
- data/lib/rtesseract/pdf.rb +18 -0
- data/lib/rtesseract/text.rb +9 -0
- data/lib/rtesseract/tsv.rb +18 -0
- data/lib/rtesseract/version.rb +3 -0
- data/rtesseract.gemspec +27 -98
- metadata +36 -85
- data/README.rdoc +0 -156
- data/VERSION +0 -1
- data/lib/processors/mini_magick.rb +0 -43
- data/lib/processors/none.rb +0 -34
- data/lib/processors/rmagick.rb +0 -46
- data/lib/rtesseract/blob.rb +0 -34
- data/lib/rtesseract/box_char.rb +0 -31
- data/lib/rtesseract/errors.rb +0 -21
- data/lib/rtesseract/mixed.rb +0 -54
- data/lib/rtesseract/processor.rb +0 -19
- data/lib/rtesseract/utils.rb +0 -44
- data/lib/rtesseract/uzn.rb +0 -47
- data/spec/configs/eng.user-words.txt +0 -13
- data/spec/images/README.pdf +0 -0
- data/spec/images/blank.tif +0 -0
- data/spec/images/mixed.tif +0 -0
- data/spec/images/orientation_reverse.png +0 -0
- data/spec/images/test with spaces.tif +0 -0
- data/spec/images/test-pdf.png +0 -0
- data/spec/images/test.bmp +0 -0
- data/spec/images/test.jpg +0 -0
- data/spec/images/test.png +0 -0
- data/spec/images/test.tif +0 -0
- data/spec/images/test1.tif +0 -0
- data/spec/images/test_words.png +0 -0
- data/spec/rtesseract_box_char_spec.rb +0 -82
- data/spec/rtesseract_box_spec.rb +0 -36
- data/spec/rtesseract_mixed_spec.rb +0 -49
- data/spec/rtesseract_spec.rb +0 -282
- data/spec/rtesseract_uzn_spec.rb +0 -56
- data/spec/spec_helper.rb +0 -21
data/lib/rtesseract/utils.rb
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
require 'open3'
|
2
|
-
|
3
|
-
# RTesseract
|
4
|
-
class RTesseract
  # Small helpers for cleaning up files and probing the tesseract binary.
  module Utils
    # Remove every given file or Tempfile.
    #
    # A single item (path String, Pathname, File or Tempfile) is accepted as
    # well as a collection. File and Tempfile respond to #each by iterating
    # their lines, so they must be wrapped instead of enumerated; otherwise
    # nothing would be deleted.
    #
    # @param files [Enumerable, String, Pathname, File, Tempfile] items to delete
    # @return [true] when every item was removed
    # @raise [RTesseract::TempFilesNotRemovedError] when any removal fails
    def self.remove_files(files = [])
      file_list(files).each { |file| remove_file(file) }
      true
    rescue => error
      raise RTesseract::TempFilesNotRemovedError.new(error: error, files: files)
    end

    # Remove a single file or Tempfile.
    #
    # @param file [String, Pathname, File, Tempfile] item to delete
    # @return [true]
    def self.remove_file(file)
      if file.is_a?(Tempfile)
        file.close
        file.unlink
      else
        # Release the descriptor of an open File before deleting its path.
        file.close if file.is_a?(IO) && !file.closed?
        File.unlink(file)
      end
      true
    end

    # Extract the tesseract version number as "major.minor".
    #
    # Runs `RTesseract.default_command --version` and parses the first line of
    # the combined stdout/stderr output. Errors raised while spawning the
    # command are not rescued here.
    #
    # @return [Float, nil] e.g. 3.04, or nil when no version can be parsed
    def self.version_number
      output, = Open3.capture2e(RTesseract.default_command, '--version')
      parse_version(output)
    end

    # Normalise the argument of remove_files into something safe to iterate.
    def self.file_list(files)
      return files if files.is_a?(Array)

      single = files.is_a?(Tempfile) || files.is_a?(IO) || !files.respond_to?(:each)
      single ? [files] : files
    end
    private_class_method :file_list

    # Pull "major.minor" out of the first line of `--version` output.
    def self.parse_version(output)
      first_line = output.to_s.lines.first.to_s
      version = first_line[/\d+(?:\.\d+)?/]
      version && Float(version)
    end
    private_class_method :parse_version
  end
end
|
37
|
-
|
38
|
-
# Hash
|
39
|
-
class Hash
  # Remove an option from the hash and return its value.
  #
  # Both the String and the Symbol form of the key are always deleted, so no
  # stale duplicate is left behind. The String key wins when both are set.
  # An explicit `false` is returned as-is; only a missing or nil value falls
  # back to the default.
  #
  # @param attr_name [String, Symbol] option name
  # @param default [Object] value returned when the option is absent or nil
  # @return [Object] the stored value or the default
  def option(attr_name, default)
    from_string = delete(attr_name.to_s)
    from_symbol = delete(attr_name)
    value = from_string.nil? ? from_symbol : from_string
    value.nil? ? default : value
  end
end
|
data/lib/rtesseract/uzn.rb
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
# RTesseract
|
3
|
-
class RTesseract
  # Alternative approach to Mixed when you want to read from specific areas.
  # Requires `-psm 4` which means the text must be "a single column of text of variable sizes".
  class Uzn < RTesseract
    attr_reader :areas
    DEFAULT_ALPHABET = 'Text/Latin'

    # @param src [Object] image source, forwarded unchanged to RTesseract#initialize
    # @param options [Hash] RTesseract options plus :areas (Array of area
    #   hashes) and :alphabet (String written on every zone line)
    # @yield [self] optionally, to add areas with #area
    def initialize(src = '', options = {})
      # Work on a copy so the caller's hash keeps its :areas / :alphabet keys.
      options = options.dup
      @areas = options.delete(:areas) || []
      @alphabet = options.delete(:alphabet) || DEFAULT_ALPHABET
      super(src, options.merge(psm: 4))
      yield self if block_given?
    end

    # Add areas
    #
    # @param points [Hash] area with :x, :y, :w and :h keys
    # @return [Array] the updated list of areas
    def area(points)
      areas << points
    end

    # Write the zone file next to the image, then run tesseract through the shell.
    #
    # NOTE(review): the image and destination paths are interpolated into a
    # shell command inside plain double quotes; a path containing `"`, `$` or
    # backticks would be interpreted by the shell. Confirm inputs are trusted.
    def convert_command
      @image = image
      write_uzn_file
      `#{configuration.command} "#{@image}" "#{file_dest}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}`
    end

    # Delete the zone file written by #convert_command.
    def after_convert_hook
      RTesseract::Utils.remove_files([@uzn_file])
    end

    private

    # Write one "x y w h alphabet" line per area into "<image basename>.uzn"
    # in the image's folder. The handle is closed once writing is done (which
    # also flushes it), so no descriptor is leaked; @uzn_file still carries
    # the path used for cleanup.
    def write_uzn_file
      folder = File.dirname(@image)
      basename = File.basename(@image, '.tif')
      @uzn_file = File.new("#{folder}/#{basename}.uzn", File::CREAT | File::TRUNC | File::RDWR)

      areas.each do |points|
        @uzn_file.write("#{points[:x]} #{points[:y]} #{points[:w]} #{points[:h]} #{@alphabet}\n")
      end
    ensure
      @uzn_file.close if @uzn_file && !@uzn_file.closed?
    end
  end
end
|
data/spec/images/README.pdf
DELETED
Binary file
|
data/spec/images/blank.tif
DELETED
Binary file
|
data/spec/images/mixed.tif
DELETED
Binary file
|
Binary file
|
Binary file
|
data/spec/images/test-pdf.png
DELETED
Binary file
|
data/spec/images/test.bmp
DELETED
Binary file
|
data/spec/images/test.jpg
DELETED
Binary file
|
data/spec/images/test.png
DELETED
Binary file
|
data/spec/images/test.tif
DELETED
Binary file
|
data/spec/images/test1.tif
DELETED
Binary file
|
data/spec/images/test_words.png
DELETED
Binary file
|
@@ -1,82 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
3
|
-
|
4
|
-
describe 'Rtesseract::BoxChar' do
  # Directory containing this spec; fixtures live under its images/ folder.
  let(:spec_dir) { Pathname.new(__FILE__.gsub('rtesseract_box_char_spec.rb', '')).expand_path }
  let(:image_tiff) { spec_dir.join('images', 'test.tif').to_s }
  let(:words_image) { spec_dir.join('images', 'test_words.png').to_s }

  # Expand compact [char, x_start, y_start, x_end, y_end] rows into the
  # hashes compared against BoxChar#characters.
  def char_boxes(rows)
    rows.map do |char, x_start, y_start, x_end, y_end|
      { char: char, x_start: x_start, y_start: y_start, x_end: x_end, y_end: y_end }
    end
  end

  # Expected characters for images/test_words.png.
  let(:words_characters) do
    char_boxes([
      ['I', 52, 91, 54, 104],
      ['f', 56, 91, 63, 105],
      ['y', 69, 87, 79, 101],
      ['o', 80, 91, 90, 101],
      ['u', 92, 91, 100, 101],
      ['a', 108, 91, 116, 101],
      ['r', 119, 91, 125, 101],
      ['e', 126, 91, 136, 101],
      ['a', 143, 91, 151, 101],
      ['f', 158, 91, 165, 105],
      ['r', 166, 91, 172, 101],
      ['i', 174, 91, 176, 105],
      ['e', 178, 91, 188, 101],
      ['n', 190, 91, 198, 101],
      ['d', 200, 91, 209, 105],
      [',', 211, 89, 214, 93],
      ['y', 51, 65, 61, 79],
      ['o', 62, 69, 72, 79],
      ['u', 74, 69, 82, 79],
      ['s', 90, 69, 97, 79],
      ['p', 99, 65, 108, 79],
      ['e', 109, 69, 119, 79],
      ['a', 120, 69, 128, 79],
      ['k', 131, 69, 140, 83],
      ['t', 146, 69, 152, 82],
      ['h', 154, 69, 162, 83],
      ['e', 164, 69, 174, 79],
      ['p', 182, 65, 191, 79],
      ['a', 192, 69, 200, 79],
      ['s', 202, 69, 209, 79],
      ['s', 210, 69, 217, 79],
      ['w', 219, 69, 232, 79],
      ['o', 234, 69, 244, 79],
      ['r', 246, 69, 252, 79],
      ['d', 253, 69, 262, 83],
      [',', 264, 67, 267, 71],
      ['a', 51, 47, 59, 57],
      ['n', 62, 47, 70, 57],
      ['d', 72, 47, 81, 61],
      ['t', 89, 47, 95, 60],
      ['h', 97, 47, 105, 61],
      ['e', 107, 47, 117, 57],
      ['d', 124, 47, 133, 61],
      ['o', 135, 47, 145, 57],
      ['o', 146, 47, 156, 57],
      ['r', 158, 47, 164, 57],
      ['s', 165, 47, 172, 57],
      ['w', 180, 47, 193, 57],
      ['i', 196, 47, 198, 61],
      ['l', 201, 47, 203, 61],
      ['l', 206, 47, 208, 61],
      ['o', 216, 47, 226, 57],
      ['p', 228, 43, 237, 57],
      ['e', 238, 47, 248, 57],
      ['n', 250, 47, 258, 57],
      ['.', 261, 47, 263, 49]
    ])
  end

  # Expected characters for images/test.tif.
  let(:tiff_characters) do
    char_boxes([
      ['4', 145, 14, 159, 33],
      ['3', 184, 14, 196, 33],
      ['X', 222, 14, 238, 32],
      ['F', 260, 14, 273, 32]
    ])
  end

  it 'bounding box by char' do
    expect(RTesseract::BoxChar.new(image_tiff).characters.is_a?(Array)).to eql(true)
    expect(RTesseract::BoxChar.new(image_tiff).characters).to eql(tiff_characters)

    expect(RTesseract::BoxChar.new(words_image).characters).to eql(words_characters)

    # An unknown command and a missing image must surface as library errors.
    expect { RTesseract::BoxChar.new(image_tiff, command: 'tesseract_error').to_s }.to raise_error(RTesseract::ConversionError)
    expect { RTesseract::BoxChar.new(image_tiff + '_not_exist').to_s }.to raise_error(RTesseract::ImageNotSelectedError)
    # expect(RTesseract::BoxChar.new(spec_dir.join('images', 'blank.tif').to_s, options: :digits).characters).to eql([])
  end
end
|
data/spec/rtesseract_box_spec.rb
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
3
|
-
|
4
|
-
describe 'Rtesseract::Box' do
  # Directory containing this spec; fixtures live under its images/ folder.
  let(:spec_dir) { Pathname.new(__FILE__.gsub('rtesseract_box_spec.rb', '')).expand_path }
  let(:image_tiff) { spec_dir.join('images', 'test.tif').to_s }
  let(:words_image) { spec_dir.join('images', 'test_words.png').to_s }

  # Expected words for images/test_words.png, expanded from compact
  # [word, x_start, y_start, x_end, y_end] rows.
  let(:expected_words) do
    [
      ['If', 52, 13, 63, 27],
      ['you', 69, 17, 100, 31],
      ['are', 108, 17, 136, 27],
      ['a', 143, 17, 151, 27],
      ['friend,', 158, 13, 214, 29],
      ['you', 51, 39, 82, 53],
      ['speak', 90, 35, 140, 53],
      ['the', 146, 35, 174, 49],
      ['password,', 182, 35, 267, 53],
      ['and', 51, 57, 81, 71],
      ['the', 89, 57, 117, 71],
      ['doors', 124, 57, 172, 71],
      ['will', 180, 57, 208, 71],
      ['open.', 216, 61, 263, 75]
    ].map do |word, x_start, y_start, x_end, y_end|
      { word: word, x_start: x_start, y_start: y_start, x_end: x_end, y_end: y_end }
    end
  end

  it 'bounding box' do
    plain_text = RTesseract.new(words_image).to_s
    expect(plain_text).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n\n")
    expect(RTesseract::Box.new(words_image).words).to eql(expected_words)

    expect(RTesseract::Box.new(image_tiff).words.is_a?(Array)).to eql(true)
    expect(RTesseract::Box.new(words_image).to_s).to eql('If you are a friend, you speak the password, and the doors will open.')

    # An unknown command and a missing image must surface as library errors.
    expect { RTesseract::Box.new(image_tiff, command: 'tesseract_error').to_s }.to raise_error(RTesseract::ConversionError)
    expect { RTesseract::Box.new(image_tiff + '_not_exist').to_s }.to raise_error(RTesseract::ImageNotSelectedError)
    # expect(RTesseract::Box.new(spec_dir.join('images', 'blank.tif').to_s, options: :digits).words).to eql([])
  end
end
|
@@ -1,49 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
3
|
-
|
4
|
-
describe 'Rtesseract::Mixed' do
  before do
    # Fixtures live under images/ next to this spec. The former @image2_tif
    # fixture was dropped: no example used it.
    @path = Pathname.new(__FILE__.gsub('rtesseract_mixed_spec.rb', '')).expand_path
    @image_tif = @path.join('images', 'mixed.tif').to_s
  end

  it 'should be instantiable' do
    expect(RTesseract::Mixed.new.class).to eql(RTesseract::Mixed)
    expect(RTesseract::Mixed.new(@image_tif).class).to eql(RTesseract::Mixed)
  end

  it 'should translate parts of the image to text' do
    # Areas supplied through the block form.
    mix_block = RTesseract::Mixed.new(@image_tif, psm: 7) do |image|
      image.area(x: 28, y: 19, w: 25, h: 25) # position of 4
      image.area(x: 180, y: 22, w: 20, h: 28) # position of 3
      image.area(x: 218, y: 22, w: 24, h: 28) # position of F
      image.area(x: 248, y: 24, w: 22, h: 22) # position of F
    end
    expect(mix_block.to_s_without_spaces).to eql('43FF')
    mix_block.clear_areas
    expect(mix_block.areas).to eql([])

    # The same areas supplied through the :areas option.
    @areas = []
    @areas << { x: 28, y: 19, w: 25, h: 25 } # position of 4
    @areas << { x: 180, y: 22, w: 20, h: 28 } # position of 3
    @areas << { x: 218, y: 22, w: 24, h: 28 } # position of f
    @areas << { x: 248, y: 24, w: 22, h: 22 } # position of f

    mix_block = RTesseract::Mixed.new(@image_tif, areas: @areas, psm: 7)
    expect(mix_block.to_s_without_spaces).to eql('43FF')

    # A blank image yields no text for any area.
    mix_block = RTesseract::Mixed.new(@path.join('images', 'blank.tif').to_s, areas: @areas, psm: 7)
    expect(mix_block.to_s_without_spaces).to eql('')
  end

  it ' get a error' do
    @areas = [{ x: 28, y: 19, w: 25, h: 25 }]

    mix_block = RTesseract::Mixed.new(@path.join('images', 'test_not_exists.png').to_s, areas: @areas, psm: 7)
    expect { mix_block.to_s_without_spaces }.to raise_error(RTesseract::ImageNotSelectedError)

    mix_block = RTesseract::Mixed.new(@image_tif, areas: @areas, psm: 7, command: 'tesseract_error')
    expect { mix_block.to_s }.to raise_error(RTesseract::ConversionError)
  end
end
|
data/spec/rtesseract_spec.rb
DELETED
@@ -1,282 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
-
# encoding: UTF-8
|
3
|
-
require 'pathname'
|
4
|
-
RTesseract::Processor::RMagickProcessor.setup
|
5
|
-
|
6
|
-
# Class to rise error
|
7
|
-
# Test double whose string conversion always blows up; used to check that
# option values which cannot be stringified are handled.
class MakeStringError
  # Always raises a RuntimeError with the message 'error'.
  def to_s
    raise 'error'
  end
end
|
12
|
-
|
13
|
-
# NOTE(review): several examples below change settings through
# RTesseract.configure and never restore them, so they may leak into later
# examples depending on run order - confirm and reset if needed.
describe 'Rtesseract' do
  before do
    # Fixtures live under images/ next to this spec.
    @path = Pathname.new(__FILE__.gsub('rtesseract_spec.rb', '')).expand_path
    @image_tif = @path.join('images', 'test.tif').to_s
    @image_for_pdf = @path.join('images', 'test-pdf.png').to_s
  end

  it ' be instantiable' do
    expect(RTesseract.new.class).to eql(RTesseract)
    expect(RTesseract.new('').class).to eql(RTesseract)
    expect(RTesseract.new(@image_tif).class).to eql(RTesseract)
  end

  it ' translate image to text' do
    expect(RTesseract.new(@image_tif).to_s_without_spaces).to eql('43XF')
    expect(RTesseract.new(@image_tif, processor: 'mini_magick').to_s_without_spaces).to eql('43XF')
    expect(RTesseract.new(@path.join('images', 'test1.tif').to_s).to_s_without_spaces).to eql('V2V4')
    expect(RTesseract.new(@path.join('images', 'test with spaces.tif').to_s).to_s_without_spaces).to eql('V2V4')
  end

  it ' translate images .png, .jpg, .bmp' do
    expect(RTesseract.new(@path.join('images', 'test.png').to_s).to_s_without_spaces).to eql('HW9W')
    expect(RTesseract.new(@path.join('images', 'test.jpg').to_s).to_s_without_spaces).to eql('3R8F')
    expect(RTesseract.new(@path.join('images', 'test.bmp').to_s).to_s_without_spaces).to eql('FLA6')
  end

  # The only expectation is commented out, so this example currently
  # asserts nothing.
  it ' should not error with depth > 32' do
    # expect(RTesseract.new(@path.join('images', 'README.pdf').to_s, debug: true).to_s_without_spaces).to eql('')
  end

  it ' support different processors' do
    # Rmagick
    expect(RTesseract.new(@image_tif).to_s_without_spaces).to eql('43XF')
    expect(RTesseract.new(@image_tif, processor: 'rmagick').to_s_without_spaces).to eql('43XF')
    expect(RTesseract.new(@path.join('images', 'test.png').to_s, processor: 'rmagick').to_s_without_spaces).to eql('HW9W')

    # MiniMagick
    expect(RTesseract.new(@image_tif, processor: 'mini_magick').to_s_without_spaces).to eql('43XF')
    expect(RTesseract.new(@path.join('images', 'test.png').to_s, processor: 'mini_magick').to_s_without_spaces).to eql('HW9W')

    # NoneMagick
    expect(RTesseract.new(@image_tif, processor: 'none').to_s_without_spaces).to eql('43XF')
  end

  it ' change the image' do
    image = RTesseract.new(@image_tif)
    expect(image.to_s_without_spaces).to eql('43XF')
    image.source = @path.join('images', 'test1.tif').to_s
    expect(image.to_s_without_spaces).to eql('V2V4')
  end

  it ' returns the source' do
    image = RTesseract.new(@image_tif)
    expect(image.source).to eql(Pathname.new(@image_tif))
  end

  it ' select the language' do
    # English
    expect(RTesseract.new(@image_tif, lang: 'eng').lang).to eql(' -l eng ')
    expect(RTesseract.new(@image_tif, lang: 'en').lang).to eql(' -l eng ')
    expect(RTesseract.new(@image_tif, lang: 'en-US').lang).to eql(' -l eng ')
    expect(RTesseract.new(@image_tif, lang: 'english').lang).to eql(' -l eng ')

    # Portuguese
    expect(RTesseract.new(@image_tif, lang: 'por').lang).to eql(' -l por ')
    expect(RTesseract.new(@image_tif, lang: 'pt-BR').lang).to eql(' -l por ')
    expect(RTesseract.new(@image_tif, lang: 'pt-br').lang).to eql(' -l por ')
    expect(RTesseract.new(@image_tif, lang: 'pt').lang).to eql(' -l por ')
    expect(RTesseract.new(@image_tif, lang: 'portuguese').lang).to eql(' -l por ')

    expect(RTesseract.new(@image_tif, lang: 'eng').to_s_without_spaces).to eql('43XF')

    expect(RTesseract.new(@image_tif, lang: 'eng').lang).to eql(' -l eng ')
    expect(RTesseract.new(@image_tif, lang: 'it').lang).to eql(' -l ita ')

    # Invalid lang object
    expect(RTesseract.new(@image_tif, lang: MakeStringError.new).lang).to eql('')
  end

  it ' select options' do
    expect(RTesseract.new(@image_tif).options_cmd).to eql([])
    expect(RTesseract.new(@image_tif, options: 'digits').options_cmd).to eql(['digits'])
    expect(RTesseract.new(@image_tif, options: :digits).options_cmd).to eql([:digits])
    expect(RTesseract.new(@image_tif, options: [:digits, :quiet]).options_cmd).to eql([:digits, :quiet])
  end

  it ' support pdf output mode' do
    # Internal test. Consider 'pdf' option only when #to_pdf is called.
    expect(RTesseract.new(@image_tif, options: 'pdf').options_cmd).to eql([])
    expect(RTesseract.new(@image_for_pdf, options: :pdf).options_cmd).to eql([])

    # File.exist? is used throughout: File.exists? was removed in Ruby 3.2.
    pdf_ocr = RTesseract.new(@image_for_pdf)
    expect(File.exist?(pdf_ocr.to_pdf)).to eql(true)
    expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
    # Comment next line and go to tmp dir to see generated pdf.
    expect(pdf_ocr.clean).to eq(true)
    expect(File.exist?(pdf_ocr.to_pdf)).to eql(false)

    # Still have original functionality (i.e. #to_s, #to_s_without_spaces).
    pdf_ocr = RTesseract.new(@image_tif)
    expect(File.exist?(pdf_ocr.to_pdf)).to eql(true)
    expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
    expect(pdf_ocr.to_s_without_spaces).to eql('43XF')
    expect(pdf_ocr.clean).to eq(true)
    expect(File.exist?(pdf_ocr.to_pdf)).to eql(false)
  end

  it ' warn when tesseract cannot give pdf' do
    rtesseract = RTesseract.new(@image_for_pdf)

    allow(rtesseract).to receive(:tesseract_version).and_return(3.02)
    expect { rtesseract.to_pdf }.to raise_error(RTesseract::TesseractVersionError)

    allow(rtesseract).to receive(:tesseract_version).and_return(3.03)
    expect { rtesseract.to_pdf }.not_to raise_error
  end

  it ' be configurable' do
    expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0, display_text: 0).config).to eql("chop_enable 0\nenable_assoc 0\ndisplay_text 0")
    expect(RTesseract.new(@image_tif, chop_enable: 0).config).to eql('chop_enable 0')
    expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0).config).to eql("chop_enable 0\nenable_assoc 0")
    expect(RTesseract.new(@image_tif, chop_enable: 0).to_s_without_spaces).to eql('43XF')
    expect(RTesseract.new(@image_tif, tessedit_char_whitelist: 'ABCDEF12345').to_s_without_spaces).to eql('43F')
  end

  it ' crop image' do
    expect(RTesseract.new(@image_tif, psm: 7).crop!(w: 36, h: 40, x: 140, y: 10).to_s_without_spaces).to eql('4')
    expect(RTesseract.new(@image_tif, psm: 7).crop!(w: 36, h: 40, x: 180, y: 10).to_s_without_spaces).to eql('3')
    expect(RTesseract.new(@image_tif, psm: 7).crop!(w: 20, h: 40, x: 216, y: 10).to_s_without_spaces).to eql('X')
    expect(RTesseract.new(@image_tif, psm: 7).crop!(w: 30, h: 40, x: 240, y: 10).to_s_without_spaces).to eql('F')
  end

  it ' read image from blob' do
    image = Magick::Image.read(@path.join('images', 'test.png').to_s).first
    blob = image.quantize(256, Magick::GRAYColorspace).to_blob

    test = RTesseract.new('', psm: 7)
    test.from_blob(blob)
    expect(test.to_s_without_spaces).to eql('HW9W')

    test = RTesseract.new('', psm: 7)
    expect { test.from_blob('') }.to raise_error(RTesseract::ConversionError)
  end

  it ' use a instance' do
    expect(RTesseract.new(Magick::Image.read(@image_tif.to_s).first).to_s_without_spaces).to eql('43XF')
    expect(RTesseract::Processor::RMagickProcessor.a_name?('teste')).to eql(false)
    expect(RTesseract::Processor::RMagickProcessor.a_name?('rmagick')).to eql(true)
    expect(RTesseract::Processor::RMagickProcessor.a_name?('RMagickProcessor')).to eql(true)
    expect(RTesseract::Processor::MiniMagickProcessor.a_name?('teste')).to eql(false)
    expect(RTesseract::Processor::MiniMagickProcessor.a_name?('mini_magick')).to eql(true)
    expect(RTesseract::Processor::MiniMagickProcessor.a_name?('MiniMagickProcessor')).to eql(true)
    expect(RTesseract::Processor::NoneProcessor.a_name?('none')).to eql(true)
    expect(RTesseract::Processor::NoneProcessor.a_name?('NoneProcessor')).to eql(true)
  end

  it ' change image in a block' do
    test = RTesseract.read(@path.join('images', 'test.png').to_s) {}
    expect(test.class).to eql(RTesseract)

    # Each block's value is its last expression; the former
    # `_image = ...` reassignments of the block parameter were redundant.
    test = RTesseract.new(@image_tif)
    test.read do |image|
      image.quantize(256, Magick::GRAYColorspace)
    end
    expect(test.to_s_without_spaces).to eql('43XF')

    test = RTesseract.new(@path.join('images', 'blank.tif').to_s)
    test.read do |image|
      image
    end
    expect(test.to_s_without_spaces).to eql('')

    test = RTesseract.read(@path.join('images', 'test.png').to_s) do |image|
      image.rotate(90)
    end
    expect(test.to_s_without_spaces).to eql('HW9W')

    test = RTesseract.read(@path.join('images', 'test.jpg').to_s, lang: 'en') do |image|
      image.white_threshold(245).quantize(256, Magick::GRAYColorspace)
    end
    expect(test.to_s_without_spaces).to eql('3R8F')

    test = RTesseract.read(@path.join('images', 'test.jpg').to_s, lang: 'en', processor: 'mini_magick') do |image|
      image.gravity 'south'
    end
    expect(test.to_s_without_spaces).to eql('3R8F')
  end

  it 'does not raise on read with image_magick processor' do
    expect {
      instance = RTesseract.read(@image_tif, processor: 'mini_magick') {}
      expect(instance.processor.a_name?('mini_magick')).to be_truthy
    }.not_to raise_error
  end

  it ' get a error' do
    expect { RTesseract.new(@path.join('images', 'test.jpg').to_s, command: 'tesseract_error').to_s }.to raise_error(RTesseract::ConversionError)
    expect { RTesseract.new(@path.join('images', 'test_not_exists.png').to_s).to_s }.to raise_error(RTesseract::ImageNotSelectedError)

    # Invalid psm object
    expect(RTesseract.new(@image_tif, psm: MakeStringError.new).psm).to eql('')
  end

  it 'remove a file' do
    # remove_files calls #each on its argument, so items are passed in an
    # Array: a bare Tempfile would have its (empty) lines enumerated and
    # nothing would be deleted.
    tempfile = Tempfile.new('config')
    tempfile_path = tempfile.path
    RTesseract::Utils.remove_files([tempfile])
    expect(File.exist?(tempfile_path)).to eql(false)

    missing = Pathname.new(Dir.tmpdir).join('test_not_exists')
    expect { RTesseract::Utils.remove_files([missing]) }.to raise_error(RTesseract::TempFilesNotRemovedError)
  end

  it ' get a numeric value for tesseract version' do
    expect(RTesseract::Utils.version_number).to be_a Float
  end

  it ' support default config processors' do
    # Rmagick
    RTesseract.configure { |config| config.processor = 'rmagick' }
    expect(RTesseract.new(@image_tif).processor.a_name?('rmagick')).to eql(true)

    # MiniMagick
    RTesseract.configure { |config| config.processor = 'mini_magick' }
    expect(RTesseract.new(@image_tif).processor.a_name?('mini_magick')).to eql(true)

    # NoneMagick
    RTesseract.configure { |config| config.processor = 'none' }
    expect(RTesseract.new(@image_tif).processor.a_name?('none')).to eql(true)

    # overwrite default
    RTesseract.configure { |config| config.processor = 'rmagick' }
    expect(RTesseract.new(@image_tif, processor: 'mini_magick').processor.a_name?('mini_magick')).to eql(true)

    RTesseract.configure { |config| config.lang = 'portuguese' }
    expect(RTesseract.new(@image_tif).lang).to eql(' -l por ')

    RTesseract.configure { |config| config.psm = 7 }
    expect(RTesseract.new(@image_tif).psm).to eql(' -psm 7 ')

    RTesseract.configure { |config| config.tessdata_dir = '/tmp/test' }
    expect(RTesseract.new(@image_tif).tessdata_dir).to eql(' --tessdata-dir /tmp/test ')

    RTesseract.configure { |config| config.user_words = '/tmp/test' }
    expect(RTesseract.new(@image_tif).user_words).to eql(' --user-words /tmp/test ')

    RTesseract.configure { |config| config.user_patterns = '/tmp/test' }
    expect(RTesseract.new(@image_tif).user_patterns).to eql(' --user-patterns /tmp/test ')
  end

  it ' configure pdf has no effect and kept in-house' do
    # So it does not interfere with #to_s outputting.
    RTesseract.configure { |config| config.options_cmd = ['pdf'] }
    expect(RTesseract.new(@image_tif).options_cmd).to eql([])

    RTesseract.configure { |config| config.options_cmd = [:pdf] }
    expect(RTesseract.new(@image_tif).options_cmd).to eql([])

    RTesseract.configure { |config| config.options_cmd = [:pdf, 'pdf'] }
    expect(RTesseract.new(@image_tif).options_cmd).to eql([])
  end

  it ' support new configs' do
    expect(RTesseract.new(@image_tif, tessdata_dir: '/tmp/test').tessdata_dir).to eql(' --tessdata-dir /tmp/test ')
    expect(RTesseract.new(@image_tif, user_words: '/tmp/test').user_words).to eql(' --user-words /tmp/test ')
    expect(RTesseract.new(@image_tif, user_patterns: '/tmp/test').user_patterns).to eql(' --user-patterns /tmp/test ')

    # Values that cannot be converted to a String produce an empty flag.
    expect(RTesseract.new(@image_tif, tessdata_dir: MakeStringError.new).tessdata_dir).to eql('')
    expect(RTesseract.new(@image_tif, user_words: MakeStringError.new).user_words).to eql('')
    expect(RTesseract.new(@image_tif, user_patterns: MakeStringError.new).user_patterns).to eql('')

    # expect(RTesseract.new(@path.join('images', 'test_words.png').to_s, psm: 3, user_words: @path.join('configs', 'eng.user-words.txt').to_s).to_s).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n\n")
  end
end
|