rtesseract 2.0.1 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/CHANGELOG.md +6 -0
- data/README.rdoc +16 -1
- data/VERSION +1 -1
- data/lib/rtesseract.rb +60 -13
- data/lib/rtesseract/box.rb +2 -2
- data/lib/rtesseract/box_char.rb +1 -1
- data/lib/rtesseract/configuration.rb +12 -2
- data/lib/rtesseract/errors.rb +6 -0
- data/lib/rtesseract/utils.rb +11 -1
- data/rtesseract.gemspec +4 -3
- data/spec/images/test-pdf.png +0 -0
- data/spec/rtesseract_spec.rb +50 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: db992168bb87c6c4f3124403f9417c4cd46aca3e
|
4
|
+
data.tar.gz: b329a5ebc7316f28f63bcd7d84aa575cb661b3b9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a74dc8fd03a678ecdff1425bc188173615da59b636cbe9ceb072353a1e4d3f2aa076b9787007ef9bfebb9c928e2a170d8c0149f864741c888c6d74ea04a78889
|
7
|
+
data.tar.gz: 97d3bfea1ae54841ce68427893e8e69fa358a3018ba80b8f06d23759abe0d89c81e481c5775699b3c0e98234fb39dfb898a42b833c48b8ac42128c7c7c292b88
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
data/README.rdoc
CHANGED
@@ -16,6 +16,8 @@ To work properly rtesseract are needed:
|
|
16
16
|
|
17
17
|
Attention: Version 1.0.0 works fine with Ruby 2.0 and tesseract 3.0, and lower versions of rtesseract work fine with Ruby 1.8 and tesseract 2.0.4.
|
18
18
|
|
19
|
+
PDF support requires a newer version of tesseract, specifically version 3.03 or above.
|
20
|
+
|
19
21
|
== EXAMPLE USAGE
|
20
22
|
|
21
23
|
It's very simple to use rtesseract:
|
@@ -23,7 +25,19 @@ It's very simple to use rtesseract:
|
|
23
25
|
=== CONVERT IMAGE TO STRING
|
24
26
|
|
25
27
|
image = RTesseract.new("my_image.jpg")
|
26
|
-
image.to_s #Getting the value
|
28
|
+
image.to_s # Getting the value
|
29
|
+
|
30
|
+
=== CONVERT IMAGE TO SEARCHABLE PDF
|
31
|
+
|
32
|
+
image = RTesseract.new("my_image.jpg")
|
33
|
+
image.to_pdf # Getting the pdf path
|
34
|
+
image.to_s # You can still get just the text value.
|
35
|
+
# ...
|
36
|
+
# some stuff
|
37
|
+
# ...
|
38
|
+
image.clean # Delete the generated pdf file once finished
|
39
|
+
|
40
|
+
This will preserve the image colors, pictures and structure in the generated pdf.
|
27
41
|
|
28
42
|
=== CHANGE THE IMAGE
|
29
43
|
|
@@ -89,6 +103,7 @@ Language Options
|
|
89
103
|
* por - Portuguese
|
90
104
|
* spa - Spanish
|
91
105
|
* vie - Vietnamese
|
106
|
+
* or any other language supported by tesseract.
|
92
107
|
Note: Make sure you have installed the language to tesseract
|
93
108
|
|
94
109
|
Other Options
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.0
|
1
|
+
2.1.0
|
data/lib/rtesseract.rb
CHANGED
@@ -15,10 +15,9 @@ class RTesseract
|
|
15
15
|
def initialize(src = '', options = {})
|
16
16
|
self.configuration = RTesseract.local_config(options)
|
17
17
|
@options = options || {}
|
18
|
-
@value = nil
|
19
18
|
@points = {}
|
20
19
|
@processor = RTesseract::Processor.choose_processor!(configuration.processor)
|
21
|
-
|
20
|
+
self.source = src
|
22
21
|
initialize_hook
|
23
22
|
end
|
24
23
|
|
@@ -29,6 +28,7 @@ class RTesseract
|
|
29
28
|
# Define the source
|
30
29
|
def source=(src)
|
31
30
|
@value = nil
|
31
|
+
@pdf_path = nil
|
32
32
|
@source = @processor.image?(src) ? src : Pathname.new(src)
|
33
33
|
end
|
34
34
|
|
@@ -127,24 +127,50 @@ class RTesseract
|
|
127
127
|
'.txt'
|
128
128
|
end
|
129
129
|
|
130
|
+
# Detect version number
|
131
|
+
def tesseract_version
|
132
|
+
RTesseract::Utils.version_number
|
133
|
+
end
|
134
|
+
|
135
|
+
|
130
136
|
# Rand file path
|
131
|
-
def
|
132
|
-
@
|
137
|
+
def file_dest
|
138
|
+
@file_dest = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
|
133
139
|
end
|
134
140
|
|
135
|
-
# Full path of file with extension
|
136
|
-
def
|
137
|
-
[@
|
141
|
+
# Full path of file with the given extension (defaults to file_ext)
|
142
|
+
def file_with_ext(ext = nil)
|
143
|
+
[@file_dest, ext || file_ext].join('')
|
138
144
|
end
|
139
145
|
|
140
146
|
# Run command
|
141
147
|
def convert_command
|
142
|
-
`#{configuration.command} "#{image}" "#{
|
148
|
+
`#{configuration.command} "#{image}" "#{file_dest}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}`
|
149
|
+
end
|
150
|
+
|
151
|
+
# Is pdf output?
|
152
|
+
def pdf?
|
153
|
+
options_cmd.include? 'pdf'
|
143
154
|
end
|
144
155
|
|
145
156
|
# Read result file
|
146
157
|
def convert_text
|
147
|
-
@value = File.read(
|
158
|
+
@value = File.read(file_with_ext).to_s
|
159
|
+
end
|
160
|
+
|
161
|
+
# Store pdf result path
|
162
|
+
def convert_pdf
|
163
|
+
@pdf_path = file_with_ext('.pdf')
|
164
|
+
end
|
165
|
+
|
166
|
+
# Convert result to proper type
|
167
|
+
def convert_result
|
168
|
+
if pdf?
|
169
|
+
convert_pdf
|
170
|
+
else
|
171
|
+
convert_text
|
172
|
+
RTesseract::Utils.remove_files([@image, file_with_ext])
|
173
|
+
end
|
148
174
|
end
|
149
175
|
|
150
176
|
# Hook to convert
|
@@ -155,15 +181,14 @@ class RTesseract
|
|
155
181
|
def convert
|
156
182
|
convert_command
|
157
183
|
after_convert_hook
|
158
|
-
|
159
|
-
RTesseract::Utils.remove_files([@image, text_file_with_ext])
|
184
|
+
convert_result
|
160
185
|
rescue => error
|
161
186
|
raise RTesseract::ConversionError.new(error), error, caller
|
162
187
|
end
|
163
188
|
|
164
189
|
# Output value
|
165
190
|
def to_s
|
166
|
-
return @value if @value
|
191
|
+
return @value if @value
|
167
192
|
|
168
193
|
if @processor.image?(@source) || @source.file?
|
169
194
|
convert
|
@@ -175,8 +200,30 @@ class RTesseract
|
|
175
200
|
|
176
201
|
# Remove spaces and break-lines
|
177
202
|
def to_s_without_spaces
|
178
|
-
to_s.
|
203
|
+
to_s.gsub(/\s/, '')
|
179
204
|
end
|
205
|
+
|
206
|
+
# Output pdf path
|
207
|
+
def to_pdf
|
208
|
+
return @pdf_path if @pdf_path
|
209
|
+
|
210
|
+
fail TesseractVersionError.new if tesseract_version.nil? || tesseract_version < 3.03
|
211
|
+
|
212
|
+
if @processor.image?(@source) || @source.file?
|
213
|
+
options_cmd << 'pdf'
|
214
|
+
convert
|
215
|
+
options_cmd.delete('pdf')
|
216
|
+
@pdf_path
|
217
|
+
else
|
218
|
+
fail RTesseract::ImageNotSelectedError.new(@source)
|
219
|
+
end
|
220
|
+
end
|
221
|
+
|
222
|
+
# Destroy pdf file
|
223
|
+
def clean
|
224
|
+
RTesseract::Utils.remove_files([@pdf_path])
|
225
|
+
end
|
226
|
+
|
180
227
|
end
|
181
228
|
|
182
229
|
require 'rtesseract/mixed'
|
data/lib/rtesseract/box.rb
CHANGED
@@ -29,7 +29,7 @@ class RTesseract
|
|
29
29
|
|
30
30
|
# Read the result file
|
31
31
|
def parse_file
|
32
|
-
html = Nokogiri::HTML(File.read(
|
32
|
+
html = Nokogiri::HTML(File.read(file_with_ext))
|
33
33
|
html.css('span.ocrx_word, span.ocr_word')
|
34
34
|
end
|
35
35
|
|
@@ -42,7 +42,7 @@ class RTesseract
|
|
42
42
|
|
43
43
|
# Move file html to hocr
|
44
44
|
def after_convert_hook
|
45
|
-
FileUtils.mv(
|
45
|
+
FileUtils.mv(file_with_ext('.html'), file_with_ext) rescue nil
|
46
46
|
end
|
47
47
|
|
48
48
|
# Output value
|
data/lib/rtesseract/box_char.rb
CHANGED
@@ -43,6 +43,15 @@ class RTesseract
|
|
43
43
|
def self.configure
|
44
44
|
self.configuration ||= Configuration.new
|
45
45
|
yield(configuration)
|
46
|
+
self.clear_pdf_option
|
47
|
+
end
|
48
|
+
|
49
|
+
# Clear pdf option
|
50
|
+
def self.clear_pdf_option
|
51
|
+
if self.configuration.options_cmd
|
52
|
+
self.configuration.options_cmd.delete('pdf')
|
53
|
+
self.configuration.options_cmd.delete(:pdf)
|
54
|
+
end
|
46
55
|
end
|
47
56
|
|
48
57
|
# Default command
|
@@ -59,7 +68,8 @@ class RTesseract
|
|
59
68
|
config.processor = config.option(options, :processor, 'rmagick')
|
60
69
|
config.load_options(options, [:lang, :psm, :tessdata_dir, :user_words, :user_patterns])
|
61
70
|
config.debug = config.option(options, :debug, false)
|
62
|
-
|
71
|
+
pdf_opts = lambda { |o| o == 'pdf' || o == :pdf }
|
72
|
+
config.options_cmd = [options.option(:options, nil)].delete_if(&pdf_opts).flatten.compact
|
63
73
|
end
|
64
74
|
end
|
65
|
-
end
|
75
|
+
end
|
data/lib/rtesseract/errors.rb
CHANGED
@@ -12,4 +12,10 @@ class RTesseract
|
|
12
12
|
class ConversionError < ErrorWithMemory; end
|
13
13
|
class ImageNotSelectedError < ErrorWithMemory; end
|
14
14
|
class TempFilesNotRemovedError < ErrorWithMemory; end
|
15
|
+
|
16
|
+
class TesseractVersionError < StandardError
|
17
|
+
def initialize
|
18
|
+
super "Tesseract version is unknown or below 3.03 which is required for pdf output."
|
19
|
+
end
|
20
|
+
end
|
15
21
|
end
|
data/lib/rtesseract/utils.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
1
3
|
# RTesseract
|
2
4
|
class RTesseract
|
3
5
|
# Some utils methods
|
@@ -22,6 +24,14 @@ class RTesseract
|
|
22
24
|
end
|
23
25
|
true
|
24
26
|
end
|
27
|
+
|
28
|
+
# Extract tesseract version number
|
29
|
+
def self.version_number
|
30
|
+
out, err, st = Open3.capture3(RTesseract.default_command, "--version")
|
31
|
+
|
32
|
+
version = err.split("\n")[0].split(" ")[1].split('.')[0, 2].join('.')
|
33
|
+
Float(version) rescue nil
|
34
|
+
end
|
25
35
|
end
|
26
36
|
end
|
27
37
|
|
@@ -31,4 +41,4 @@ class Hash
|
|
31
41
|
def option(attr_name, default)
|
32
42
|
delete(attr_name.to_s) || delete(attr_name) || default
|
33
43
|
end
|
34
|
-
end
|
44
|
+
end
|
data/rtesseract.gemspec
CHANGED
@@ -2,16 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: rtesseract 2.0
|
5
|
+
# stub: rtesseract 2.1.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "rtesseract"
|
9
|
-
s.version = "2.0
|
9
|
+
s.version = "2.1.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib"]
|
13
13
|
s.authors = ["Danilo Jeremias da Silva"]
|
14
|
-
s.date = "2016-
|
14
|
+
s.date = "2016-09-08"
|
15
15
|
s.description = "Ruby library for working with the Tesseract OCR."
|
16
16
|
s.email = "dannnylo@gmail.com"
|
17
17
|
s.extra_rdoc_files = [
|
@@ -48,6 +48,7 @@ Gem::Specification.new do |s|
|
|
48
48
|
"spec/images/mixed.tif",
|
49
49
|
"spec/images/orientation_reverse.png",
|
50
50
|
"spec/images/test with spaces.tif",
|
51
|
+
"spec/images/test-pdf.png",
|
51
52
|
"spec/images/test.bmp",
|
52
53
|
"spec/images/test.jpg",
|
53
54
|
"spec/images/test.png",
|
Binary file
|
data/spec/rtesseract_spec.rb
CHANGED
@@ -13,6 +13,7 @@ describe 'Rtesseract' do
|
|
13
13
|
before do
|
14
14
|
@path = Pathname.new(__FILE__.gsub('rtesseract_spec.rb', '')).expand_path
|
15
15
|
@image_tif = @path.join('images', 'test.tif').to_s
|
16
|
+
@image_for_pdf = @path.join('images', 'test-pdf.png').to_s
|
16
17
|
end
|
17
18
|
|
18
19
|
it ' be instantiable' do
|
@@ -94,12 +95,43 @@ describe 'Rtesseract' do
|
|
94
95
|
expect(RTesseract.new(@image_tif, options: [:digits, :quiet]).options_cmd).to eql([:digits, :quiet])
|
95
96
|
end
|
96
97
|
|
98
|
+
it ' support pdf output mode' do
|
99
|
+
# Internal test. Consider 'pdf' option only when #to_pdf is called.
|
100
|
+
expect(RTesseract.new(@image_tif, options: 'pdf').options_cmd).to eql([])
|
101
|
+
expect(RTesseract.new(@image_for_pdf, options: :pdf).options_cmd).to eql([])
|
102
|
+
|
103
|
+
pdf_ocr = RTesseract.new(@image_for_pdf)
|
104
|
+
expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
|
105
|
+
expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
|
106
|
+
# Comment next line and go to tmp dir to see generated pdf.
|
107
|
+
expect(pdf_ocr.clean).to eq(true)
|
108
|
+
expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
|
109
|
+
|
110
|
+
# Still have original functionality (i.e. #to_s, #to_s_without_spaces).
|
111
|
+
pdf_ocr = RTesseract.new(@image_tif)
|
112
|
+
expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
|
113
|
+
expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
|
114
|
+
expect(pdf_ocr.to_s_without_spaces).to eql('43XF')
|
115
|
+
expect(pdf_ocr.clean).to eq(true)
|
116
|
+
expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
|
117
|
+
end
|
118
|
+
|
119
|
+
it ' warn when tesseract cannot give pdf' do
|
120
|
+
rtesseract = RTesseract.new(@image_for_pdf)
|
121
|
+
|
122
|
+
allow(rtesseract).to receive(:tesseract_version).and_return(3.02)
|
123
|
+
expect { rtesseract.to_pdf }.to raise_error(RTesseract::TesseractVersionError)
|
124
|
+
|
125
|
+
allow(rtesseract).to receive(:tesseract_version).and_return(3.03)
|
126
|
+
expect { rtesseract.to_pdf }.not_to raise_error
|
127
|
+
end
|
128
|
+
|
97
129
|
it ' be configurable' do
|
98
130
|
expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0, display_text: 0).config).to eql("chop_enable 0\nenable_assoc 0\ndisplay_text 0")
|
99
131
|
expect(RTesseract.new(@image_tif, chop_enable: 0).config).to eql('chop_enable 0')
|
100
132
|
expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0).config).to eql("chop_enable 0\nenable_assoc 0")
|
101
133
|
expect(RTesseract.new(@image_tif, chop_enable: 0).to_s_without_spaces).to eql('43XF')
|
102
|
-
expect(RTesseract.new(@image_tif, tessedit_char_whitelist:
|
134
|
+
expect(RTesseract.new(@image_tif, tessedit_char_whitelist: 'ABCDEF12345').to_s_without_spaces).to eql('43F')
|
103
135
|
end
|
104
136
|
|
105
137
|
it ' crop image' do
|
@@ -179,7 +211,11 @@ describe 'Rtesseract' do
|
|
179
211
|
expect { RTesseract::Utils.remove_files(Pathname.new(Dir.tmpdir).join('test_not_exists')) }.to raise_error(RTesseract::TempFilesNotRemovedError)
|
180
212
|
end
|
181
213
|
|
182
|
-
it '
|
214
|
+
it ' get a numeric value for tesseract version' do
|
215
|
+
expect(RTesseract::Utils.version_number).to be_a Float
|
216
|
+
end
|
217
|
+
|
218
|
+
it ' support default config processors' do
|
183
219
|
# Rmagick
|
184
220
|
RTesseract.configure { |config| config.processor = 'rmagick' }
|
185
221
|
expect(RTesseract.new(@image_tif).processor.a_name?('rmagick')).to eql(true)
|
@@ -212,6 +248,18 @@ describe 'Rtesseract' do
|
|
212
248
|
expect(RTesseract.new(@image_tif).user_patterns).to eql(' --user-patterns /tmp/test ')
|
213
249
|
end
|
214
250
|
|
251
|
+
it ' configure pdf has no effect and kept in-house' do
|
252
|
+
# So it does not interfere with #to_s outputting.
|
253
|
+
RTesseract.configure { |config| config.options_cmd = ['pdf'] }
|
254
|
+
expect(RTesseract.new(@image_tif).options_cmd).to eql([])
|
255
|
+
|
256
|
+
RTesseract.configure { |config| config.options_cmd = [:pdf] }
|
257
|
+
expect(RTesseract.new(@image_tif).options_cmd).to eql([])
|
258
|
+
|
259
|
+
RTesseract.configure { |config| config.options_cmd = [:pdf, 'pdf'] }
|
260
|
+
expect(RTesseract.new(@image_tif).options_cmd).to eql([])
|
261
|
+
end
|
262
|
+
|
215
263
|
it ' support new configs' do
|
216
264
|
expect(RTesseract.new(@image_tif, tessdata_dir: '/tmp/test').tessdata_dir).to eql(' --tessdata-dir /tmp/test ')
|
217
265
|
expect(RTesseract.new(@image_tif, user_words: '/tmp/test').user_words).to eql(' --user-words /tmp/test ')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rtesseract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danilo Jeremias da Silva
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -145,6 +145,7 @@ files:
|
|
145
145
|
- spec/images/mixed.tif
|
146
146
|
- spec/images/orientation_reverse.png
|
147
147
|
- spec/images/test with spaces.tif
|
148
|
+
- spec/images/test-pdf.png
|
148
149
|
- spec/images/test.bmp
|
149
150
|
- spec/images/test.jpg
|
150
151
|
- spec/images/test.png
|