rtesseract 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/CHANGELOG.md +6 -0
- data/README.rdoc +16 -1
- data/VERSION +1 -1
- data/lib/rtesseract.rb +60 -13
- data/lib/rtesseract/box.rb +2 -2
- data/lib/rtesseract/box_char.rb +1 -1
- data/lib/rtesseract/configuration.rb +12 -2
- data/lib/rtesseract/errors.rb +6 -0
- data/lib/rtesseract/utils.rb +11 -1
- data/rtesseract.gemspec +4 -3
- data/spec/images/test-pdf.png +0 -0
- data/spec/rtesseract_spec.rb +50 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: db992168bb87c6c4f3124403f9417c4cd46aca3e
|
4
|
+
data.tar.gz: b329a5ebc7316f28f63bcd7d84aa575cb661b3b9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a74dc8fd03a678ecdff1425bc188173615da59b636cbe9ceb072353a1e4d3f2aa076b9787007ef9bfebb9c928e2a170d8c0149f864741c888c6d74ea04a78889
|
7
|
+
data.tar.gz: 97d3bfea1ae54841ce68427893e8e69fa358a3018ba80b8f06d23759abe0d89c81e481c5775699b3c0e98234fb39dfb898a42b833c48b8ac42128c7c7c292b88
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
data/README.rdoc
CHANGED
@@ -16,6 +16,8 @@ To work properly rtesseract are needed:
|
|
16
16
|
|
17
17
|
Attention: Version 1.0.0 works fine with Ruby 2.0 and tesseract 3.0; lower versions of rtesseract work fine with Ruby 1.8 and tesseract 2.0.4.
|
18
18
|
|
19
|
+
PDF support requires a newer version of tesseract, specifically V.3.03 or above.
|
20
|
+
|
19
21
|
== EXAMPLE USAGE
|
20
22
|
|
21
23
|
It's very simple to use rtesseract:
|
@@ -23,7 +25,19 @@ It's very simple to use rtesseract:
|
|
23
25
|
=== CONVERT IMAGE TO STRING
|
24
26
|
|
25
27
|
image = RTesseract.new("my_image.jpg")
|
26
|
-
image.to_s #Getting the value
|
28
|
+
image.to_s # Getting the value
|
29
|
+
|
30
|
+
=== CONVERT IMAGE TO SEARCHABLE PDF
|
31
|
+
|
32
|
+
image = RTesseract.new("my_image.jpg")
|
33
|
+
image.to_pdf # Getting the pdf path
|
34
|
+
image.to_s # You can still get just the text value.
|
35
|
+
# ...
|
36
|
+
# some stuff
|
37
|
+
# ...
|
38
|
+
image.clean # to delete file once finished
|
39
|
+
|
40
|
+
This will preserve the image colors, pictures and structure in the generated pdf.
|
27
41
|
|
28
42
|
=== CHANGE THE IMAGE
|
29
43
|
|
@@ -89,6 +103,7 @@ Language Options
|
|
89
103
|
* por - Portuguese
|
90
104
|
* spa - Spanish
|
91
105
|
* vie - Vietnamese
|
106
|
+
* or any other supported by tesseract.
|
92
107
|
Note: Make sure you have installed the language to tesseract
|
93
108
|
|
94
109
|
Other Options
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.0
|
1
|
+
2.1.0
|
data/lib/rtesseract.rb
CHANGED
@@ -15,10 +15,9 @@ class RTesseract
|
|
15
15
|
def initialize(src = '', options = {})
|
16
16
|
self.configuration = RTesseract.local_config(options)
|
17
17
|
@options = options || {}
|
18
|
-
@value = nil
|
19
18
|
@points = {}
|
20
19
|
@processor = RTesseract::Processor.choose_processor!(configuration.processor)
|
21
|
-
|
20
|
+
self.source = src
|
22
21
|
initialize_hook
|
23
22
|
end
|
24
23
|
|
@@ -29,6 +28,7 @@ class RTesseract
|
|
29
28
|
# Define the source
|
30
29
|
def source=(src)
|
31
30
|
@value = nil
|
31
|
+
@pdf_path = nil
|
32
32
|
@source = @processor.image?(src) ? src : Pathname.new(src)
|
33
33
|
end
|
34
34
|
|
@@ -127,24 +127,50 @@ class RTesseract
|
|
127
127
|
'.txt'
|
128
128
|
end
|
129
129
|
|
130
|
+
# Detect version number
|
131
|
+
def tesseract_version
|
132
|
+
RTesseract::Utils.version_number
|
133
|
+
end
|
134
|
+
|
135
|
+
|
130
136
|
# Rand file path
|
131
|
-
def
|
132
|
-
@
|
137
|
+
def file_dest
|
138
|
+
@file_dest = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
|
133
139
|
end
|
134
140
|
|
135
|
-
# Full path of file with extension
|
136
|
-
def
|
137
|
-
[@
|
141
|
+
# Full path of file with txt extension
|
142
|
+
def file_with_ext(ext = nil)
|
143
|
+
[@file_dest, ext || file_ext].join('')
|
138
144
|
end
|
139
145
|
|
140
146
|
# Run command
|
141
147
|
def convert_command
|
142
|
-
`#{configuration.command} "#{image}" "#{
|
148
|
+
`#{configuration.command} "#{image}" "#{file_dest}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}`
|
149
|
+
end
|
150
|
+
|
151
|
+
# Is pdf output?
|
152
|
+
def pdf?
|
153
|
+
options_cmd.include? 'pdf'
|
143
154
|
end
|
144
155
|
|
145
156
|
# Read result file
|
146
157
|
def convert_text
|
147
|
-
@value = File.read(
|
158
|
+
@value = File.read(file_with_ext).to_s
|
159
|
+
end
|
160
|
+
|
161
|
+
# Store pdf result path
|
162
|
+
def convert_pdf
|
163
|
+
@pdf_path = file_with_ext('.pdf')
|
164
|
+
end
|
165
|
+
|
166
|
+
# Convert result to proper type
|
167
|
+
def convert_result
|
168
|
+
if pdf?
|
169
|
+
convert_pdf
|
170
|
+
else
|
171
|
+
convert_text
|
172
|
+
RTesseract::Utils.remove_files([@image, file_with_ext])
|
173
|
+
end
|
148
174
|
end
|
149
175
|
|
150
176
|
# Hook to convert
|
@@ -155,15 +181,14 @@ class RTesseract
|
|
155
181
|
def convert
|
156
182
|
convert_command
|
157
183
|
after_convert_hook
|
158
|
-
|
159
|
-
RTesseract::Utils.remove_files([@image, text_file_with_ext])
|
184
|
+
convert_result
|
160
185
|
rescue => error
|
161
186
|
raise RTesseract::ConversionError.new(error), error, caller
|
162
187
|
end
|
163
188
|
|
164
189
|
# Output value
|
165
190
|
def to_s
|
166
|
-
return @value if @value
|
191
|
+
return @value if @value
|
167
192
|
|
168
193
|
if @processor.image?(@source) || @source.file?
|
169
194
|
convert
|
@@ -175,8 +200,30 @@ class RTesseract
|
|
175
200
|
|
176
201
|
# Remove spaces and break-lines
|
177
202
|
def to_s_without_spaces
|
178
|
-
to_s.
|
203
|
+
to_s.gsub(/\s/, '')
|
179
204
|
end
|
205
|
+
|
206
|
+
# Output pdf path
|
207
|
+
def to_pdf
|
208
|
+
return @pdf_path if @pdf_path
|
209
|
+
|
210
|
+
fail TesseractVersionError.new if tesseract_version.nil? || tesseract_version < 3.03
|
211
|
+
|
212
|
+
if @processor.image?(@source) || @source.file?
|
213
|
+
options_cmd << 'pdf'
|
214
|
+
convert
|
215
|
+
options_cmd.delete('pdf')
|
216
|
+
@pdf_path
|
217
|
+
else
|
218
|
+
fail RTesseract::ImageNotSelectedError.new(@source)
|
219
|
+
end
|
220
|
+
end
|
221
|
+
|
222
|
+
# Destroy pdf file
|
223
|
+
def clean
|
224
|
+
RTesseract::Utils.remove_files([@pdf_path])
|
225
|
+
end
|
226
|
+
|
180
227
|
end
|
181
228
|
|
182
229
|
require 'rtesseract/mixed'
|
data/lib/rtesseract/box.rb
CHANGED
@@ -29,7 +29,7 @@ class RTesseract
|
|
29
29
|
|
30
30
|
# Read the result file
|
31
31
|
def parse_file
|
32
|
-
html = Nokogiri::HTML(File.read(
|
32
|
+
html = Nokogiri::HTML(File.read(file_with_ext))
|
33
33
|
html.css('span.ocrx_word, span.ocr_word')
|
34
34
|
end
|
35
35
|
|
@@ -42,7 +42,7 @@ class RTesseract
|
|
42
42
|
|
43
43
|
# Move file html to hocr
|
44
44
|
def after_convert_hook
|
45
|
-
FileUtils.mv(
|
45
|
+
FileUtils.mv(file_with_ext('.html'), file_with_ext) rescue nil
|
46
46
|
end
|
47
47
|
|
48
48
|
# Output value
|
data/lib/rtesseract/box_char.rb
CHANGED
@@ -43,6 +43,15 @@ class RTesseract
|
|
43
43
|
def self.configure
|
44
44
|
self.configuration ||= Configuration.new
|
45
45
|
yield(configuration)
|
46
|
+
self.clear_pdf_option
|
47
|
+
end
|
48
|
+
|
49
|
+
# Clear pdf option
|
50
|
+
def self.clear_pdf_option
|
51
|
+
if self.configuration.options_cmd
|
52
|
+
self.configuration.options_cmd.delete('pdf')
|
53
|
+
self.configuration.options_cmd.delete(:pdf)
|
54
|
+
end
|
46
55
|
end
|
47
56
|
|
48
57
|
# Default command
|
@@ -59,7 +68,8 @@ class RTesseract
|
|
59
68
|
config.processor = config.option(options, :processor, 'rmagick')
|
60
69
|
config.load_options(options, [:lang, :psm, :tessdata_dir, :user_words, :user_patterns])
|
61
70
|
config.debug = config.option(options, :debug, false)
|
62
|
-
|
71
|
+
pdf_opts = lambda { |o| o == 'pdf' || o == :pdf }
|
72
|
+
config.options_cmd = [options.option(:options, nil)].delete_if(&pdf_opts).flatten.compact
|
63
73
|
end
|
64
74
|
end
|
65
|
-
end
|
75
|
+
end
|
data/lib/rtesseract/errors.rb
CHANGED
@@ -12,4 +12,10 @@ class RTesseract
|
|
12
12
|
class ConversionError < ErrorWithMemory; end
|
13
13
|
class ImageNotSelectedError < ErrorWithMemory; end
|
14
14
|
class TempFilesNotRemovedError < ErrorWithMemory; end
|
15
|
+
|
16
|
+
class TesseractVersionError < StandardError
|
17
|
+
def initialize
|
18
|
+
super "Tesseract version is unknown or below 3.03 which is required for pdf output."
|
19
|
+
end
|
20
|
+
end
|
15
21
|
end
|
data/lib/rtesseract/utils.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
1
3
|
# RTesseract
|
2
4
|
class RTesseract
|
3
5
|
# Some utils methods
|
@@ -22,6 +24,14 @@ class RTesseract
|
|
22
24
|
end
|
23
25
|
true
|
24
26
|
end
|
27
|
+
|
28
|
+
# Extract tesseract version number
|
29
|
+
def self.version_number
|
30
|
+
out, err, st = Open3.capture3(RTesseract.default_command, "--version")
|
31
|
+
|
32
|
+
version = err.split("\n")[0].split(" ")[1].split('.')[0, 2].join('.')
|
33
|
+
Float(version) rescue nil
|
34
|
+
end
|
25
35
|
end
|
26
36
|
end
|
27
37
|
|
@@ -31,4 +41,4 @@ class Hash
|
|
31
41
|
def option(attr_name, default)
|
32
42
|
delete(attr_name.to_s) || delete(attr_name) || default
|
33
43
|
end
|
34
|
-
end
|
44
|
+
end
|
data/rtesseract.gemspec
CHANGED
@@ -2,16 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: rtesseract 2.0
|
5
|
+
# stub: rtesseract 2.1.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "rtesseract"
|
9
|
-
s.version = "2.0
|
9
|
+
s.version = "2.1.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib"]
|
13
13
|
s.authors = ["Danilo Jeremias da Silva"]
|
14
|
-
s.date = "2016-
|
14
|
+
s.date = "2016-09-08"
|
15
15
|
s.description = "Ruby library for working with the Tesseract OCR."
|
16
16
|
s.email = "dannnylo@gmail.com"
|
17
17
|
s.extra_rdoc_files = [
|
@@ -48,6 +48,7 @@ Gem::Specification.new do |s|
|
|
48
48
|
"spec/images/mixed.tif",
|
49
49
|
"spec/images/orientation_reverse.png",
|
50
50
|
"spec/images/test with spaces.tif",
|
51
|
+
"spec/images/test-pdf.png",
|
51
52
|
"spec/images/test.bmp",
|
52
53
|
"spec/images/test.jpg",
|
53
54
|
"spec/images/test.png",
|
Binary file
|
data/spec/rtesseract_spec.rb
CHANGED
@@ -13,6 +13,7 @@ describe 'Rtesseract' do
|
|
13
13
|
before do
|
14
14
|
@path = Pathname.new(__FILE__.gsub('rtesseract_spec.rb', '')).expand_path
|
15
15
|
@image_tif = @path.join('images', 'test.tif').to_s
|
16
|
+
@image_for_pdf = @path.join('images', 'test-pdf.png').to_s
|
16
17
|
end
|
17
18
|
|
18
19
|
it ' be instantiable' do
|
@@ -94,12 +95,43 @@ describe 'Rtesseract' do
|
|
94
95
|
expect(RTesseract.new(@image_tif, options: [:digits, :quiet]).options_cmd).to eql([:digits, :quiet])
|
95
96
|
end
|
96
97
|
|
98
|
+
it ' support pdf output mode' do
|
99
|
+
# Internal test. Consider 'pdf' option only when #to_pdf is called.
|
100
|
+
expect(RTesseract.new(@image_tif, options: 'pdf').options_cmd).to eql([])
|
101
|
+
expect(RTesseract.new(@image_for_pdf, options: :pdf).options_cmd).to eql([])
|
102
|
+
|
103
|
+
pdf_ocr = RTesseract.new(@image_for_pdf)
|
104
|
+
expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
|
105
|
+
expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
|
106
|
+
# Comment next line and go to tmp dir to see generated pdf.
|
107
|
+
expect(pdf_ocr.clean).to eq(true)
|
108
|
+
expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
|
109
|
+
|
110
|
+
# Still have original functionality (i.e. #to_s, #to_s_without_spaces).
|
111
|
+
pdf_ocr = RTesseract.new(@image_tif)
|
112
|
+
expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
|
113
|
+
expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
|
114
|
+
expect(pdf_ocr.to_s_without_spaces).to eql('43XF')
|
115
|
+
expect(pdf_ocr.clean).to eq(true)
|
116
|
+
expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
|
117
|
+
end
|
118
|
+
|
119
|
+
it ' warn when tesseract cannot give pdf' do
|
120
|
+
rtesseract = RTesseract.new(@image_for_pdf)
|
121
|
+
|
122
|
+
allow(rtesseract).to receive(:tesseract_version).and_return(3.02)
|
123
|
+
expect { rtesseract.to_pdf }.to raise_error(RTesseract::TesseractVersionError)
|
124
|
+
|
125
|
+
allow(rtesseract).to receive(:tesseract_version).and_return(3.03)
|
126
|
+
expect { rtesseract.to_pdf }.not_to raise_error
|
127
|
+
end
|
128
|
+
|
97
129
|
it ' be configurable' do
|
98
130
|
expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0, display_text: 0).config).to eql("chop_enable 0\nenable_assoc 0\ndisplay_text 0")
|
99
131
|
expect(RTesseract.new(@image_tif, chop_enable: 0).config).to eql('chop_enable 0')
|
100
132
|
expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0).config).to eql("chop_enable 0\nenable_assoc 0")
|
101
133
|
expect(RTesseract.new(@image_tif, chop_enable: 0).to_s_without_spaces).to eql('43XF')
|
102
|
-
expect(RTesseract.new(@image_tif, tessedit_char_whitelist:
|
134
|
+
expect(RTesseract.new(@image_tif, tessedit_char_whitelist: 'ABCDEF12345').to_s_without_spaces).to eql('43F')
|
103
135
|
end
|
104
136
|
|
105
137
|
it ' crop image' do
|
@@ -179,7 +211,11 @@ describe 'Rtesseract' do
|
|
179
211
|
expect { RTesseract::Utils.remove_files(Pathname.new(Dir.tmpdir).join('test_not_exists')) }.to raise_error(RTesseract::TempFilesNotRemovedError)
|
180
212
|
end
|
181
213
|
|
182
|
-
it '
|
214
|
+
it ' get a numeric value for tesseract version' do
|
215
|
+
expect(RTesseract::Utils.version_number).to be_a Float
|
216
|
+
end
|
217
|
+
|
218
|
+
it ' support default config processors' do
|
183
219
|
# Rmagick
|
184
220
|
RTesseract.configure { |config| config.processor = 'rmagick' }
|
185
221
|
expect(RTesseract.new(@image_tif).processor.a_name?('rmagick')).to eql(true)
|
@@ -212,6 +248,18 @@ describe 'Rtesseract' do
|
|
212
248
|
expect(RTesseract.new(@image_tif).user_patterns).to eql(' --user-patterns /tmp/test ')
|
213
249
|
end
|
214
250
|
|
251
|
+
it ' configure pdf has no effect and kept in-house' do
|
252
|
+
# So it does not interfere with #to_s outputting.
|
253
|
+
RTesseract.configure { |config| config.options_cmd = ['pdf'] }
|
254
|
+
expect(RTesseract.new(@image_tif).options_cmd).to eql([])
|
255
|
+
|
256
|
+
RTesseract.configure { |config| config.options_cmd = [:pdf] }
|
257
|
+
expect(RTesseract.new(@image_tif).options_cmd).to eql([])
|
258
|
+
|
259
|
+
RTesseract.configure { |config| config.options_cmd = [:pdf, 'pdf'] }
|
260
|
+
expect(RTesseract.new(@image_tif).options_cmd).to eql([])
|
261
|
+
end
|
262
|
+
|
215
263
|
it ' support new configs' do
|
216
264
|
expect(RTesseract.new(@image_tif, tessdata_dir: '/tmp/test').tessdata_dir).to eql(' --tessdata-dir /tmp/test ')
|
217
265
|
expect(RTesseract.new(@image_tif, user_words: '/tmp/test').user_words).to eql(' --user-words /tmp/test ')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rtesseract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danilo Jeremias da Silva
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -145,6 +145,7 @@ files:
|
|
145
145
|
- spec/images/mixed.tif
|
146
146
|
- spec/images/orientation_reverse.png
|
147
147
|
- spec/images/test with spaces.tif
|
148
|
+
- spec/images/test-pdf.png
|
148
149
|
- spec/images/test.bmp
|
149
150
|
- spec/images/test.jpg
|
150
151
|
- spec/images/test.png
|