rtesseract 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6eae58279cf744227e79b7bbc9180f7aea852547
4
- data.tar.gz: 3836aa96d24b7f1a0b957cf803553f547cc33544
3
+ metadata.gz: db992168bb87c6c4f3124403f9417c4cd46aca3e
4
+ data.tar.gz: b329a5ebc7316f28f63bcd7d84aa575cb661b3b9
5
5
  SHA512:
6
- metadata.gz: 0ef57359c7c7f43094a50838b6d29d28d7808c9cadd8f2b8514c613be030161f8d640c41ba3d403c00fb59fdf85ffcbc57795f6c65b8418ad348eb1a6c07e901
7
- data.tar.gz: ff5f0f94c8039bd0b38b0c9ec2618b4c38b07b9707e28ff29a3bb943abc85d5afaa543dfba1ba2b9e565d056ea558eda9b7f6d222a6adb43614cd86c6e8fdcac
6
+ metadata.gz: a74dc8fd03a678ecdff1425bc188173615da59b636cbe9ceb072353a1e4d3f2aa076b9787007ef9bfebb9c928e2a170d8c0149f864741c888c6d74ea04a78889
7
+ data.tar.gz: 97d3bfea1ae54841ce68427893e8e69fa358a3018ba80b8f06d23759abe0d89c81e481c5775699b3c0e98234fb39dfb898a42b833c48b8ac42128c7c7c292b88
data/.travis.yml CHANGED
@@ -1,10 +1,11 @@
1
+ sudo: required
2
+ dist: trusty
1
3
  language: ruby
2
4
  addons:
3
5
  apt:
4
6
  packages:
5
7
  - tesseract-ocr
6
8
 
7
- sudo: false
8
9
  rvm:
9
10
  - 1.9.3
10
11
  - 2.0.0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## v2.1.0
2
+
3
+ #### Added
4
+
5
+ * Support for generating searchable PDFs
6
+
1
7
  ## v2.0.1
2
8
 
3
9
  #### Changed
data/README.rdoc CHANGED
@@ -16,6 +16,8 @@ To work properly rtesseract are needed:
16
16
 
17
17
  Attention: Version 1.0.0 works fine with Ruby 2.0 and tesseract 3.0; lower versions of rtesseract work fine with Ruby 1.8 and tesseract 2.0.4.
18
18
 
19
+ PDF support requires a newer version of tesseract, specifically version 3.03 or above.
20
+
19
21
  == EXAMPLE USAGE
20
22
 
21
23
  It's very simple to use rtesseract:
@@ -23,7 +25,19 @@ It's very simple to use rtesseract:
23
25
  === CONVERT IMAGE TO STRING
24
26
 
25
27
  image = RTesseract.new("my_image.jpg")
26
- image.to_s #Getting the value
28
+ image.to_s # Getting the value
29
+
30
+ === CONVERT IMAGE TO SEARCHABLE PDF
31
+
32
+ image = RTesseract.new("my_image.jpg")
33
+ image.to_pdf # Getting the pdf path
34
+ image.to_s # You can still get the text value as well.
35
+ # ...
36
+ # some stuff
37
+ # ...
38
+ image.clean # to delete file once finished
39
+
40
+ This will preserve the image colors, pictures and structure in the generated pdf.
27
41
 
28
42
  === CHANGE THE IMAGE
29
43
 
@@ -89,6 +103,7 @@ Language Options
89
103
  * por - Portuguese
90
104
  * spa - Spanish
91
105
  * vie - Vietnamese
106
+ * or any other language supported by tesseract.
92
107
  Note: Make sure you have installed the language to tesseract
93
108
 
94
109
  Other Options
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.0.1
1
+ 2.1.0
data/lib/rtesseract.rb CHANGED
@@ -15,10 +15,9 @@ class RTesseract
15
15
  def initialize(src = '', options = {})
16
16
  self.configuration = RTesseract.local_config(options)
17
17
  @options = options || {}
18
- @value = nil
19
18
  @points = {}
20
19
  @processor = RTesseract::Processor.choose_processor!(configuration.processor)
21
- @source = @processor.image?(src) ? src : Pathname.new(src)
20
+ self.source = src
22
21
  initialize_hook
23
22
  end
24
23
 
@@ -29,6 +28,7 @@ class RTesseract
29
28
  # Define the source
30
29
  def source=(src)
31
30
  @value = nil
31
+ @pdf_path = nil
32
32
  @source = @processor.image?(src) ? src : Pathname.new(src)
33
33
  end
34
34
 
@@ -127,24 +127,50 @@ class RTesseract
127
127
  '.txt'
128
128
  end
129
129
 
130
+ # Detect version number
131
+ def tesseract_version
132
+ RTesseract::Utils.version_number
133
+ end
134
+
135
+
130
136
  # Rand file path
131
- def text_file
132
- @text_file = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
137
+ def file_dest
138
+ @file_dest = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
133
139
  end
134
140
 
135
- # Full path of file with extension
136
- def text_file_with_ext(ext = nil)
137
- [@text_file, ext || file_ext].join('')
141
+ # Full path of the output file with the given extension (defaults to file_ext)
142
+ def file_with_ext(ext = nil)
143
+ [@file_dest, ext || file_ext].join('')
138
144
  end
139
145
 
140
146
  # Run command
141
147
  def convert_command
142
- `#{configuration.command} "#{image}" "#{text_file}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{configuration.options_cmd.join(' ')}`
148
+ `#{configuration.command} "#{image}" "#{file_dest}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}`
149
+ end
150
+
151
+ # Is pdf output?
152
+ def pdf?
153
+ options_cmd.include? 'pdf'
143
154
  end
144
155
 
145
156
  # Read result file
146
157
  def convert_text
147
- @value = File.read(text_file_with_ext).to_s
158
+ @value = File.read(file_with_ext).to_s
159
+ end
160
+
161
+ # Store pdf result path
162
+ def convert_pdf
163
+ @pdf_path = file_with_ext('.pdf')
164
+ end
165
+
166
+ # Convert result to proper type
167
+ def convert_result
168
+ if pdf?
169
+ convert_pdf
170
+ else
171
+ convert_text
172
+ RTesseract::Utils.remove_files([@image, file_with_ext])
173
+ end
148
174
  end
149
175
 
150
176
  # Hook to convert
@@ -155,15 +181,14 @@ class RTesseract
155
181
  def convert
156
182
  convert_command
157
183
  after_convert_hook
158
- convert_text
159
- RTesseract::Utils.remove_files([@image, text_file_with_ext])
184
+ convert_result
160
185
  rescue => error
161
186
  raise RTesseract::ConversionError.new(error), error, caller
162
187
  end
163
188
 
164
189
  # Output value
165
190
  def to_s
166
- return @value if @value != nil
191
+ return @value if @value
167
192
 
168
193
  if @processor.image?(@source) || @source.file?
169
194
  convert
@@ -175,8 +200,30 @@ class RTesseract
175
200
 
176
201
  # Remove spaces and break-lines
177
202
  def to_s_without_spaces
178
- to_s.delete(' ').delete("\n").delete("\r")
203
+ to_s.gsub(/\s/, '')
179
204
  end
205
+
206
+ # Output pdf path
207
+ def to_pdf
208
+ return @pdf_path if @pdf_path
209
+
210
+ fail TesseractVersionError.new if tesseract_version.nil? || tesseract_version < 3.03
211
+
212
+ if @processor.image?(@source) || @source.file?
213
+ options_cmd << 'pdf'
214
+ convert
215
+ options_cmd.delete('pdf')
216
+ @pdf_path
217
+ else
218
+ fail RTesseract::ImageNotSelectedError.new(@source)
219
+ end
220
+ end
221
+
222
+ # Destroy pdf file
223
+ def clean
224
+ RTesseract::Utils.remove_files([@pdf_path])
225
+ end
226
+
180
227
  end
181
228
 
182
229
  require 'rtesseract/mixed'
@@ -29,7 +29,7 @@ class RTesseract
29
29
 
30
30
  # Read the result file
31
31
  def parse_file
32
- html = Nokogiri::HTML(File.read(text_file_with_ext))
32
+ html = Nokogiri::HTML(File.read(file_with_ext))
33
33
  html.css('span.ocrx_word, span.ocr_word')
34
34
  end
35
35
 
@@ -42,7 +42,7 @@ class RTesseract
42
42
 
43
43
  # Move file html to hocr
44
44
  def after_convert_hook
45
- FileUtils.mv(text_file_with_ext('.html'), text_file_with_ext) rescue nil
45
+ FileUtils.mv(file_with_ext('.html'), file_with_ext) rescue nil
46
46
  end
47
47
 
48
48
  # Output value
@@ -16,7 +16,7 @@ class RTesseract
16
16
 
17
17
  # Read the result file
18
18
  def parse_file
19
- File.read(text_file_with_ext).to_s
19
+ File.read(file_with_ext).to_s
20
20
  end
21
21
 
22
22
  def convert_text
@@ -43,6 +43,15 @@ class RTesseract
43
43
  def self.configure
44
44
  self.configuration ||= Configuration.new
45
45
  yield(configuration)
46
+ self.clear_pdf_option
47
+ end
48
+
49
+ # Clear pdf option
50
+ def self.clear_pdf_option
51
+ if self.configuration.options_cmd
52
+ self.configuration.options_cmd.delete('pdf')
53
+ self.configuration.options_cmd.delete(:pdf)
54
+ end
46
55
  end
47
56
 
48
57
  # Default command
@@ -59,7 +68,8 @@ class RTesseract
59
68
  config.processor = config.option(options, :processor, 'rmagick')
60
69
  config.load_options(options, [:lang, :psm, :tessdata_dir, :user_words, :user_patterns])
61
70
  config.debug = config.option(options, :debug, false)
62
- config.options_cmd = [options.option(:options, nil)].flatten.compact
71
+ pdf_opts = lambda { |o| o == 'pdf' || o == :pdf }
72
+ config.options_cmd = [options.option(:options, nil)].delete_if(&pdf_opts).flatten.compact
63
73
  end
64
74
  end
65
- end
75
+ end
@@ -12,4 +12,10 @@ class RTesseract
12
12
  class ConversionError < ErrorWithMemory; end
13
13
  class ImageNotSelectedError < ErrorWithMemory; end
14
14
  class TempFilesNotRemovedError < ErrorWithMemory; end
15
+
16
+ class TesseractVersionError < StandardError
17
+ def initialize
18
+ super "Tesseract version is unknown or below 3.03 which is required for pdf output."
19
+ end
20
+ end
15
21
  end
@@ -1,3 +1,5 @@
1
+ require 'open3'
2
+
1
3
  # RTesseract
2
4
  class RTesseract
3
5
  # Some utils methods
@@ -22,6 +24,14 @@ class RTesseract
22
24
  end
23
25
  true
24
26
  end
27
+
28
+ # Extract tesseract version number
29
+ def self.version_number
30
+ out, err, st = Open3.capture3(RTesseract.default_command, "--version")
31
+
32
+ version = err.split("\n")[0].split(" ")[1].split('.')[0, 2].join('.')
33
+ Float(version) rescue nil
34
+ end
25
35
  end
26
36
  end
27
37
 
@@ -31,4 +41,4 @@ class Hash
31
41
  def option(attr_name, default)
32
42
  delete(attr_name.to_s) || delete(attr_name) || default
33
43
  end
34
- end
44
+ end
data/rtesseract.gemspec CHANGED
@@ -2,16 +2,16 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: rtesseract 2.0.1 ruby lib
5
+ # stub: rtesseract 2.1.0 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "rtesseract"
9
- s.version = "2.0.1"
9
+ s.version = "2.1.0"
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib"]
13
13
  s.authors = ["Danilo Jeremias da Silva"]
14
- s.date = "2016-05-17"
14
+ s.date = "2016-09-08"
15
15
  s.description = "Ruby library for working with the Tesseract OCR."
16
16
  s.email = "dannnylo@gmail.com"
17
17
  s.extra_rdoc_files = [
@@ -48,6 +48,7 @@ Gem::Specification.new do |s|
48
48
  "spec/images/mixed.tif",
49
49
  "spec/images/orientation_reverse.png",
50
50
  "spec/images/test with spaces.tif",
51
+ "spec/images/test-pdf.png",
51
52
  "spec/images/test.bmp",
52
53
  "spec/images/test.jpg",
53
54
  "spec/images/test.png",
Binary file
@@ -13,6 +13,7 @@ describe 'Rtesseract' do
13
13
  before do
14
14
  @path = Pathname.new(__FILE__.gsub('rtesseract_spec.rb', '')).expand_path
15
15
  @image_tif = @path.join('images', 'test.tif').to_s
16
+ @image_for_pdf = @path.join('images', 'test-pdf.png').to_s
16
17
  end
17
18
 
18
19
  it ' be instantiable' do
@@ -94,12 +95,43 @@ describe 'Rtesseract' do
94
95
  expect(RTesseract.new(@image_tif, options: [:digits, :quiet]).options_cmd).to eql([:digits, :quiet])
95
96
  end
96
97
 
98
+ it ' support pdf output mode' do
99
+ # Internal test. Consider 'pdf' option only when #to_pdf is called.
100
+ expect(RTesseract.new(@image_tif, options: 'pdf').options_cmd).to eql([])
101
+ expect(RTesseract.new(@image_for_pdf, options: :pdf).options_cmd).to eql([])
102
+
103
+ pdf_ocr = RTesseract.new(@image_for_pdf)
104
+ expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
105
+ expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
106
+ # Comment next line and go to tmp dir to see generated pdf.
107
+ expect(pdf_ocr.clean).to eq(true)
108
+ expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
109
+
110
+ # Still have original functionality (i.e. #to_s, #to_s_without_spaces).
111
+ pdf_ocr = RTesseract.new(@image_tif)
112
+ expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
113
+ expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
114
+ expect(pdf_ocr.to_s_without_spaces).to eql('43XF')
115
+ expect(pdf_ocr.clean).to eq(true)
116
+ expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
117
+ end
118
+
119
+ it ' warn when tesseract cannot give pdf' do
120
+ rtesseract = RTesseract.new(@image_for_pdf)
121
+
122
+ allow(rtesseract).to receive(:tesseract_version).and_return(3.02)
123
+ expect { rtesseract.to_pdf }.to raise_error(RTesseract::TesseractVersionError)
124
+
125
+ allow(rtesseract).to receive(:tesseract_version).and_return(3.03)
126
+ expect { rtesseract.to_pdf }.not_to raise_error
127
+ end
128
+
97
129
  it ' be configurable' do
98
130
  expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0, display_text: 0).config).to eql("chop_enable 0\nenable_assoc 0\ndisplay_text 0")
99
131
  expect(RTesseract.new(@image_tif, chop_enable: 0).config).to eql('chop_enable 0')
100
132
  expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0).config).to eql("chop_enable 0\nenable_assoc 0")
101
133
  expect(RTesseract.new(@image_tif, chop_enable: 0).to_s_without_spaces).to eql('43XF')
102
- expect(RTesseract.new(@image_tif, tessedit_char_whitelist: "ABCDEF12345").to_s_without_spaces).to eql('43F')
134
+ expect(RTesseract.new(@image_tif, tessedit_char_whitelist: 'ABCDEF12345').to_s_without_spaces).to eql('43F')
103
135
  end
104
136
 
105
137
  it ' crop image' do
@@ -179,7 +211,11 @@ describe 'Rtesseract' do
179
211
  expect { RTesseract::Utils.remove_files(Pathname.new(Dir.tmpdir).join('test_not_exists')) }.to raise_error(RTesseract::TempFilesNotRemovedError)
180
212
  end
181
213
 
182
- it ' support default config processors' do
214
+ it ' get a numeric value for tesseract version' do
215
+ expect(RTesseract::Utils.version_number).to be_a Float
216
+ end
217
+
218
+ it ' support default config processors' do
183
219
  # Rmagick
184
220
  RTesseract.configure { |config| config.processor = 'rmagick' }
185
221
  expect(RTesseract.new(@image_tif).processor.a_name?('rmagick')).to eql(true)
@@ -212,6 +248,18 @@ describe 'Rtesseract' do
212
248
  expect(RTesseract.new(@image_tif).user_patterns).to eql(' --user-patterns /tmp/test ')
213
249
  end
214
250
 
251
+ it ' configure pdf has no effect and kept in-house' do
252
+ # So it does not interfere with #to_s outputting.
253
+ RTesseract.configure { |config| config.options_cmd = ['pdf'] }
254
+ expect(RTesseract.new(@image_tif).options_cmd).to eql([])
255
+
256
+ RTesseract.configure { |config| config.options_cmd = [:pdf] }
257
+ expect(RTesseract.new(@image_tif).options_cmd).to eql([])
258
+
259
+ RTesseract.configure { |config| config.options_cmd = [:pdf, 'pdf'] }
260
+ expect(RTesseract.new(@image_tif).options_cmd).to eql([])
261
+ end
262
+
215
263
  it ' support new configs' do
216
264
  expect(RTesseract.new(@image_tif, tessdata_dir: '/tmp/test').tessdata_dir).to eql(' --tessdata-dir /tmp/test ')
217
265
  expect(RTesseract.new(@image_tif, user_words: '/tmp/test').user_words).to eql(' --user-words /tmp/test ')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rtesseract
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danilo Jeremias da Silva
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-17 00:00:00.000000000 Z
11
+ date: 2016-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -145,6 +145,7 @@ files:
145
145
  - spec/images/mixed.tif
146
146
  - spec/images/orientation_reverse.png
147
147
  - spec/images/test with spaces.tif
148
+ - spec/images/test-pdf.png
148
149
  - spec/images/test.bmp
149
150
  - spec/images/test.jpg
150
151
  - spec/images/test.png