rtesseract 2.0.1 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6eae58279cf744227e79b7bbc9180f7aea852547
4
- data.tar.gz: 3836aa96d24b7f1a0b957cf803553f547cc33544
3
+ metadata.gz: db992168bb87c6c4f3124403f9417c4cd46aca3e
4
+ data.tar.gz: b329a5ebc7316f28f63bcd7d84aa575cb661b3b9
5
5
  SHA512:
6
- metadata.gz: 0ef57359c7c7f43094a50838b6d29d28d7808c9cadd8f2b8514c613be030161f8d640c41ba3d403c00fb59fdf85ffcbc57795f6c65b8418ad348eb1a6c07e901
7
- data.tar.gz: ff5f0f94c8039bd0b38b0c9ec2618b4c38b07b9707e28ff29a3bb943abc85d5afaa543dfba1ba2b9e565d056ea558eda9b7f6d222a6adb43614cd86c6e8fdcac
6
+ metadata.gz: a74dc8fd03a678ecdff1425bc188173615da59b636cbe9ceb072353a1e4d3f2aa076b9787007ef9bfebb9c928e2a170d8c0149f864741c888c6d74ea04a78889
7
+ data.tar.gz: 97d3bfea1ae54841ce68427893e8e69fa358a3018ba80b8f06d23759abe0d89c81e481c5775699b3c0e98234fb39dfb898a42b833c48b8ac42128c7c7c292b88
data/.travis.yml CHANGED
@@ -1,10 +1,11 @@
1
+ sudo: required
2
+ dist: trusty
1
3
  language: ruby
2
4
  addons:
3
5
  apt:
4
6
  packages:
5
7
  - tesseract-ocr
6
8
 
7
- sudo: false
8
9
  rvm:
9
10
  - 1.9.3
10
11
  - 2.0.0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## v2.1.0
2
+
3
+ #### Added
4
+
5
+ * Support for generating searchable PDFs
6
+
1
7
  ## v2.0.1
2
8
 
3
9
  #### Changed
data/README.rdoc CHANGED
@@ -16,6 +16,8 @@ To work properly rtesseract are needed:
16
16
 
17
17
  Attention: Version 1.0.0 works fine with Ruby 2.0 and tesseract 3.0, and lower versions of rtesseract work fine with Ruby 1.8 and tesseract 2.0.4.
18
18
 
19
+ PDF support requires a newer version of tesseract, specifically version 3.03 or above.
20
+
19
21
  == EXAMPLE USAGE
20
22
 
21
23
  It's very simple to use rtesseract:
@@ -23,7 +25,19 @@ It's very simple to use rtesseract:
23
25
  === CONVERT IMAGE TO STRING
24
26
 
25
27
  image = RTesseract.new("my_image.jpg")
26
- image.to_s #Getting the value
28
+ image.to_s # Getting the value
29
+
30
+ === CONVERT IMAGE TO SEARCHABLE PDF
31
+
32
+ image = RTesseract.new("my_image.jpg")
33
+ image.to_pdf # Getting the pdf path
34
+ image.to_s # The text value is still available
35
+ # ...
36
+ # some stuff
37
+ # ...
38
+ image.clean # to delete file once finished
39
+
40
+ This will preserve the image colors, pictures and structure in the generated pdf.
27
41
 
28
42
  === CHANGE THE IMAGE
29
43
 
@@ -89,6 +103,7 @@ Language Options
89
103
  * por - Portuguese
90
104
  * spa - Spanish
91
105
  * vie - Vietnamese
106
+ * or any other language supported by tesseract.
92
107
  Note: Make sure you have installed the language to tesseract
93
108
 
94
109
  Other Options
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.0.1
1
+ 2.1.0
data/lib/rtesseract.rb CHANGED
@@ -15,10 +15,9 @@ class RTesseract
15
15
  def initialize(src = '', options = {})
16
16
  self.configuration = RTesseract.local_config(options)
17
17
  @options = options || {}
18
- @value = nil
19
18
  @points = {}
20
19
  @processor = RTesseract::Processor.choose_processor!(configuration.processor)
21
- @source = @processor.image?(src) ? src : Pathname.new(src)
20
+ self.source = src
22
21
  initialize_hook
23
22
  end
24
23
 
@@ -29,6 +28,7 @@ class RTesseract
29
28
  # Define the source
30
29
  def source=(src)
31
30
  @value = nil
31
+ @pdf_path = nil
32
32
  @source = @processor.image?(src) ? src : Pathname.new(src)
33
33
  end
34
34
 
@@ -127,24 +127,50 @@ class RTesseract
127
127
  '.txt'
128
128
  end
129
129
 
130
+ # Detect version number
131
+ def tesseract_version
132
+ RTesseract::Utils.version_number
133
+ end
134
+
135
+
130
136
  # Rand file path
131
- def text_file
132
- @text_file = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
137
+ def file_dest
138
+ @file_dest = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
133
139
  end
134
140
 
135
- # Full path of file with extension
136
- def text_file_with_ext(ext = nil)
137
- [@text_file, ext || file_ext].join('')
141
+ # Full path of file with the given extension (defaults to file_ext)
142
+ def file_with_ext(ext = nil)
143
+ [@file_dest, ext || file_ext].join('')
138
144
  end
139
145
 
140
146
  # Run command
141
147
  def convert_command
142
- `#{configuration.command} "#{image}" "#{text_file}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{configuration.options_cmd.join(' ')}`
148
+ `#{configuration.command} "#{image}" "#{file_dest}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}`
149
+ end
150
+
151
+ # Is pdf output?
152
+ def pdf?
153
+ options_cmd.include? 'pdf'
143
154
  end
144
155
 
145
156
  # Read result file
146
157
  def convert_text
147
- @value = File.read(text_file_with_ext).to_s
158
+ @value = File.read(file_with_ext).to_s
159
+ end
160
+
161
+ # Store pdf result path
162
+ def convert_pdf
163
+ @pdf_path = file_with_ext('.pdf')
164
+ end
165
+
166
+ # Convert result to proper type
167
+ def convert_result
168
+ if pdf?
169
+ convert_pdf
170
+ else
171
+ convert_text
172
+ RTesseract::Utils.remove_files([@image, file_with_ext])
173
+ end
148
174
  end
149
175
 
150
176
  # Hook to convert
@@ -155,15 +181,14 @@ class RTesseract
155
181
  def convert
156
182
  convert_command
157
183
  after_convert_hook
158
- convert_text
159
- RTesseract::Utils.remove_files([@image, text_file_with_ext])
184
+ convert_result
160
185
  rescue => error
161
186
  raise RTesseract::ConversionError.new(error), error, caller
162
187
  end
163
188
 
164
189
  # Output value
165
190
  def to_s
166
- return @value if @value != nil
191
+ return @value if @value
167
192
 
168
193
  if @processor.image?(@source) || @source.file?
169
194
  convert
@@ -175,8 +200,30 @@ class RTesseract
175
200
 
176
201
  # Remove spaces and break-lines
177
202
  def to_s_without_spaces
178
- to_s.delete(' ').delete("\n").delete("\r")
203
+ to_s.gsub(/\s/, '')
179
204
  end
205
+
206
+ # Output pdf path
207
+ def to_pdf
208
+ return @pdf_path if @pdf_path
209
+
210
+ fail TesseractVersionError.new if tesseract_version.nil? || tesseract_version < 3.03
211
+
212
+ if @processor.image?(@source) || @source.file?
213
+ options_cmd << 'pdf'
214
+ convert
215
+ options_cmd.delete('pdf')
216
+ @pdf_path
217
+ else
218
+ fail RTesseract::ImageNotSelectedError.new(@source)
219
+ end
220
+ end
221
+
222
+ # Destroy pdf file
223
+ def clean
224
+ RTesseract::Utils.remove_files([@pdf_path])
225
+ end
226
+
180
227
  end
181
228
 
182
229
  require 'rtesseract/mixed'
@@ -29,7 +29,7 @@ class RTesseract
29
29
 
30
30
  # Read the result file
31
31
  def parse_file
32
- html = Nokogiri::HTML(File.read(text_file_with_ext))
32
+ html = Nokogiri::HTML(File.read(file_with_ext))
33
33
  html.css('span.ocrx_word, span.ocr_word')
34
34
  end
35
35
 
@@ -42,7 +42,7 @@ class RTesseract
42
42
 
43
43
  # Move file html to hocr
44
44
  def after_convert_hook
45
- FileUtils.mv(text_file_with_ext('.html'), text_file_with_ext) rescue nil
45
+ FileUtils.mv(file_with_ext('.html'), file_with_ext) rescue nil
46
46
  end
47
47
 
48
48
  # Output value
@@ -16,7 +16,7 @@ class RTesseract
16
16
 
17
17
  # Read the result file
18
18
  def parse_file
19
- File.read(text_file_with_ext).to_s
19
+ File.read(file_with_ext).to_s
20
20
  end
21
21
 
22
22
  def convert_text
@@ -43,6 +43,15 @@ class RTesseract
43
43
  def self.configure
44
44
  self.configuration ||= Configuration.new
45
45
  yield(configuration)
46
+ self.clear_pdf_option
47
+ end
48
+
49
+ # Clear pdf option
50
+ def self.clear_pdf_option
51
+ if self.configuration.options_cmd
52
+ self.configuration.options_cmd.delete('pdf')
53
+ self.configuration.options_cmd.delete(:pdf)
54
+ end
46
55
  end
47
56
 
48
57
  # Default command
@@ -59,7 +68,8 @@ class RTesseract
59
68
  config.processor = config.option(options, :processor, 'rmagick')
60
69
  config.load_options(options, [:lang, :psm, :tessdata_dir, :user_words, :user_patterns])
61
70
  config.debug = config.option(options, :debug, false)
62
- config.options_cmd = [options.option(:options, nil)].flatten.compact
71
+ pdf_opts = lambda { |o| o == 'pdf' || o == :pdf }
72
+ config.options_cmd = [options.option(:options, nil)].delete_if(&pdf_opts).flatten.compact
63
73
  end
64
74
  end
65
- end
75
+ end
@@ -12,4 +12,10 @@ class RTesseract
12
12
  class ConversionError < ErrorWithMemory; end
13
13
  class ImageNotSelectedError < ErrorWithMemory; end
14
14
  class TempFilesNotRemovedError < ErrorWithMemory; end
15
+
16
+ class TesseractVersionError < StandardError
17
+ def initialize
18
+ super "Tesseract version is unknown or below 3.03 which is required for pdf output."
19
+ end
20
+ end
15
21
  end
@@ -1,3 +1,5 @@
1
+ require 'open3'
2
+
1
3
  # RTesseract
2
4
  class RTesseract
3
5
  # Some utils methods
@@ -22,6 +24,14 @@ class RTesseract
22
24
  end
23
25
  true
24
26
  end
27
+
28
+ # Extract tesseract version number
29
+ def self.version_number
30
+ out, err, st = Open3.capture3(RTesseract.default_command, "--version")
31
+
32
+ version = err.split("\n")[0].split(" ")[1].split('.')[0, 2].join('.')
33
+ Float(version) rescue nil
34
+ end
25
35
  end
26
36
  end
27
37
 
@@ -31,4 +41,4 @@ class Hash
31
41
  def option(attr_name, default)
32
42
  delete(attr_name.to_s) || delete(attr_name) || default
33
43
  end
34
- end
44
+ end
data/rtesseract.gemspec CHANGED
@@ -2,16 +2,16 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: rtesseract 2.0.1 ruby lib
5
+ # stub: rtesseract 2.1.0 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "rtesseract"
9
- s.version = "2.0.1"
9
+ s.version = "2.1.0"
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib"]
13
13
  s.authors = ["Danilo Jeremias da Silva"]
14
- s.date = "2016-05-17"
14
+ s.date = "2016-09-08"
15
15
  s.description = "Ruby library for working with the Tesseract OCR."
16
16
  s.email = "dannnylo@gmail.com"
17
17
  s.extra_rdoc_files = [
@@ -48,6 +48,7 @@ Gem::Specification.new do |s|
48
48
  "spec/images/mixed.tif",
49
49
  "spec/images/orientation_reverse.png",
50
50
  "spec/images/test with spaces.tif",
51
+ "spec/images/test-pdf.png",
51
52
  "spec/images/test.bmp",
52
53
  "spec/images/test.jpg",
53
54
  "spec/images/test.png",
Binary file
@@ -13,6 +13,7 @@ describe 'Rtesseract' do
13
13
  before do
14
14
  @path = Pathname.new(__FILE__.gsub('rtesseract_spec.rb', '')).expand_path
15
15
  @image_tif = @path.join('images', 'test.tif').to_s
16
+ @image_for_pdf = @path.join('images', 'test-pdf.png').to_s
16
17
  end
17
18
 
18
19
  it ' be instantiable' do
@@ -94,12 +95,43 @@ describe 'Rtesseract' do
94
95
  expect(RTesseract.new(@image_tif, options: [:digits, :quiet]).options_cmd).to eql([:digits, :quiet])
95
96
  end
96
97
 
98
+ it ' support pdf output mode' do
99
+ # Internal test. Consider 'pdf' option only when #to_pdf is called.
100
+ expect(RTesseract.new(@image_tif, options: 'pdf').options_cmd).to eql([])
101
+ expect(RTesseract.new(@image_for_pdf, options: :pdf).options_cmd).to eql([])
102
+
103
+ pdf_ocr = RTesseract.new(@image_for_pdf)
104
+ expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
105
+ expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
106
+ # Comment next line and go to tmp dir to see generated pdf.
107
+ expect(pdf_ocr.clean).to eq(true)
108
+ expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
109
+
110
+ # Still have original functionality (i.e. #to_s, #to_s_without_spaces).
111
+ pdf_ocr = RTesseract.new(@image_tif)
112
+ expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
113
+ expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
114
+ expect(pdf_ocr.to_s_without_spaces).to eql('43XF')
115
+ expect(pdf_ocr.clean).to eq(true)
116
+ expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
117
+ end
118
+
119
+ it ' warn when tesseract cannot give pdf' do
120
+ rtesseract = RTesseract.new(@image_for_pdf)
121
+
122
+ allow(rtesseract).to receive(:tesseract_version).and_return(3.02)
123
+ expect { rtesseract.to_pdf }.to raise_error(RTesseract::TesseractVersionError)
124
+
125
+ allow(rtesseract).to receive(:tesseract_version).and_return(3.03)
126
+ expect { rtesseract.to_pdf }.not_to raise_error
127
+ end
128
+
97
129
  it ' be configurable' do
98
130
  expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0, display_text: 0).config).to eql("chop_enable 0\nenable_assoc 0\ndisplay_text 0")
99
131
  expect(RTesseract.new(@image_tif, chop_enable: 0).config).to eql('chop_enable 0')
100
132
  expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0).config).to eql("chop_enable 0\nenable_assoc 0")
101
133
  expect(RTesseract.new(@image_tif, chop_enable: 0).to_s_without_spaces).to eql('43XF')
102
- expect(RTesseract.new(@image_tif, tessedit_char_whitelist: "ABCDEF12345").to_s_without_spaces).to eql('43F')
134
+ expect(RTesseract.new(@image_tif, tessedit_char_whitelist: 'ABCDEF12345').to_s_without_spaces).to eql('43F')
103
135
  end
104
136
 
105
137
  it ' crop image' do
@@ -179,7 +211,11 @@ describe 'Rtesseract' do
179
211
  expect { RTesseract::Utils.remove_files(Pathname.new(Dir.tmpdir).join('test_not_exists')) }.to raise_error(RTesseract::TempFilesNotRemovedError)
180
212
  end
181
213
 
182
- it ' support default config processors' do
214
+ it ' get a numeric value for tesseract version' do
215
+ expect(RTesseract::Utils.version_number).to be_a Float
216
+ end
217
+
218
+ it ' support default config processors' do
183
219
  # Rmagick
184
220
  RTesseract.configure { |config| config.processor = 'rmagick' }
185
221
  expect(RTesseract.new(@image_tif).processor.a_name?('rmagick')).to eql(true)
@@ -212,6 +248,18 @@ describe 'Rtesseract' do
212
248
  expect(RTesseract.new(@image_tif).user_patterns).to eql(' --user-patterns /tmp/test ')
213
249
  end
214
250
 
251
+ it ' configure pdf has no effect and kept in-house' do
252
+ # So it does not interfere with #to_s outputting.
253
+ RTesseract.configure { |config| config.options_cmd = ['pdf'] }
254
+ expect(RTesseract.new(@image_tif).options_cmd).to eql([])
255
+
256
+ RTesseract.configure { |config| config.options_cmd = [:pdf] }
257
+ expect(RTesseract.new(@image_tif).options_cmd).to eql([])
258
+
259
+ RTesseract.configure { |config| config.options_cmd = [:pdf, 'pdf'] }
260
+ expect(RTesseract.new(@image_tif).options_cmd).to eql([])
261
+ end
262
+
215
263
  it ' support new configs' do
216
264
  expect(RTesseract.new(@image_tif, tessdata_dir: '/tmp/test').tessdata_dir).to eql(' --tessdata-dir /tmp/test ')
217
265
  expect(RTesseract.new(@image_tif, user_words: '/tmp/test').user_words).to eql(' --user-words /tmp/test ')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rtesseract
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Danilo Jeremias da Silva
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-17 00:00:00.000000000 Z
11
+ date: 2016-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -145,6 +145,7 @@ files:
145
145
  - spec/images/mixed.tif
146
146
  - spec/images/orientation_reverse.png
147
147
  - spec/images/test with spaces.tif
148
+ - spec/images/test-pdf.png
148
149
  - spec/images/test.bmp
149
150
  - spec/images/test.jpg
150
151
  - spec/images/test.png