RubyGems - rtesseract - Versions diffs - 2.0.1 → 2.1.0 - Mend

rtesseract 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

checksums.yaml +4 -4
data/.travis.yml +2 -1
data/CHANGELOG.md +6 -0
data/README.rdoc +16 -1
data/VERSION +1 -1
data/lib/rtesseract.rb +60 -13
data/lib/rtesseract/box.rb +2 -2
data/lib/rtesseract/box_char.rb +1 -1
data/lib/rtesseract/configuration.rb +12 -2
data/lib/rtesseract/errors.rb +6 -0
data/lib/rtesseract/utils.rb +11 -1
data/rtesseract.gemspec +4 -3
data/spec/images/test-pdf.png +0 -0
data/spec/rtesseract_spec.rb +50 -2
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6eae58279cf744227e79b7bbc9180f7aea852547
-  data.tar.gz: 3836aa96d24b7f1a0b957cf803553f547cc33544
+  metadata.gz: db992168bb87c6c4f3124403f9417c4cd46aca3e
+  data.tar.gz: b329a5ebc7316f28f63bcd7d84aa575cb661b3b9
 SHA512:
-  metadata.gz: 0ef57359c7c7f43094a50838b6d29d28d7808c9cadd8f2b8514c613be030161f8d640c41ba3d403c00fb59fdf85ffcbc57795f6c65b8418ad348eb1a6c07e901
-  data.tar.gz: ff5f0f94c8039bd0b38b0c9ec2618b4c38b07b9707e28ff29a3bb943abc85d5afaa543dfba1ba2b9e565d056ea558eda9b7f6d222a6adb43614cd86c6e8fdcac
+  metadata.gz: a74dc8fd03a678ecdff1425bc188173615da59b636cbe9ceb072353a1e4d3f2aa076b9787007ef9bfebb9c928e2a170d8c0149f864741c888c6d74ea04a78889
+  data.tar.gz: 97d3bfea1ae54841ce68427893e8e69fa358a3018ba80b8f06d23759abe0d89c81e481c5775699b3c0e98234fb39dfb898a42b833c48b8ac42128c7c7c292b88

data/.travis.yml CHANGED Viewed

@@ -1,10 +1,11 @@
+sudo: required
+dist: trusty
 language: ruby
 addons:
   apt:
     packages:
     - tesseract-ocr
-sudo: false
 rvm:
   - 1.9.3
   - 2.0.0

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,9 @@
+## v2.1.0
+#### Added
+* Support to generate searchable PDF
 ## v2.0.1
 #### Changed

data/README.rdoc CHANGED Viewed

@@ -16,6 +16,8 @@ To work properly rtesseract are needed:
 Atention: Version 1.0.0 works fine with Ruby 2.0 and tesseract 3.0 and lower versions of rtesseract works fine with Ruby 1.8 and tesseract 2.0.4.
+PDF support requires a newer version of tesseract, specifically V.3.03 or above.
 == EXAMPLE USAGE
 It's very simple to use rtesseract:
@@ -23,7 +25,19 @@ It's very simple to use rtesseract:
 === CONVERT IMAGE TO STRING
   image = RTesseract.new("my_image.jpg")
-  image.to_s #Getting the value
+  image.to_s # Getting the value
+=== CONVERT IMAGE TO SEARCHABLE PDF
+  image = RTesseract.new("my_image.jpg")
+  image.to_pdf  # Getting the pdf path
+  image.to_s    # Still can get the value only.
+  # ...
+  # some stuff
+  # ...
+  image.clean   # to delete file once finished
+This will preserve the image colors, pictures and structure in the generated pdf.
 === CHANGE THE IMAGE
@@ -89,6 +103,7 @@ Language Options
      * por   - Portuguese
      * spa   - Spanish
      * vie   - Vietnamese
+     * or any other supported by tesseract.
   Note: Make sure you have installed the language to tesseract
 Other Options

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-2.0.1
+2.1.0

data/lib/rtesseract.rb CHANGED Viewed

@@ -15,10 +15,9 @@ class RTesseract
   def initialize(src = '', options = {})
     self.configuration = RTesseract.local_config(options)
     @options = options || {}
-    @value = nil
     @points = {}
     @processor = RTesseract::Processor.choose_processor!(configuration.processor)
-    @source = @processor.image?(src) ? src : Pathname.new(src)
+    self.source = src
     initialize_hook
   end
@@ -29,6 +28,7 @@ class RTesseract
   # Define the source
   def source=(src)
     @value = nil
+    @pdf_path = nil
     @source = @processor.image?(src) ? src : Pathname.new(src)
   end
@@ -127,24 +127,50 @@ class RTesseract
     '.txt'
   end
+  # Detect version number
+  def tesseract_version
+    RTesseract::Utils.version_number
+  end
   # Rand file path
-  def text_file
-    @text_file = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
+  def file_dest
+    @file_dest = Pathname.new(Dir.tmpdir).join("#{Time.now.to_f}#{rand(1500)}").to_s
   end
-  # Full path of file with extension
-  def text_file_with_ext(ext = nil)
-    [@text_file, ext || file_ext].join('')
+  # Full path of file with txt extension
+  def file_with_ext(ext = nil)
+    [@file_dest, ext || file_ext].join('')
   end
   # Run command
   def convert_command
-    `#{configuration.command} "#{image}" "#{text_file}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{configuration.options_cmd.join(' ')}`
+    `#{configuration.command} "#{image}" "#{file_dest}" #{lang} #{psm} #{tessdata_dir} #{user_words} #{user_patterns} #{config_file} #{clear_console_output} #{options_cmd.join(' ')}`
+  end
+  # Is pdf output?
+  def pdf?
+    options_cmd.include? 'pdf'
   end
   # Read result file
   def convert_text
-    @value = File.read(text_file_with_ext).to_s
+    @value = File.read(file_with_ext).to_s
+  end
+  # Store pdf result path
+  def convert_pdf
+    @pdf_path = file_with_ext('.pdf')
+  end
+  # Convert result to proper type
+  def convert_result
+    if pdf?
+      convert_pdf
+    else
+      convert_text
+      RTesseract::Utils.remove_files([@image, file_with_ext])
+    end
   end
   # Hook to convert
@@ -155,15 +181,14 @@ class RTesseract
   def convert
     convert_command
     after_convert_hook
-    convert_text
-    RTesseract::Utils.remove_files([@image, text_file_with_ext])
+    convert_result
   rescue => error
     raise RTesseract::ConversionError.new(error), error, caller
   end
   # Output value
   def to_s
-    return @value if @value != nil
+    return @value if @value
     if @processor.image?(@source) || @source.file?
       convert
@@ -175,8 +200,30 @@ class RTesseract
   # Remove spaces and break-lines
   def to_s_without_spaces
-    to_s.delete(' ').delete("\n").delete("\r")
+    to_s.gsub(/\s/, '')
   end
+  # Output pdf path
+  def to_pdf
+    return @pdf_path if @pdf_path
+    fail TesseractVersionError.new if tesseract_version.nil? || tesseract_version < 3.03
+    if @processor.image?(@source) || @source.file?
+      options_cmd << 'pdf'
+      convert
+      options_cmd.delete('pdf')
+      @pdf_path
+    else
+      fail RTesseract::ImageNotSelectedError.new(@source)
+    end
+  end
+  # Destroy pdf file
+  def clean
+    RTesseract::Utils.remove_files([@pdf_path])
+  end
 end
 require 'rtesseract/mixed'

data/lib/rtesseract/box.rb CHANGED Viewed

@@ -29,7 +29,7 @@ class RTesseract
     # Read the result file
     def parse_file
-      html = Nokogiri::HTML(File.read(text_file_with_ext))
+      html = Nokogiri::HTML(File.read(file_with_ext))
       html.css('span.ocrx_word, span.ocr_word')
     end
@@ -42,7 +42,7 @@ class RTesseract
     # Move file html to hocr
     def after_convert_hook
-      FileUtils.mv(text_file_with_ext('.html'), text_file_with_ext) rescue nil
+      FileUtils.mv(file_with_ext('.html'), file_with_ext) rescue nil
     end
     # Output value

data/lib/rtesseract/box_char.rb CHANGED Viewed

@@ -16,7 +16,7 @@ class RTesseract
     # Read the result file
     def parse_file
-      File.read(text_file_with_ext).to_s
+      File.read(file_with_ext).to_s
     end
     def convert_text

data/lib/rtesseract/configuration.rb CHANGED Viewed

@@ -43,6 +43,15 @@ class RTesseract
   def self.configure
     self.configuration ||= Configuration.new
     yield(configuration)
+    self.clear_pdf_option
+  end
+  # Clear pdf option
+  def self.clear_pdf_option
+    if self.configuration.options_cmd
+      self.configuration.options_cmd.delete('pdf')
+      self.configuration.options_cmd.delete(:pdf)
+    end
   end
   # Default command
@@ -59,7 +68,8 @@ class RTesseract
       config.processor = config.option(options, :processor, 'rmagick')
       config.load_options(options, [:lang, :psm, :tessdata_dir, :user_words, :user_patterns])
       config.debug = config.option(options, :debug, false)
-      config.options_cmd = [options.option(:options, nil)].flatten.compact
+      pdf_opts = lambda { |o| o == 'pdf' || o == :pdf }
+      config.options_cmd = [options.option(:options, nil)].delete_if(&pdf_opts).flatten.compact
     end
   end
-end
+end

data/lib/rtesseract/errors.rb CHANGED Viewed

@@ -12,4 +12,10 @@ class RTesseract
   class ConversionError < ErrorWithMemory; end
   class ImageNotSelectedError < ErrorWithMemory; end
   class TempFilesNotRemovedError < ErrorWithMemory; end
+  class TesseractVersionError < StandardError
+    def initialize
+      super "Tesseract version is unknown or below 3.03 which is required for pdf output."
+    end
+  end
 end

data/lib/rtesseract/utils.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'open3'
 # RTesseract
 class RTesseract
   # Some utils methods
@@ -22,6 +24,14 @@ class RTesseract
       end
       true
     end
+    # Extract tesseract version number
+    def self.version_number
+      out, err, st = Open3.capture3(RTesseract.default_command, "--version")
+      version = err.split("\n")[0].split(" ")[1].split('.')[0, 2].join('.')
+      Float(version) rescue nil
+    end
   end
 end
@@ -31,4 +41,4 @@ class Hash
   def option(attr_name, default)
     delete(attr_name.to_s) || delete(attr_name) || default
   end
-end
+end

data/rtesseract.gemspec CHANGED Viewed

@@ -2,16 +2,16 @@
 # DO NOT EDIT THIS FILE DIRECTLY
 # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 # -*- encoding: utf-8 -*-
-# stub: rtesseract 2.0.1 ruby lib
+# stub: rtesseract 2.1.0 ruby lib
 Gem::Specification.new do |s|
   s.name = "rtesseract"
-  s.version = "2.0.1"
+  s.version = "2.1.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.require_paths = ["lib"]
   s.authors = ["Danilo Jeremias da Silva"]
-  s.date = "2016-05-17"
+  s.date = "2016-09-08"
   s.description = "Ruby library for working with the Tesseract OCR."
   s.email = "dannnylo@gmail.com"
   s.extra_rdoc_files = [
@@ -48,6 +48,7 @@ Gem::Specification.new do |s|
     "spec/images/mixed.tif",
     "spec/images/orientation_reverse.png",
     "spec/images/test with spaces.tif",
+    "spec/images/test-pdf.png",
     "spec/images/test.bmp",
     "spec/images/test.jpg",
     "spec/images/test.png",

data/spec/images/test-pdf.png ADDED Viewed

Binary file

data/spec/rtesseract_spec.rb CHANGED Viewed

@@ -13,6 +13,7 @@ describe 'Rtesseract' do
   before do
     @path = Pathname.new(__FILE__.gsub('rtesseract_spec.rb', '')).expand_path
     @image_tif = @path.join('images', 'test.tif').to_s
+    @image_for_pdf = @path.join('images', 'test-pdf.png').to_s
   end
   it ' be instantiable' do
@@ -94,12 +95,43 @@ describe 'Rtesseract' do
     expect(RTesseract.new(@image_tif, options: [:digits, :quiet]).options_cmd).to eql([:digits, :quiet])
   end
+  it ' support pdf output mode' do
+    # Internal test. Consider 'pdf' option only when #to_pdf is called.
+    expect(RTesseract.new(@image_tif, options: 'pdf').options_cmd).to eql([])
+    expect(RTesseract.new(@image_for_pdf, options: :pdf).options_cmd).to eql([])
+    pdf_ocr = RTesseract.new(@image_for_pdf)
+    expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
+    expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
+    # Comment next line and go to tmp dir to see generated pdf.
+    expect(pdf_ocr.clean).to eq(true)
+    expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
+    # Still have original functionality (i.e. #to_s, #to_s_without_spaces).
+    pdf_ocr = RTesseract.new(@image_tif)
+    expect(File.exists?(pdf_ocr.to_pdf)).to eql(true)
+    expect(File.extname(pdf_ocr.to_pdf)).to eql('.pdf')
+    expect(pdf_ocr.to_s_without_spaces).to eql('43XF')
+    expect(pdf_ocr.clean).to eq(true)
+    expect(File.exists?(pdf_ocr.to_pdf)).to eql(false)
+  end
+  it ' warn when tesseract cannot give pdf' do
+    rtesseract = RTesseract.new(@image_for_pdf)
+    allow(rtesseract).to receive(:tesseract_version).and_return(3.02)
+    expect { rtesseract.to_pdf }.to raise_error(RTesseract::TesseractVersionError)
+    allow(rtesseract).to receive(:tesseract_version).and_return(3.03)
+    expect { rtesseract.to_pdf }.not_to raise_error
+  end
   it ' be configurable' do
     expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0, display_text: 0).config).to eql("chop_enable 0\nenable_assoc 0\ndisplay_text 0")
     expect(RTesseract.new(@image_tif, chop_enable: 0).config).to eql('chop_enable 0')
     expect(RTesseract.new(@image_tif, chop_enable: 0, enable_assoc: 0).config).to eql("chop_enable 0\nenable_assoc 0")
     expect(RTesseract.new(@image_tif, chop_enable: 0).to_s_without_spaces).to eql('43XF')
-    expect(RTesseract.new(@image_tif, tessedit_char_whitelist: "ABCDEF12345").to_s_without_spaces).to eql('43F')
+    expect(RTesseract.new(@image_tif, tessedit_char_whitelist: 'ABCDEF12345').to_s_without_spaces).to eql('43F')
   end
   it ' crop image' do
@@ -179,7 +211,11 @@ describe 'Rtesseract' do
     expect { RTesseract::Utils.remove_files(Pathname.new(Dir.tmpdir).join('test_not_exists')) }.to raise_error(RTesseract::TempFilesNotRemovedError)
   end
-  it ' support  default config processors' do
+  it ' get a numeric value for tesseract version' do
+    expect(RTesseract::Utils.version_number).to be_a Float
+  end
+  it ' support default config processors' do
     # Rmagick
     RTesseract.configure { |config| config.processor = 'rmagick' }
     expect(RTesseract.new(@image_tif).processor.a_name?('rmagick')).to eql(true)
@@ -212,6 +248,18 @@ describe 'Rtesseract' do
     expect(RTesseract.new(@image_tif).user_patterns).to eql(' --user-patterns /tmp/test ')
   end
+  it ' configure pdf has no effect and kept in-house' do
+    # So it does not interfere with #to_s outputting.
+    RTesseract.configure { |config| config.options_cmd =  ['pdf'] }
+    expect(RTesseract.new(@image_tif).options_cmd).to eql([])
+    RTesseract.configure { |config| config.options_cmd = [:pdf] }
+    expect(RTesseract.new(@image_tif).options_cmd).to eql([])
+    RTesseract.configure { |config| config.options_cmd = [:pdf, 'pdf'] }
+    expect(RTesseract.new(@image_tif).options_cmd).to eql([])
+  end
   it ' support new configs' do
     expect(RTesseract.new(@image_tif, tessdata_dir: '/tmp/test').tessdata_dir).to eql(' --tessdata-dir /tmp/test ')
     expect(RTesseract.new(@image_tif, user_words: '/tmp/test').user_words).to eql(' --user-words /tmp/test ')

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rtesseract
 version: !ruby/object:Gem::Version
-  version: 2.0.1
+  version: 2.1.0
 platform: ruby
 authors:
 - Danilo Jeremias da Silva
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-05-17 00:00:00.000000000 Z
+date: 2016-09-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -145,6 +145,7 @@ files:
 - spec/images/mixed.tif
 - spec/images/orientation_reverse.png
 - spec/images/test with spaces.tif
+- spec/images/test-pdf.png
 - spec/images/test.bmp
 - spec/images/test.jpg
 - spec/images/test.png