RubyGems - ocr-file - Versions diffs - 0.0.1 → 0.0.4 - Mend

ocr-file 0.0.1 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +24 -6
data/bin/ocr-file +1 -1
data/lib/ocr-file/cli.rb +37 -1
data/lib/ocr-file/document.rb +137 -59
data/lib/ocr-file/image_engines/image_magick.rb +73 -7
data/lib/ocr-file/image_engines/pdftoppm.rb +1 -1
data/lib/ocr-file/text_engines/result_processor.rb +34 -0
data/lib/ocr-file/version.rb +1 -1
data/lib/ocr-file.rb +1 -0
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6b558ce36c35e74b410f42928eae1a987485d1bbd64da77750574062bc05b91e
-  data.tar.gz: d906c620a02c5a2d139b3d89d05e9b3872ee6c929b4aa661b20f9033d8f3605f
+  metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
+  data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
 SHA512:
-  metadata.gz: 81049908609ba3d622be2b6f99dabeca2960a455fa3d56ee1fca4c177c2ee4365281421c1128ec3fa5476d068daa53b3d7f7600c5fd1c31fcb5834ca688f9747
-  data.tar.gz: 1a7dcd56a7196694371abf70633635545138bdc7bc0af2873fc5e7c22bdbfc97e9986ba1a18afc288b24e488caf999b644ec1f9d8889ce6e5efa6fcfe776c204
+  metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
+  data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ocr-file (0.0.1)
+    ocr-file (0.0.4)
       active_attr (~> 0.15.4)
       console-style (~> 0.0.1)
       hexapdf (~> 0.23.0)

data/README.md CHANGED Viewed

@@ -42,15 +42,16 @@ You will need to install `tesseract` with your desired language on your system,
     type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
     ocr_engine: 'tesseract', # 'cloud-vision'
     # Image Pre-Processing
-    image_pre_preprocess: true,
-    effects: ['bw', 'norm'],
-    threshold: 0.25,
+    image_preprocess: true,
+    effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
+    automatic_reprocess: true, # Will possibly do double + the operations but can produce better results automatically
     # PDF to Image Processing
     optimise_pdf: true,
     extract_pdf_images: true, # if false will screenshot each PDF page
     temp_filename_prefix: 'image',
     # Console Output
     verbose: true,
+    timing: true,
   }
   doc = OcrFile::Document.new(
@@ -84,7 +85,23 @@ You will need to install `tesseract` with your desired language on your system,
 ### Notes / Tips
 Set `extract_pdf_images` to `false` for higher quality OCR. However this will consume more temporary space per PDF page and also be considerably slower.
-Image pre-processing is not yet implemented.
+Image pre-processing only thresholds (bw), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. This will make the end result black and white, but the OCR will be far more accurate (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
+`automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
+### Simple CLI
+Once installed, you can use `ocr-file` as a CLI. It currently supports a reduced set of options, which are subject to change in future versions.
+```
+# Basic Usage with console output
+ocr-file input_file_path output_folder_path
+# Output to PDF
+ocr-file input_file_path output_folder_path pdf
+# Output to TXT
+ocr-file input_file_path output_folder_path txt
+```
 ## Development
@@ -94,14 +111,15 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 ### TODOs
 - input validation
-- CLI
-- image processing
+- Better CLI
 - password
 - Base64 encoding
 - requirements checking (installed dependencies etc ...)
 - Tests
 - Configurable temp folder cleanup
 - Improve console output
+- Fix spaces in file names
+- Better verbosity
 ### Tests
 To run tests execute:

data/bin/ocr-file CHANGED Viewed

@@ -2,4 +2,4 @@
 require 'ocr-file'
-puts "Hello, world!"
+OcrFile::Cli.new(ARGV).call

data/lib/ocr-file/cli.rb CHANGED Viewed

@@ -1,5 +1,41 @@
 module OcrFile
-  module Cli
+  class Cli
+    attr_reader :args
+    def initialize(args)
+      @args = args
+    end
+    def valid?
+      return true if args.size == 2 || args.size == 3
+      false
+    end
+    def invalid?
+      !valid?
+    end
+    def call
+      # TODO: Use ConsoleStyle::Functions
+      # TODO: Heading and better CLI interface
+      # Simple cli for now
+      puts "OCR Tool © Jason Chalom 2022, Version: #{OcrFile::VERSION}"
+      abort "File path, Save Folder Paths, and output type (pdf, txt) are required!" if invalid?
+      # Using default config for now
+      original_file_path = args[0]
+      save_file_path = args[1]
+      output_type = args[2]
+      document = OcrFile::Document.new(original_file_path: original_file_path, save_file_path: save_file_path)
+      if output_type.to_s.downcase.include?('pdf')
+        document.to_pdf
+      elsif output_type.to_s.downcase.include?('txt') || output_type.to_s.downcase.include?('text')
+        document.to_text
+      else # Display in console
+        puts document.to_s
+      end
+    end
   end
 end

data/lib/ocr-file/document.rb CHANGED Viewed

@@ -1,7 +1,11 @@
 module OcrFile
   class Document
+    # TODO: Skewness / text orientation detection
+    # TODO: Better handwriting analysis
     ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
     PAGE_BREAK = "\n\r\n" # TODO: Make configurable
+    EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
     DEFAULT_CONFIG = {
       # Images from PDF
       filetype: 'png',
@@ -18,15 +22,16 @@ module OcrFile
       type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
       ocr_engine: 'tesseract', # 'cloud-vision'
       # Image Pre-Processing
-      image_pre_preprocess: true,
-      effects: ['bw', 'norm'],
-      threshold: 0.25,
+      image_preprocess: true,
+      effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
+      automatic_reprocess: true,
       # PDF to Image Processing
       optimise_pdf: true,
       extract_pdf_images: true, # if false will screenshot each PDF page
       temp_filename_prefix: 'image',
       # Console Output
       verbose: true,
+      timing: true,
     }
     attr_reader :original_file_path,
@@ -34,7 +39,9 @@ module OcrFile
       :save_file_path,
       :final_save_file,
       :config,
-      :ocr_engine
+      :ocr_engine,
+      :start_time,
+      :end_time
     # save_file_path will also generate a tmp path for tmp files. Expected folder path
     # TODO: Add in more input validation
@@ -52,12 +59,12 @@ module OcrFile
     end
     def pdf?
-      @original_file_path.include?('.pdf')
+      @original_file_path.downcase.include?('.pdf')
     end
     def image?
       return false if pdf?
-      ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.include?(".#{type}")}
+      ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
     end
     # Treat anything which isnt a PDF or image as text
@@ -65,74 +72,52 @@ module OcrFile
       !pdf? && !image?
     end
+    # Trigger OCR pipeline
     def to_pdf
-      if pdf?
-        create_temp_folder
-        image_paths = extract_image_paths_from_pdf(@original_file_path)
-        pdfs_to_merge = []
-        image_paths.each do |image_path|
-          pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config)
-        end
-        merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
+      @start_time = Time.now
+      find_best_image_processing if config[:automatic_reprocess] && !text?
-        OcrFile::ImageEngines::PdfEngine
-          .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
-        close
+      if pdf?
+        ocr_pdf_to_searchable_pdf
       elsif text?
-        text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
-        pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
-        OcrFile::ImageEngines::PdfEngine
-          .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+        text_to_pdf
       else # is an image
         ocr_image_to_pdf
       end
+      close
+      @end_time = Time.now
+      print_time
     end
     def to_text
-      if pdf?
-        create_temp_folder
-        image_paths = extract_image_paths_from_pdf(@original_file_path)
+      @start_time = Time.now
+      return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
-        image_paths.each do |image_path|
-          text = @ocr_engine.ocr_to_text(image_path, options: @config)
-          ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
-        end
+      find_best_image_processing(save: true)
+      close
-        close
-      elsif text?
-        ::OcrFile::FileHelpers.open_text_file(@original_file_path)
-      else # is an image
-        ocr_image_to_text(save: true)
-      end
+      @end_time = Time.now
+      print_time
     end
     def to_s
-      if pdf?
-        create_temp_folder
-        image_paths = extract_image_paths_from_pdf(@original_file_path)
+      @start_time = Time.now
+      return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
-        text = ''
+      text = find_best_image_processing(save: false)
-        image_paths.each do |image_path|
-          text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
-        end
+      close
-        close
-        text
-      elsif text?
-        ::OcrFile::FileHelpers.open_text_file(@original_file_path)
-      else # is an image
-        ocr_image_to_text(save: false)
-      end
+      @end_time = Time.now
+      print_time
+      text
     end
     def close
-      ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
+      # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
     end
     private
@@ -157,19 +142,80 @@ module OcrFile
     end
     def create_temp_folder
-      # TODO: Make this a bit more robust
-      @temp_folder_path = "#{save_file_path}/temp/".gsub(' ', '\ ')
+      date = Time.now.to_s.split(' ').first
+      @temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ')
       ::OcrFile::FileHelpers.make_directory(@temp_folder_path)
     end
+    def process_image(path)
+      return path unless @config[:image_preprocess]
+      create_temp_folder
+      save_file_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}"
+      image_processor = OcrFile::ImageEngines::ImageMagick.new(
+        image_path: path,
+        temp_path: @temp_folder_path,
+        save_file_path: save_file_path,
+        config: @config
+      )
+      image_processor.convert!
+    end
+    def ocr_pdf_to_searchable_pdf
+      create_temp_folder
+      image_paths = extract_image_paths_from_pdf(@original_file_path)
+      pdfs_to_merge = []
+      image_paths.each do |image_path|
+        pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
+      end
+      merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
+      OcrFile::ImageEngines::PdfEngine
+        .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+    end
+    def text_to_pdf
+      text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
+      pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
+      OcrFile::ImageEngines::PdfEngine
+        .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+    end
     def ocr_image_to_pdf
-      pdf_document = @ocr_engine.ocr_to_pdf(@original_file_path, options: @config)
+      find_best_image_processing if config[:automatic_reprocess]
+      pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
       OcrFile::ImageEngines::PdfEngine
         .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
     end
-    def ocr_image_to_text(save: true)
-      text = @ocr_engine.ocr_to_text(@original_file_path, options: @config)
+    def ocr_pdf_to_text(save:)
+      create_temp_folder
+      image_paths = extract_image_paths_from_pdf(@original_file_path)
+      text = ''
+      image_paths.each do |image_path|
+        text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
+      end
+      if save
+        ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
+      else
+        text
+      end
+    end
+    def ocr_image_to_text(save:)
+      create_temp_folder
+      text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
       if save
         ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -178,6 +224,38 @@ module OcrFile
       end
     end
+    def ocr_file_to_text(save:)
+      if pdf? &&
+        ocr_pdf_to_text(save: save)
+      else # is an image
+        ocr_image_to_text(save: save)
+      end
+    end
+    def find_best_image_processing(save:)
+      ocr_file_to_text(save: save) if !config[:automatic_reprocess]
+      text = ''
+      effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
+      effects_to_test.each do |effect|
+        config[:effects] = config[:effects] - [effect]
+        text = ocr_file_to_text(save: false)
+        break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
+      end
+      # Adds in extra operations which is unfortunately inefficient
+      if save
+        ocr_file_to_text(save: save)
+      else
+        text
+      end
+    end
+    def print_time
+      puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
+    end
     def find_ocr_engine(engine_id)
       ocr_engine_constants
         .map { |c| ocr_module(c) }

data/lib/ocr-file/image_engines/image_magick.rb CHANGED Viewed

@@ -1,14 +1,80 @@
 module OcrFile
   module ImageEngines
-    module ImageMagick
-      extend self
+    class ImageMagick
       # TODO:
-      # B/W
-      # Contrast
-      # Image Norm
-      # Threshold
       # Conversion of image types
+      # Rotation and detection of skew
+      attr_reader :image_path, :image, :temp_path, :save_file_path, :config
+      def initialize(image_path:, temp_path:, save_file_path:, config:)
+        @image_path = image_path
+        @config = config
+        @save_file_path = save_file_path
+        @temp_path = temp_path
+        # Will be available in the next version of MiniMagick > 4.11.0
+        # https://github.com/minimagick/minimagick/pull/541
+        # MiniMagick.configure do |config|
+        #   # cli_version  graphicsmagick?  imagemagick7?  imagemagick? version
+        #   config.tmpdir = File.join(Dir.tmpdir, @temp_path)
+        # end
+        @image = MiniMagick::Image.open(image_path)
+      end
+      def convert!
+        return @image_path unless @config[:image_preprocess]
+        @config[:effects].each do |effect|
+          self.send(effect.to_sym)
+        end
+        save!
+      end
+      def save!
+        image.write(@save_file_path)
+        @save_file_path
+      end
+      # Effects
+      # http://www.imagemagick.org/script/command-line-options.php
+      def bw
+        @image.alpha('off')
+        @image.auto_threshold("otsu")
+      end
+      def enhance
+        @image.enhance
+      end
+      def norm
+        @image.equalize
+      end
+      # Most likely not going to be configurable because
+      # these are aggressive parameters used to optimise OCR results
+      # and not the final results of the PDFs
+      def sharpen
+        @image.sharpen('0x4') # radiusXsigma
+      end
+      # https://github.com/ImageMagick/ImageMagick/discussions/4145
+      def remove_shadow
+        @image.negate
+        @image.lat("20x20+10\%")
+        @image.negate
+      end
+      def deskew
+        @image.deskew('40%') # threshold recommended in the docs
+      end
+      def despeckle
+        @image.despeckle
+      end
     end
   end
 end

data/lib/ocr-file/image_engines/pdftoppm.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module OcrFile
         print 'Generating screenshots of each PDF page ... '
         if filetype == 'jpg'
-          `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
+          `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} "#{pdf_path}" "#{save_path}/#{filename}"`
         else
           `pdftoppm -#{filetype} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
         end

data/lib/ocr-file/text_engines/result_processor.rb ADDED Viewed

@@ -0,0 +1,34 @@
+module OcrFile
+  module TextEngines
+    class ResultProcessor
+      MINIMUM_WORD_LENGTH = 3
+      attr_reader :text, :clear_text
+      def initialize(text)
+        @text = text
+        @clear_text = remove_lines
+      end
+      # This is a very naive way of determining if we should re-do OCR with
+      # shifted options
+      def valid_words?
+        word_size_average >= MINIMUM_WORD_LENGTH
+      end
+      def word_count
+        @_word_count ||= clear_text.split(' ').size
+      end
+      def word_size_average
+        @_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
+      end
+      private
+      def remove_lines
+        text.gsub("\n", ' ').gsub("\r", ' ').gsub('  ', '')
+      end
+    end
+  end
+end

data/lib/ocr-file/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OcrFile
-  VERSION = "0.0.1"
+  VERSION = "0.0.4"
 end

data/lib/ocr-file.rb CHANGED Viewed

@@ -10,6 +10,7 @@ require 'ocr-file/image_engines/image_magick'
 require 'ocr-file/image_engines/pdftoppm'
 require 'ocr-file/ocr_engines/tesseract'
 require 'ocr-file/ocr_engines/cloud_vision'
+require 'ocr-file/text_engines/result_processor'
 require 'ocr-file/file_helpers'
 require 'ocr-file/document'
 require 'ocr-file/cli'

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ocr-file
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.4
 platform: ruby
 authors:
 - trex22
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-06-19 00:00:00.000000000 Z
+date: 2022-06-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: console-style
@@ -122,6 +122,7 @@ files:
 - lib/ocr-file/image_engines/pdftoppm.rb
 - lib/ocr-file/ocr_engines/cloud_vision.rb
 - lib/ocr-file/ocr_engines/tesseract.rb
+- lib/ocr-file/text_engines/result_processor.rb
 - lib/ocr-file/version.rb
 - ocr-file.gemspec
 homepage: https://github.com/TRex22/ocr-file