RubyGems - ocr-file - Versions diffs - 0.0.3 → 0.0.7 - Mend

ocr-file 0.0.3 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/Gemfile.lock +4 -2
data/README.md +12 -2
data/lib/ocr-file/document.rb +139 -49
data/lib/ocr-file/image_engines/pdf_engine.rb +28 -1
data/lib/ocr-file/text_engines/result_processor.rb +82 -0
data/lib/ocr-file/version.rb +1 -1
data/lib/ocr-file.rb +2 -0
data/ocr-file.gemspec +1 -0
metadata +17 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0e67553a31e82eba190368040d3475b812e113aedfb9994484043dda34a55053
-  data.tar.gz: 6fe5e142fef4387fc98fce57d3fdb2b7a0c37199d1712bd1d85dced9a0e61274
+  metadata.gz: 8b87806d21622a72c6166c35fe4367f5b07135e5e7fab4e8be8b8941f75439dc
+  data.tar.gz: d342a91e9b23f8677784553327ba1cc1c00e1599415512b28226f8e9f6bc55b4
 SHA512:
-  metadata.gz: e5d06cf54a8bc96c90522ab67530310730230067ee226f6eb1143adde2ccb407dde25aef7b595836478ee944e4e9b3ff306b4df5a08ec14ab6623ab08daefa8b
-  data.tar.gz: 45a7c3d06908c878f281db9baf4ec82310ecde20e12cad5ff4cc03d2f271167d46fa52145fe598f594a3360a525c926d955bb08d17e740ba78f97ec72f0f4b47
+  metadata.gz: ecadeeb21a358274bce4ed3d7fce66e53d31ff3abe940ff1b9d77893f12b73bfd41e9ac35324e3a98f004638f9d1906760ef962a3637fbaf48973faeec9a17cb
+  data.tar.gz: 5d4a149dd6d0da1feb723b08c327edab414b75f0b633cea53aaee00d43313d26b84659956957acec7550a822998b76a760b3888770a606d8b4a1f9bb14f807c2

data/Gemfile.lock CHANGED Viewed

@@ -1,12 +1,13 @@
 PATH
   remote: .
   specs:
-    ocr-file (0.0.2)
+    ocr-file (0.0.6)
       active_attr (~> 0.15.4)
       console-style (~> 0.0.1)
       hexapdf (~> 0.23.0)
       mini_magick (~> 4.11.0)
       rtesseract (~> 3.1.2)
+      ruby-spellchecker (~> 0.1.5)
 GEM
   remote: https://rubygems.org/
@@ -60,7 +61,7 @@ GEM
       coderay (~> 1.1)
       method_source (~> 1.0)
     racc (1.6.0)
-    rack (2.2.3.1)
+    rack (2.2.4)
     rack-test (1.1.0)
       rack (>= 1.0, < 3)
     rails-dom-testing (2.0.3)
@@ -69,6 +70,7 @@ GEM
     rails-html-sanitizer (1.4.3)
       loofah (~> 2.3)
     rtesseract (3.1.2)
+    ruby-spellchecker (0.1.5)
     tzinfo (2.0.4)
       concurrent-ruby (~> 1.0)

data/README.md CHANGED Viewed

@@ -44,12 +44,16 @@ You will need to install `tesseract` with your desired language on your system,
     # Image Pre-Processing
     image_preprocess: true,
     effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
+    automatic_reprocess: true, # May more than double the number of operations, but can produce better results automatically
     # PDF to Image Processing
     optimise_pdf: true,
     extract_pdf_images: true, # if false will screenshot each PDF page
     temp_filename_prefix: 'image',
+    spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
+    keep_files: false,
     # Console Output
     verbose: true,
+    timing: true
   }
   doc = OcrFile::Document.new(
@@ -74,6 +78,7 @@ You will need to install `tesseract` with your desired language on your system,
   doc.to_pdf
   # How to merge files into a single PDF:
+  # The files can be images or other PDFs
   file_paths = []
   documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
   merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
@@ -85,6 +90,8 @@ Set `extract_pdf_images` to `false` for higher quality OCR. However this will co
 Image pre-processing only thresholds (bw), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. Will make the end result Black and White but have far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
+`automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
 ### Simple CLI
 Once installed you can use `ocr-file` as a CLI. It's currently a reduced set of options. These are subject to change in future versions.
@@ -108,7 +115,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 ### TODOs
 - input validation
 - Better CLI
-- image processing
 - password
 - Base64 encoding
 - requirements checking (installed dependencies etc ...)
@@ -117,7 +123,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 - Improve console output
 - Fix spaces in file names
 - Better verbosity
-- Timing
+- Docker
+- pdftk / pdf merge for text and bookmarks etc ...
+    - https://github.com/tesseract-ocr/tesseract/issues/660
+    - tesseract -c naked_pdf=true
+-
 ### Tests
 To run tests execute:

data/lib/ocr-file/document.rb CHANGED Viewed

@@ -5,6 +5,7 @@ module OcrFile
     ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
     PAGE_BREAK = "\n\r\n" # TODO: Make configurable
+    EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
     DEFAULT_CONFIG = {
       # Images from PDF
       filetype: 'png',
@@ -23,12 +24,16 @@ module OcrFile
       # Image Pre-Processing
       image_preprocess: true,
       effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
+      automatic_reprocess: true,
       # PDF to Image Processing
       optimise_pdf: true,
       extract_pdf_images: true, # if false will screenshot each PDF page
       temp_filename_prefix: 'image',
+      spelling_correction: true,
+      keep_files: false,
       # Console Output
       verbose: true,
+      timing: true
     }
     attr_reader :original_file_path,
@@ -36,7 +41,9 @@ module OcrFile
       :save_file_path,
       :final_save_file,
       :config,
-      :ocr_engine
+      :ocr_engine,
+      :start_time,
+      :end_time
     # save_file_path will also generate a tmp path for tmp files. Expected folder path
     # TODO: Add in more input validation
@@ -69,75 +76,50 @@ module OcrFile
     # Trigger OCR pipeline
     def to_pdf
-      if pdf?
-        create_temp_folder
-        image_paths = extract_image_paths_from_pdf(@original_file_path)
-        pdfs_to_merge = []
-        image_paths.each do |image_path|
-          pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
-        end
-        merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
+      @start_time = Time.now
+      find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
-        OcrFile::ImageEngines::PdfEngine
-          .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+      if pdf?
+        ocr_pdf_to_searchable_pdf
       elsif text?
-        text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
-        pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
-        OcrFile::ImageEngines::PdfEngine
-          .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+        text_to_pdf
       else # is an image
         ocr_image_to_pdf
       end
       close
+      @end_time = Time.now
+      print_time
     end
     def to_text
-      if pdf?
-        create_temp_folder
-        image_paths = extract_image_paths_from_pdf(@original_file_path)
-        image_paths.each do |image_path|
-          text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
-          ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
-        end
-      elsif text?
-        ::OcrFile::FileHelpers.open_text_file(@original_file_path)
-      else # is an image
-        ocr_image_to_text(save: true)
-      end
+      @start_time = Time.now
+      return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
+      find_best_image_processing(save: true)
       close
+      @end_time = Time.now
+      print_time
     end
     def to_s
-      if pdf?
-        create_temp_folder
-        image_paths = extract_image_paths_from_pdf(@original_file_path)
+      @start_time = Time.now
+      return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
-        text = ''
+      text = find_best_image_processing(save: false)
-        image_paths.each do |image_path|
-          text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
-        end
+      close
-        close
-        text
-      elsif text?
-        ::OcrFile::FileHelpers.open_text_file(@original_file_path)
-      else # is an image
-        text = ocr_image_to_text(save: false)
+      @end_time = Time.now
+      print_time
-        close
-        text
-      end
+      text
     end
     def close
+      return if keep_files?
       ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
     end
@@ -162,6 +144,10 @@ module OcrFile
       end
     end
+    # Whether temporary files should be left on disk instead of being cleared
+    # by #close.
+    #
+    # DEFAULT_CONFIG and the other config lookups in this class use symbol
+    # keys, so the symbol key is read first. The string key is kept as a
+    # fallback so callers that passed 'keep_files' => true keep working.
+    def keep_files?
+      config[:keep_files] || config['keep_files']
+    end
     def create_temp_folder
       date = Time.now.to_s.split(' ').first
@@ -185,14 +171,67 @@ module OcrFile
       image_processor.convert!
     end
+    # OCRs every page image extracted from the original PDF into its own PDF,
+    # merges those per-page PDFs and saves the result as
+    # "<final_save_file>.pdf". Each image path is printed as it is processed.
+    def ocr_pdf_to_searchable_pdf
+      create_temp_folder
+      page_pdfs = extract_image_paths_from_pdf(@original_file_path).map do |page_image_path|
+        puts page_image_path
+        @ocr_engine.ocr_to_pdf(process_image(page_image_path), options: @config)
+      end
+      combined_pdf = OcrFile::ImageEngines::PdfEngine.merge(page_pdfs)
+      output_path = "#{@final_save_file}.pdf"
+      OcrFile::ImageEngines::PdfEngine.save_pdf(combined_pdf, output_path, optimise: @config[:optimise_pdf])
+    end
+    # Reads the original text file, optionally runs spelling correction over
+    # it (config[:spelling_correction]), renders the text into a PDF and saves
+    # it as "<final_save_file>.pdf".
+    def text_to_pdf
+      contents = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
+      if config[:spelling_correction]
+        contents = OcrFile::TextEngines::ResultProcessor.new(contents).correct
+      end
+      rendered_pdf = OcrFile::ImageEngines::PdfEngine.pdf_from_text(contents, @config)
+      output_path = "#{@final_save_file}.pdf"
+      OcrFile::ImageEngines::PdfEngine.save_pdf(rendered_pdf, output_path, optimise: @config[:optimise_pdf])
+    end
     def ocr_image_to_pdf
+      find_best_image_processing(save: false) if config[:automatic_reprocess]
       pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
       OcrFile::ImageEngines::PdfEngine
         .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
     end
-    def ocr_image_to_text(save: true)
+    # OCRs every page image extracted from the original PDF and joins the
+    # page texts with PAGE_BREAK.
+    #
+    # save: when true the joined text (plus a trailing PAGE_BREAK) is appended
+    #       to "<final_save_file>.txt"; when false the joined text is returned.
+    #
+    # Each page is collected separately. Previously the accumulator was
+    # overwritten by every page, so only the last page survived (and was
+    # duplicated around a PAGE_BREAK).
+    def ocr_pdf_to_text(save:)
+      create_temp_folder
+      image_paths = extract_image_paths_from_pdf(@original_file_path)
+      page_texts = image_paths.map do |image_path|
+        puts image_path
+        # The engine may return nil for an unreadable page; treat it as empty.
+        page_text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
+        page_text = OcrFile::TextEngines::ResultProcessor.new(page_text).correct if config[:spelling_correction]
+        page_text
+      end
+      text = page_texts.join(PAGE_BREAK)
+      if save
+        ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
+      else
+        text
+      end
+    end
+    def ocr_image_to_text(save:)
+      create_temp_folder
       text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
+      text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
       if save
         ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -201,6 +240,57 @@ module OcrFile
       end
     end
+    # Dispatches text extraction by input type: PDFs go through
+    # ocr_pdf_to_text, everything else is handled as an image.
+    def ocr_file_to_text(save:)
+      return ocr_pdf_to_text(save: save) if pdf?
+      ocr_image_to_text(save: save) # is an image
+    end
+    # Runs OCR repeatedly, dropping one more pre-processing effect on each
+    # pass, and keeps the effect set whose text has the fewest issues.
+    #
+    # save: when true the final OCR pass writes its output via
+    #       ocr_file_to_text(save: true); when false the best text is returned.
+    #
+    # Side effect: config[:effects] is left set to the chosen effect list so
+    # that later OCR passes reuse it.
+    def find_best_image_processing(save:)
+      # Without automatic reprocessing a single plain OCR pass is the whole
+      # job. The early return matters: falling through would redo the work
+      # and, when saving, append the output a second time.
+      return ocr_file_to_text(save: save) unless config[:automatic_reprocess]
+      text = ''
+      best_text = text
+      best_issue_count = nil # nil until the first candidate has been scored
+      best_effects = config[:effects]
+      # '' is a no-op removal, so the first pass scores the configured effects
+      # unchanged; after that only effects that are both removable and
+      # currently configured are tried.
+      effects_to_test = [''] + (EFFECTS_TO_REMOVE & config[:effects])
+      effects_to_test.each do |effect|
+        text = test_ocr_settings(effect)
+        processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
+        issue_count = processed_result.count_of_issues
+        # Starting the best score at 0 meant no candidate could ever beat it;
+        # the first candidate now always seeds the comparison.
+        if best_issue_count.nil? || issue_count < best_issue_count
+          best_issue_count = issue_count
+          best_effects = config[:effects]
+          best_text = text
+        end
+        break if processed_result.valid_words?
+      end
+      # Fallback: nothing passed the validity check, so settle for the
+      # least-bad candidate seen. Its text is reused rather than re-running OCR.
+      if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
+        config[:effects] = best_effects
+        text = best_text
+      end
+      # Adds in extra operations which is unfortunately inefficient
+      if save
+        ocr_file_to_text(save: save)
+      else
+        text
+      end
+    end
+    # Drops `effect` from config[:effects] and returns the OCR text produced
+    # with the remaining effects. The removal is written back to the config,
+    # so successive calls remove effects cumulatively.
+    def test_ocr_settings(effect)
+      remaining_effects = config[:effects] - [effect]
+      config[:effects] = remaining_effects
+      ocr_file_to_text(save: false)
+    end
+    # Prints the elapsed time between start_time and end_time when
+    # config[:timing] is enabled; otherwise does nothing.
+    def print_time
+      return unless config[:timing]
+      puts "Total Time: #{end_time-start_time} secs.\n\n"
+    end
     def find_ocr_engine(engine_id)
       ocr_engine_constants
         .map { |c| ocr_module(c) }

data/lib/ocr-file/image_engines/pdf_engine.rb CHANGED Viewed

@@ -61,11 +61,38 @@ module OcrFile
         image_paths
       end
+      # Appends a new page to `document` and draws the image at `image_path`
+      # on it, anchored at [0, 0] with a fixed height of 700.
+      def insert_image(document, image_path)
+        new_page = document.pages.add
+        new_page.canvas.image(image_path, at: [0, 0], height: 700)
+      end
+      # Intended to combine OCR text with a PDF made of page images.
+      # NOTE(review): this looks unfinished. It validates its inputs and
+      # splits the text into pages, but both final branches are empty, so it
+      # currently returns nil on every path.
+      #
+      # text          - a HexaPDF::Document, or raw text with pages separated
+      #                 by PAGE_BREAK.
+      # pdf_of_images - a HexaPDF::Document; anything else returns nil.
+      def combine(text, pdf_of_images)
+        return unless pdf_of_images.is_a?(::HexaPDF::Document)
+        if text.is_a?(::HexaPDF::Document)
+          pages_of_text = text.pages
+        else # Assume raw text with PAGE_BREAK
+          # NOTE(review): PAGE_BREAK is not defined in the visible part of
+          # this file (document.rb defines one on Document). Confirm that it
+          # resolves here.
+          pages_of_text = text.split(PAGE_BREAK)
+        end
+        # Only proceed when there is exactly one page of text per image page.
+        return unless pages_of_text.size == pdf_of_images.pages.size
+        if text.is_a?(::HexaPDF::Document) # Keep the page structure
+        else # Just text to embed
+        end
+      end
       def merge(documents)
         target = ::HexaPDF::Document.new
         documents.each do |document|
-          document.pages.each { |page| target.pages << target.import(page) }
+          if document.is_a?(::HexaPDF::Document)
+            document.pages.each { |page| target.pages << target.import(page) }
+          else # Assume an image
+            insert_image(target, document)
+          end
         end
         target

data/lib/ocr-file/text_engines/result_processor.rb ADDED Viewed

@@ -0,0 +1,82 @@
+module OcrFile
+  module TextEngines
+    # Heuristic quality checks and spelling clean-up for raw OCR output.
+    #
+    # `text` is the OCR result as given (may be nil). `clear_text` is a
+    # flattened copy with line breaks, non-ASCII characters, noise characters
+    # and immediately repeated words removed; the counting helpers work on it.
+    class ResultProcessor
+      MINIMUM_WORD_LENGTH = 4
+      ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
+      ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
+      # REGEX
+      # Despite the name, this matches every character that is NOT 7-bit
+      # ASCII, i.e. the characters removed to leave ASCII-only text.
+      ASCII_ONLY = /[^\u{0000}-\u{007f}]/
+      # Matches anything that is not a word character, whitespace or one of
+      # / - ; :. The hyphen is escaped on purpose: written as `\/-;` it is
+      # read as the range '/'..';', which silently drops '-' from the
+      # characters to keep.
+      NOISE_CHARACTERS = /[^\w\s\/\-;:]/
+      DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
+      EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
+      attr_reader :text, :clear_text
+      # text - raw OCR output (String or nil).
+      def initialize(text)
+        @text = text
+        @clear_text = generate_clear_text || text || ''
+      end
+      # Returns the text with noise characters stripped and spelling corrected
+      # by Spellchecker. nil text is treated as an empty string rather than
+      # raising NoMethodError.
+      def correct
+        Spellchecker.correct(text.to_s.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
+      end
+      # This is a very naive way of determining if we should re-do OCR with
+      # shifted options
+      def valid_words?
+        word_size_average >= MINIMUM_WORD_LENGTH &&
+          spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
+          unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
+      end
+      def invalid_words?
+        !valid_words?
+      end
+      # Number of words in the cleaned text (0 when there is no text).
+      def word_count
+        return 0 if empty_text?
+        @_word_count ||= clear_words.size
+      end
+      # Integer average word length of the cleaned text. Returns 0 when there
+      # are no words at all. That includes text made only of digits or
+      # punctuation, which used to raise ZeroDivisionError.
+      def word_size_average
+        return 0 if empty_text? || word_count.zero?
+        @_word_size_average ||= clear_words.sum(&:size) / word_count
+      end
+      # Assume English
+      def unidentified_word_count
+        clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
+      end
+      def spelling_error_count
+        Spellchecker.check(clear_text).count
+      end
+      # Combined score used to rank OCR attempts; lower is better.
+      def count_of_issues
+        spelling_error_count + unidentified_word_count
+      end
+      private
+      def empty_text?
+        clear_text.nil? || clear_text == ''
+      end
+      # Words of the cleaned text with punctuation and digits removed.
+      def clear_words
+        @clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
+      end
+      # Builds the cleaned text, or returns nil when `text` is nil. A repeated
+      # word ("the the") is collapsed to one occurrence instead of both being
+      # deleted.
+      def generate_clear_text
+        remove_lines
+          &.gsub(ASCII_ONLY, '')
+          &.gsub(NOISE_CHARACTERS, '')
+          &.gsub(DUPLICATE_WORDS, '\1')
+      end
+      # Flattens the text onto one line. Runs of line breaks and spaces become
+      # a single space, so the words on either side stay separate (deleting
+      # double spaces outright glued them together).
+      def remove_lines
+        text&.gsub(/[\r\n]+/, ' ')&.squeeze(' ')
+      end
+    end
+  end
+end

data/lib/ocr-file/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OcrFile
-  VERSION = "0.0.3"
+  VERSION = "0.0.7"
 end

data/lib/ocr-file.rb CHANGED Viewed

@@ -2,6 +2,7 @@ require 'hexapdf'
 require 'hexapdf/cli/images'
 require 'rtesseract'
 require 'mini_magick'
+require 'ruby-spellchecker'
 require 'ocr-file/version'
@@ -10,6 +11,7 @@ require 'ocr-file/image_engines/image_magick'
 require 'ocr-file/image_engines/pdftoppm'
 require 'ocr-file/ocr_engines/tesseract'
 require 'ocr-file/ocr_engines/cloud_vision'
+require 'ocr-file/text_engines/result_processor'
 require 'ocr-file/file_helpers'
 require 'ocr-file/document'
 require 'ocr-file/cli'

data/ocr-file.gemspec CHANGED Viewed

@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "hexapdf", "~> 0.23.0"
   spec.add_dependency "rtesseract", "~> 3.1.2"
   spec.add_dependency "mini_magick", "~> 4.11.0"
+  spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
   # Development Dependencies
   spec.add_development_dependency "pry", "~> 0.14.1"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ocr-file
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.7
 platform: ruby
 authors:
 - trex22
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-06-20 00:00:00.000000000 Z
+date: 2022-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: console-style
@@ -80,6 +80,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 4.11.0
+- !ruby/object:Gem::Dependency
+  name: ruby-spellchecker
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.5
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.5
 - !ruby/object:Gem::Dependency
   name: pry
   requirement: !ruby/object:Gem::Requirement
@@ -122,6 +136,7 @@ files:
 - lib/ocr-file/image_engines/pdftoppm.rb
 - lib/ocr-file/ocr_engines/cloud_vision.rb
 - lib/ocr-file/ocr_engines/tesseract.rb
+- lib/ocr-file/text_engines/result_processor.rb
 - lib/ocr-file/version.rb
 - ocr-file.gemspec
 homepage: https://github.com/TRex22/ocr-file