RubyGems - ocr-file - Versions diffs - 0.0.4 → 0.0.6 - Mend

ocr-file 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/Gemfile.lock +4 -2
data/README.md +7 -0
data/lib/ocr-file/document.rb +37 -8
data/lib/ocr-file/image_engines/pdf_engine.rb +28 -1
data/lib/ocr-file/text_engines/result_processor.rb +54 -6
data/lib/ocr-file/version.rb +1 -1
data/lib/ocr-file.rb +1 -0
data/ocr-file.gemspec +1 -0
metadata +16 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
-  data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
+  metadata.gz: 9660e3d19c210789a7aeab56e63b002b507c12917edd1418202900d05647773b
+  data.tar.gz: 28861544f58374db141e5e3cc3ee8569201c4ffd611458e77774c98835e2f882
 SHA512:
-  metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
-  data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
+  metadata.gz: 55f4266d6877a7f2c8a4175f5601930c9940aa1a8a06fc1d6d84faca455232173f2ae4be887dc9d20246cb224498a213b91871b66ee3fb3b02699399be33453a
+  data.tar.gz: e78263baffd1ae1ff1246f705250d788ae6b5c24b3d59d78e169b3ddf93e139ac9b9688cf9d954053a913a1c3bd6be54c40466d372f915c4430d55d93282f1bb

data/Gemfile.lock CHANGED Viewed

@@ -1,12 +1,13 @@
 PATH
   remote: .
   specs:
-    ocr-file (0.0.4)
+    ocr-file (0.0.6)
       active_attr (~> 0.15.4)
       console-style (~> 0.0.1)
       hexapdf (~> 0.23.0)
       mini_magick (~> 4.11.0)
       rtesseract (~> 3.1.2)
+      ruby-spellchecker (~> 0.1.5)
 GEM
   remote: https://rubygems.org/
@@ -60,7 +61,7 @@ GEM
       coderay (~> 1.1)
       method_source (~> 1.0)
     racc (1.6.0)
-    rack (2.2.3.1)
+    rack (2.2.4)
     rack-test (1.1.0)
       rack (>= 1.0, < 3)
     rails-dom-testing (2.0.3)
@@ -69,6 +70,7 @@ GEM
     rails-html-sanitizer (1.4.3)
       loofah (~> 2.3)
     rtesseract (3.1.2)
+    ruby-spellchecker (0.1.5)
     tzinfo (2.0.4)
       concurrent-ruby (~> 1.0)

data/README.md CHANGED Viewed

@@ -49,6 +49,7 @@ You will need to install `tesseract` with your desired language on your system,
     optimise_pdf: true,
     extract_pdf_images: true, # if false will screenshot each PDF page
     temp_filename_prefix: 'image',
+    spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
     # Console Output
     verbose: true,
     timing: true,
@@ -76,6 +77,7 @@ You will need to install `tesseract` with your desired language on your system,
   doc.to_pdf
   # How to merge files into a single PDF:
+  # The files can be images or other PDFs
   filepaths = []
   documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
   merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
@@ -120,6 +122,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 - Improve console output
 - Fix spaces in file names
 - Better verbosity
+- Docker
+- pdftk / pdf merge for text and bookmarks etc ...
+    - https://github.com/tesseract-ocr/tesseract/issues/660
+    - tesseract -c naked_pdf=true
+-
 ### Tests
 To run tests execute:

data/lib/ocr-file/document.rb CHANGED Viewed

@@ -29,6 +29,7 @@ module OcrFile
       optimise_pdf: true,
       extract_pdf_images: true, # if false will screenshot each PDF page
       temp_filename_prefix: 'image',
+      spelling_correction: true,
       # Console Output
       verbose: true,
       timing: true,
@@ -75,7 +76,7 @@ module OcrFile
     # Trigger OCR pipeline
     def to_pdf
       @start_time = Time.now
-      find_best_image_processing if config[:automatic_reprocess] && !text?
+      find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
       if pdf?
         ocr_pdf_to_searchable_pdf
@@ -117,7 +118,7 @@ module OcrFile
     end
     def close
-      # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
+      ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
     end
     private
@@ -171,6 +172,7 @@ module OcrFile
       pdfs_to_merge = []
       image_paths.each do |image_path|
+        puts image_path
         pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
       end
@@ -182,6 +184,8 @@ module OcrFile
     def text_to_pdf
       text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
+      text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
       pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
       OcrFile::ImageEngines::PdfEngine
@@ -189,7 +193,7 @@ module OcrFile
     end
     def ocr_image_to_pdf
-      find_best_image_processing if config[:automatic_reprocess]
+      find_best_image_processing(save: false) if config[:automatic_reprocess]
       pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
       OcrFile::ImageEngines::PdfEngine
@@ -203,7 +207,11 @@ module OcrFile
       text = ''
       image_paths.each do |image_path|
-        text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
+        puts image_path
+        text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
+        text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
+        text = "#{text}#{PAGE_BREAK}#{text}"
       end
       if save
@@ -215,7 +223,9 @@ module OcrFile
     def ocr_image_to_text(save:)
       create_temp_folder
       text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
+      text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
       if save
         ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -225,7 +235,7 @@ module OcrFile
     end
     def ocr_file_to_text(save:)
-      if pdf? &&
+      if pdf?
         ocr_pdf_to_text(save: save)
       else # is an image
         ocr_image_to_text(save: save)
@@ -233,15 +243,29 @@ module OcrFile
     end
     def find_best_image_processing(save:)
-      ocr_file_to_text(save: save) if !config[:automatic_reprocess]
+      ocr_file_to_text(save: save) unless config[:automatic_reprocess]
       text = ''
+      best_text_count = 0
+      best_effects = config[:effects]
       effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
       effects_to_test.each do |effect|
-        config[:effects] = config[:effects] - [effect]
+        text = test_ocr_settings(effect)
+        processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
+        if processed_result.count_of_issues < best_text_count
+          best_text_count = processed_result.count_of_issues
+          best_effects = config[:effects]
+        end
+        break if processed_result.valid_words?
+      end
+      # Fallback
+      if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
+        config[:effects] = best_effects
         text = ocr_file_to_text(save: false)
-        break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
       end
       # Adds in extra operations which is unfortunately inefficient
@@ -252,6 +276,11 @@ module OcrFile
       end
     end
+    def test_ocr_settings(effect)
+      config[:effects] = config[:effects] - [effect]
+      ocr_file_to_text(save: false)
+    end
     def print_time
       puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
     end

data/lib/ocr-file/image_engines/pdf_engine.rb CHANGED Viewed

@@ -61,11 +61,38 @@ module OcrFile
         image_paths
       end
+      def insert_image(document, image_path)
+        canvas = document.pages.add.canvas
+        canvas.image(image_path, at: [0, 0], height: 700)
+      end
+      def combine(text, pdf_of_images)
+        return unless pdf_of_images.is_a?(::HexaPDF::Document)
+        if text.is_a?(::HexaPDF::Document)
+          pages_of_text = text.pages
+        else # Assume raw text with PAGE_BREAK
+          pages_of_text = text.split(PAGE_BREAK)
+        end
+        return unless pages_of_text.size == pdf_of_images.pages.size
+        if text.is_a?(::HexaPDF::Document) # Keep the page structure
+        else # Just text to embed
+        end
+      end
       def merge(documents)
         target = ::HexaPDF::Document.new
         documents.each do |document|
-          document.pages.each { |page| target.pages << target.import(page) }
+          if document.is_a?(::HexaPDF::Document)
+            document.pages.each { |page| target.pages << target.import(page) }
+          else # Assume an image
+            insert_image(target, document)
+          end
         end
         target

data/lib/ocr-file/text_engines/result_processor.rb CHANGED Viewed

@@ -1,33 +1,81 @@
 module OcrFile
   module TextEngines
     class ResultProcessor
-      MINIMUM_WORD_LENGTH = 3
+      MINIMUM_WORD_LENGTH = 4
+      ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
+      ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
+      # REGEX
+      ASCII_ONLY = /[^\u{0000}-\u{007f}]/
+      NOISE_CHARACTERS = /[^\w\s\/-;:]/
+      DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
+      EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
       attr_reader :text, :clear_text
       def initialize(text)
         @text = text
-        @clear_text = remove_lines
+        @clear_text = generate_clear_text || text || ''
+      end
+      def correct
+        Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
       end
       # This is a very naive way of determining if we should re-do OCR with
       # shifted options
       def valid_words?
-        word_size_average >= MINIMUM_WORD_LENGTH
+        word_size_average >= MINIMUM_WORD_LENGTH &&
+          spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
+          unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
+      end
+      def invalid_words?
+        !valid_words?
       end
       def word_count
-        @_word_count ||= clear_text.split(' ').size
+        return 0 if empty_text?
+        @_word_count ||= clear_words.size
       end
       def word_size_average
-        @_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
+        return 0 if empty_text?
+        @_word_size_average ||= clear_words.map(&:size).sum / word_count
+      end
+      # Assume English
+      def unidentified_word_count
+        clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
+      end
+      def spelling_error_count
+        Spellchecker.check(clear_text).count
+      end
+      def count_of_issues
+        spelling_error_count + unidentified_word_count
       end
       private
+      def empty_text?
+        clear_text.nil? || clear_text == ''
+      end
+      def clear_words
+        @clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
+      end
+      def generate_clear_text
+        remove_lines
+          &.gsub(ASCII_ONLY, '')
+          &.gsub(NOISE_CHARACTERS, '')
+          &.gsub(DUPLICATE_WORDS, '')
+      end
       def remove_lines
-        text.gsub("\n", ' ').gsub("\r", ' ').gsub('  ', '')
+        text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub('  ', '')
       end
     end
   end

data/lib/ocr-file/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OcrFile
-  VERSION = "0.0.4"
+  VERSION = "0.0.6"
 end

data/lib/ocr-file.rb CHANGED Viewed

@@ -2,6 +2,7 @@ require 'hexapdf'
 require 'hexapdf/cli/images'
 require 'rtesseract'
 require 'mini_magick'
+require 'ruby-spellchecker'
 require 'ocr-file/version'

data/ocr-file.gemspec CHANGED Viewed

@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "hexapdf", "~> 0.23.0"
   spec.add_dependency "rtesseract", "~> 3.1.2"
   spec.add_dependency "mini_magick", "~> 4.11.0"
+  spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
   # Development Dependencies
   spec.add_development_dependency "pry", "~> 0.14.1"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ocr-file
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.6
 platform: ruby
 authors:
 - trex22
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-06-20 00:00:00.000000000 Z
+date: 2022-07-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: console-style
@@ -80,6 +80,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 4.11.0
+- !ruby/object:Gem::Dependency
+  name: ruby-spellchecker
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.5
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.5
 - !ruby/object:Gem::Dependency
   name: pry
   requirement: !ruby/object:Gem::Requirement