RubyGems - ocr-file - Versions diffs - 0.0.4 → 0.0.8 - Mend

ocr-file 0.0.4 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml +4 -4
data/Gemfile.lock +4 -2
data/README.md +12 -4
data/lib/ocr-file/document.rb +45 -10
data/lib/ocr-file/image_engines/image_magick.rb +8 -1
data/lib/ocr-file/image_engines/pdf_engine.rb +43 -1
data/lib/ocr-file/text_engines/result_processor.rb +54 -6
data/lib/ocr-file/version.rb +1 -1
data/lib/ocr-file.rb +1 -0
data/ocr-file.gemspec +1 -0
metadata +16 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
-  data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
+  metadata.gz: 9ae0f4940b34df3280221cf8b26d86ba3498f8344ef5f0e27ea335ca651a8906
+  data.tar.gz: 5e790899721d25bb0f4dc0e8e276b39b62bbb2803549fdbc8ba148804885bec0
 SHA512:
-  metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
-  data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
+  metadata.gz: 6cd016ca7bba37866579cad59f01f41d190c0a191cd1ce27fa7037646da7bf4962923664c7b6295655936aed8714fac01b08301be65fdfef68403c8dd12c075b
+  data.tar.gz: f1581713a76e19f1b24d43f030cccbfb32b206bea8d1a5f07fed26fe4e0cfaa3f991c0c35b98bf1f222ca36b143e83700638ecf3b0520b9663d2fe4336cc5da2

data/Gemfile.lock CHANGED Viewed

@@ -1,12 +1,13 @@
 PATH
   remote: .
   specs:
-    ocr-file (0.0.4)
+    ocr-file (0.0.8)
       active_attr (~> 0.15.4)
       console-style (~> 0.0.1)
       hexapdf (~> 0.23.0)
       mini_magick (~> 4.11.0)
       rtesseract (~> 3.1.2)
+      ruby-spellchecker (~> 0.1.5)
 GEM
   remote: https://rubygems.org/
@@ -60,7 +61,7 @@ GEM
       coderay (~> 1.1)
       method_source (~> 1.0)
     racc (1.6.0)
-    rack (2.2.3.1)
+    rack (2.2.4)
     rack-test (1.1.0)
       rack (>= 1.0, < 3)
     rails-dom-testing (2.0.3)
@@ -69,6 +70,7 @@ GEM
     rails-html-sanitizer (1.4.3)
       loofah (~> 2.3)
     rtesseract (3.1.2)
+    ruby-spellchecker (0.1.5)
     tzinfo (2.0.4)
       concurrent-ruby (~> 1.0)

data/README.md CHANGED Viewed

@@ -49,9 +49,11 @@ You will need to install `tesseract` with your desired language on your system,
     optimise_pdf: true,
     extract_pdf_images: true, # if false will screenshot each PDF page
     temp_filename_prefix: 'image',
+    spelling_correction: true, # Spell-corrects the extracted text after OCR (not applied to searchable PDF output)
+    keep_files: false,
     # Console Output
     verbose: true,
-    timing: true,
+    timing: true
   }
   doc = OcrFile::Document.new(
@@ -76,9 +78,10 @@ You will need to install `tesseract` with your desired language on your system,
   doc.to_pdf
   # How to merge files into a single PDF:
-  filepaths = []
-  documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
-  merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
+  # The files can be images or other PDFs
+  file_paths = []
+  merged_document = ::HexaPDF::Document.new
+  documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.insert_image(merged_document, path) }
   OcrFile::ImageEngines::PdfEngine.save_pdf(merged_document, save_file_path, optimise: true)
 ```
@@ -120,6 +123,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 - Improve console output
 - Fix spaces in file names
 - Better verbosity
+- Docker
+- pdftk / pdf merge for text and bookmarks etc ...
+    - https://github.com/tesseract-ocr/tesseract/issues/660
+    - tesseract -c naked_pdf=true
+-
 ### Tests
 To run tests execute:

data/lib/ocr-file/document.rb CHANGED Viewed

@@ -29,9 +29,11 @@ module OcrFile
       optimise_pdf: true,
       extract_pdf_images: true, # if false will screenshot each PDF page
       temp_filename_prefix: 'image',
+      spelling_correction: true,
+      keep_files: false,
       # Console Output
       verbose: true,
-      timing: true,
+      timing: true
     }
     attr_reader :original_file_path,
@@ -64,7 +66,7 @@ module OcrFile
     def image?
       return false if pdf?
-      ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
+      ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}") }
     end
     # Treat anything which isn't a PDF or image as text
@@ -75,7 +77,7 @@ module OcrFile
     # Trigger OCR pipeline
     def to_pdf
       @start_time = Time.now
-      find_best_image_processing if config[:automatic_reprocess] && !text?
+      find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
       if pdf?
         ocr_pdf_to_searchable_pdf
@@ -117,7 +119,8 @@ module OcrFile
     end
     def close
-      # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
+      return if keep_files?
+      ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
     end
     private
@@ -141,6 +144,10 @@ module OcrFile
       end
     end
+    # Whether the temporary working files should be left in place on #close.
+    #
+    # Looks the flag up under the symbol key: the default config declares
+    # `keep_files:` as a symbol and every other lookup in this class uses
+    # symbol keys, so the string key 'keep_files' would not match on a plain Hash.
+    #
+    # @return [Boolean, nil] the configured value (nil when the key is absent)
+    def keep_files?
+      config[:keep_files]
+    end
     def create_temp_folder
       date = Time.now.to_s.split(' ').first
@@ -171,6 +178,7 @@ module OcrFile
       pdfs_to_merge = []
       image_paths.each do |image_path|
+        puts image_path
         pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
       end
@@ -182,6 +190,8 @@ module OcrFile
     def text_to_pdf
       text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
+      text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
       pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
       OcrFile::ImageEngines::PdfEngine
@@ -189,7 +199,7 @@ module OcrFile
     end
     def ocr_image_to_pdf
-      find_best_image_processing if config[:automatic_reprocess]
+      find_best_image_processing(save: false) if config[:automatic_reprocess]
       pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
       OcrFile::ImageEngines::PdfEngine
@@ -203,7 +213,11 @@ module OcrFile
       text = ''
       image_paths.each do |image_path|
-        text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
+        puts image_path
+        # Hold the current page in its own local so the running `text`
+        # accumulator is not overwritten on every iteration.
+        page_text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
+        page_text = OcrFile::TextEngines::ResultProcessor.new(page_text).correct if config[:spelling_correction]
+        # Append this page to everything collected so far, separated by PAGE_BREAK
+        text = "#{text}#{PAGE_BREAK}#{page_text}"
       end
       if save
@@ -215,7 +229,9 @@ module OcrFile
     def ocr_image_to_text(save:)
       create_temp_folder
       text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
+      text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
       if save
         ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -225,7 +241,7 @@ module OcrFile
     end
     def ocr_file_to_text(save:)
-      if pdf? &&
+      if pdf?
         ocr_pdf_to_text(save: save)
       else # is an image
         ocr_image_to_text(save: save)
@@ -233,15 +249,29 @@ module OcrFile
     end
     def find_best_image_processing(save:)
-      ocr_file_to_text(save: save) if !config[:automatic_reprocess]
+      ocr_file_to_text(save: save) unless config[:automatic_reprocess]
       text = ''
+      # Start from infinity so the first measured result always becomes the
+      # baseline; starting at 0 meant `count_of_issues < best_text_count`
+      # could never be true and best_effects was never updated.
+      best_text_count = Float::INFINITY
+      best_effects = config[:effects]
       effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
       effects_to_test.each do |effect|
-        config[:effects] = config[:effects] - [effect]
+        text = test_ocr_settings(effect)
+        processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
+        if processed_result.count_of_issues < best_text_count
+          best_text_count = processed_result.count_of_issues
+          best_effects = config[:effects]
+        end
+        break if processed_result.valid_words?
+      end
+      # Fallback
+      if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
+        config[:effects] = best_effects
         text = ocr_file_to_text(save: false)
-        break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
       end
       # Adds in extra operations which is unfortunately inefficient
@@ -252,6 +282,11 @@ module OcrFile
       end
     end
+    # Drops `effect` from the configured effects and re-runs OCR to text
+    # without saving the result.
+    #
+    # The removal is written back into config[:effects], so it stays in
+    # place for later calls.
+    #
+    # @param effect [String] effect name to remove before this OCR pass
+    # @return whatever ocr_file_to_text(save: false) returns
+    def test_ocr_settings(effect)
+      config[:effects] -= [effect]
+      ocr_file_to_text(save: false)
+    end
     def print_time
       puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
     end

data/lib/ocr-file/image_engines/image_magick.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module OcrFile
       # Conversion of image types
       # Rotation and detection of skew
-      attr_reader :image_path, :image, :temp_path, :save_file_path, :config
+      attr_reader :image_path, :image, :temp_path, :save_file_path, :config, :width, :height
       def initialize(image_path:, temp_path:, save_file_path:, config:)
         @image_path = image_path
@@ -22,6 +22,9 @@ module OcrFile
         # end
         @image = MiniMagick::Image.open(image_path)
+        @width = @image[:width]
+        @height = @image[:height]
       end
       def convert!
@@ -39,6 +42,10 @@ module OcrFile
         @save_file_path
       end
+      # Resizes the loaded image by handing a "WIDTHxHEIGHT" geometry string
+      # to the underlying MiniMagick image.
+      #
+      # @param width  target width, interpolated into the geometry string
+      # @param height target height, interpolated into the geometry string
+      # @return whatever @image.resize returns
+      def resize(width, height)
+        geometry = "#{width}x#{height}"
+        @image.resize(geometry)
+      end
       # Effects
       # http://www.imagemagick.org/script/command-line-options.php
       def bw

data/lib/ocr-file/image_engines/pdf_engine.rb CHANGED Viewed

@@ -61,11 +61,53 @@ module OcrFile
         image_paths
       end
+      # Appends a new page to `document` and draws the image on it, anchored
+      # at the origin and stretched to the page size.
+      #
+      # The page is sized from `dimensions` ([width, height]) when given,
+      # otherwise from the image's own width and height as reported by the
+      # ImageMagick wrapper.
+      #
+      # NOTE(review): @temp_folder_path, @config and @image are read from the
+      # receiver; where they are set is not visible here - confirm they are
+      # populated when this is called on the engine directly.
+      #
+      # @param document   PDF document that receives the new page
+      # @param image_path [String] path of the image to embed
+      # @param dimensions [Array, nil] optional [width, height] override
+      # @return whatever page.canvas.image returns
+      def insert_image(document, image_path, dimensions: nil)
+        image_processor = OcrFile::ImageEngines::ImageMagick.new(
+          image_path: image_path,
+          temp_path: @temp_folder_path,
+          save_file_path: '',
+          config: @config
+        )
+        # Explicit dimensions win; fall back to the size read from the image
+        width = dimensions ? dimensions[0] : image_processor.width
+        height = dimensions ? dimensions[1] : image_processor.height
+        page_box = [0, 0, width, height]
+        page = document.pages.add(page_box)
+        source = @image || image_path
+        page.canvas.image(source, at: [0, 0], width: width, height: height)
+      end
+      # Unfinished: intended to combine recognised text with a PDF of page
+      # images. For now it only validates its arguments and returns nil on
+      # every path.
+      #
+      # @param text          a HexaPDF document, or raw text with pages separated by PAGE_BREAK
+      # @param pdf_of_images expected to be a ::HexaPDF::Document
+      # @return [nil]
+      def combine(text, pdf_of_images)
+        return unless pdf_of_images.is_a?(::HexaPDF::Document)
+        text_is_pdf = text.is_a?(::HexaPDF::Document)
+        # Anything that is not a PDF is assumed to be raw text with PAGE_BREAK
+        pages_of_text = text_is_pdf ? text.pages : text.split(PAGE_BREAK)
+        # Both inputs must describe the same number of pages
+        return unless pages_of_text.size == pdf_of_images.pages.size
+        # TODO: not implemented yet -
+        #   PDF text: keep the page structure
+        #   raw text: just text to embed
+        nil
+      end
       def merge(documents)
         target = ::HexaPDF::Document.new
         documents.each do |document|
-          document.pages.each { |page| target.pages << target.import(page) }
+          if document.is_a?(::HexaPDF::Document)
+            document.pages.each { |page| target.pages << target.import(page) }
+          else # Assume an image
+            insert_image(target, document)
+          end
         end
         target

data/lib/ocr-file/text_engines/result_processor.rb CHANGED Viewed

@@ -1,33 +1,81 @@
 module OcrFile
   module TextEngines
     class ResultProcessor
-      MINIMUM_WORD_LENGTH = 3
+      MINIMUM_WORD_LENGTH = 4
+      # Upper bounds checked by #valid_words?; both values are arbitrary and untuned.
+      ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
+      ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
+      # REGEX
+      # NOTE: despite the name, this matches every character OUTSIDE the ASCII
+      # range, so gsub(ASCII_ONLY, '') leaves ASCII-only text behind.
+      ASCII_ONLY = /[^\u{0000}-\u{007f}]/
+      # Anything that is not a word character, whitespace or one of / - ; :
+      # The hyphen is escaped so it is a literal; unescaped, `/-;` was read as
+      # a character range and hyphens were stripped as noise.
+      NOISE_CHARACTERS = /[^\w\s\/\-;:]/
+      # A word immediately repeated after whitespace, e.g. "the the"
+      DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
+      # Any non-word, non-whitespace character, or any digit
+      EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
       attr_reader :text, :clear_text
       def initialize(text)
         @text = text
-        @clear_text = remove_lines
+        @clear_text = generate_clear_text || text || ''
+      end
+      def correct
+        Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
       end
       # This is a very naive way of determining if we should re-do OCR with
       # shifted options
       def valid_words?
-        word_size_average >= MINIMUM_WORD_LENGTH
+        word_size_average >= MINIMUM_WORD_LENGTH &&
+          spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
+          unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
+      end
+      def invalid_words?
+        !valid_words?
       end
       def word_count
-        @_word_count ||= clear_text.split(' ').size
+        return 0 if empty_text?
+        @_word_count ||= clear_words.size
       end
       def word_size_average
-        @_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
+        return 0 if empty_text?
+        @_word_size_average ||= clear_words.map(&:size).sum / word_count
+      end
+      # Assume English
+      # Number of cleaned words that are not present in the spellchecker's
+      # English word list.
+      #
+      # @return [Integer]
+      def unidentified_word_count
+        clear_words.count { |word| !Spellchecker::Dictionaries::EnglishWords.include?(word) }
+      end
+      # Number of entries Spellchecker.check reports for the cleaned text.
+      #
+      # @return [Integer]
+      def spelling_error_count
+        reported_issues = Spellchecker.check(clear_text)
+        reported_issues.count
+      end
+      def count_of_issues
+        spelling_error_count + unidentified_word_count
       end
       private
+      # True when there is no cleaned text to analyse (nil or the empty string).
+      #
+      # @return [Boolean]
+      def empty_text?
+        return true if clear_text.nil?
+
+        clear_text == ''
+      end
+      # The cleaned text with punctuation and digits removed, split on
+      # whitespace. Memoised in @clear_words after the first call.
+      #
+      # @return [Array<String>]
+      def clear_words
+        return @clear_words if @clear_words
+
+        letters_only = clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '')
+        @clear_words = letters_only.split(' ')
+      end
+      # Builds the text used for analysis: line breaks flattened by
+      # #remove_lines, then non-ASCII characters, noise characters and
+      # immediately repeated words removed, in that order.
+      #
+      # @return [String, nil] nil when #remove_lines returns nil
+      def generate_clear_text
+        flattened = remove_lines
+        return if flattened.nil?
+
+        [ASCII_ONLY, NOISE_CHARACTERS, DUPLICATE_WORDS].reduce(flattened) do |cleaned, pattern|
+          cleaned.gsub(pattern, '')
+        end
+      end
       def remove_lines
-        text.gsub("\n", ' ').gsub("\r", ' ').gsub('  ', '')
+        text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub('  ', '')
       end
     end
   end

data/lib/ocr-file/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OcrFile
-  VERSION = "0.0.4"
+  VERSION = "0.0.8"
 end

data/lib/ocr-file.rb CHANGED Viewed

@@ -2,6 +2,7 @@ require 'hexapdf'
 require 'hexapdf/cli/images'
 require 'rtesseract'
 require 'mini_magick'
+require 'ruby-spellchecker'
 require 'ocr-file/version'

data/ocr-file.gemspec CHANGED Viewed

@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "hexapdf", "~> 0.23.0"
   spec.add_dependency "rtesseract", "~> 3.1.2"
   spec.add_dependency "mini_magick", "~> 4.11.0"
+  spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
   # Development Dependencies
   spec.add_development_dependency "pry", "~> 0.14.1"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ocr-file
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.8
 platform: ruby
 authors:
 - trex22
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-06-20 00:00:00.000000000 Z
+date: 2022-07-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: console-style
@@ -80,6 +80,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 4.11.0
+- !ruby/object:Gem::Dependency
+  name: ruby-spellchecker
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.5
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.1.5
 - !ruby/object:Gem::Dependency
   name: pry
   requirement: !ruby/object:Gem::Requirement