RubyGems - ocr-file - Versions diffs - 0.0.3 → 0.0.4 - Mend

ocr-file 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +4 -2
data/lib/ocr-file/document.rb +105 -50
data/lib/ocr-file/text_engines/result_processor.rb +34 -0
data/lib/ocr-file/version.rb +1 -1
data/lib/ocr-file.rb +1 -0
metadata +2 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0e67553a31e82eba190368040d3475b812e113aedfb9994484043dda34a55053
-  data.tar.gz: 6fe5e142fef4387fc98fce57d3fdb2b7a0c37199d1712bd1d85dced9a0e61274
+  metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
+  data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
 SHA512:
-  metadata.gz: e5d06cf54a8bc96c90522ab67530310730230067ee226f6eb1143adde2ccb407dde25aef7b595836478ee944e4e9b3ff306b4df5a08ec14ab6623ab08daefa8b
-  data.tar.gz: 45a7c3d06908c878f281db9baf4ec82310ecde20e12cad5ff4cc03d2f271167d46fa52145fe598f594a3360a525c926d955bb08d17e740ba78f97ec72f0f4b47
+  metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
+  data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ocr-file (0.0.2)
+    ocr-file (0.0.4)
       active_attr (~> 0.15.4)
       console-style (~> 0.0.1)
       hexapdf (~> 0.23.0)

data/README.md CHANGED Viewed

@@ -44,12 +44,14 @@ You will need to install `tesseract` with your desired language on your system,
     # Image Pre-Processing
     image_preprocess: true,
     effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
+    automatic_reprocess: true, # Will possibly do double + the operations but can produce better results automatically
     # PDF to Image Processing
     optimise_pdf: true,
     extract_pdf_images: true, # if false will screenshot each PDF page
     temp_filename_prefix: 'image',
     # Console Output
     verbose: true,
+    timing: true,
   }
   doc = OcrFile::Document.new(
@@ -85,6 +87,8 @@ Set `extract_pdf_images` to `false` for higher quality OCR. However this will co
 Image pre-processing only thresholds (bw), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. Will make the end result Black and White but have far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
+`automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
 ### Simple CLI
 Once installed you can use `ocr-file` as a CLI. Its currently a reduced set of options. These are subject to change in future versions
@@ -108,7 +112,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 ### TODOs
 - input validation
 - Better CLI
-- image processing
 - password
 - Base64 encoding
 - requirements checking (installed dependencies etc ...)
@@ -117,7 +120,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 - Improve console output
 - Fix spaces in file names
 - Better verbosity
-- Timing
 ### Tests
 To run tests execute:

data/lib/ocr-file/document.rb CHANGED Viewed

@@ -5,6 +5,7 @@ module OcrFile
     ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
     PAGE_BREAK = "\n\r\n" # TODO: Make configurable
+    EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
     DEFAULT_CONFIG = {
       # Images from PDF
       filetype: 'png',
@@ -23,12 +24,14 @@ module OcrFile
       # Image Pre-Processing
       image_preprocess: true,
       effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
+      automatic_reprocess: true,
       # PDF to Image Processing
       optimise_pdf: true,
       extract_pdf_images: true, # if false will screenshot each PDF page
       temp_filename_prefix: 'image',
       # Console Output
       verbose: true,
+      timing: true,
     }
     attr_reader :original_file_path,
@@ -36,7 +39,9 @@ module OcrFile
       :save_file_path,
       :final_save_file,
       :config,
-      :ocr_engine
+      :ocr_engine,
+      :start_time,
+      :end_time
     # save_file_path will also generate a tmp path for tmp files. Expected folder path
     # TODO: Add in more input validation
@@ -69,76 +74,50 @@ module OcrFile
     # Trigger OCR pipeline
     def to_pdf
-      if pdf?
-        create_temp_folder
-        image_paths = extract_image_paths_from_pdf(@original_file_path)
-        pdfs_to_merge = []
-        image_paths.each do |image_path|
-          pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
-        end
-        merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
+      @start_time = Time.now
+      find_best_image_processing if config[:automatic_reprocess] && !text?
-        OcrFile::ImageEngines::PdfEngine
-          .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+      if pdf?
+        ocr_pdf_to_searchable_pdf
       elsif text?
-        text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
-        pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
-        OcrFile::ImageEngines::PdfEngine
-          .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+        text_to_pdf
       else # is an image
         ocr_image_to_pdf
       end
       close
+      @end_time = Time.now
+      print_time
     end
     def to_text
-      if pdf?
-        create_temp_folder
-        image_paths = extract_image_paths_from_pdf(@original_file_path)
-        image_paths.each do |image_path|
-          text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
-          ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
-        end
-      elsif text?
-        ::OcrFile::FileHelpers.open_text_file(@original_file_path)
-      else # is an image
-        ocr_image_to_text(save: true)
-      end
+      @start_time = Time.now
+      return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
+      find_best_image_processing(save: true)
       close
+      @end_time = Time.now
+      print_time
     end
     def to_s
-      if pdf?
-        create_temp_folder
-        image_paths = extract_image_paths_from_pdf(@original_file_path)
+      @start_time = Time.now
+      return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
-        text = ''
+      text = find_best_image_processing(save: false)
-        image_paths.each do |image_path|
-          text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
-        end
+      close
-        close
-        text
-      elsif text?
-        ::OcrFile::FileHelpers.open_text_file(@original_file_path)
-      else # is an image
-        text = ocr_image_to_text(save: false)
+      @end_time = Time.now
+      print_time
-        close
-        text
-      end
+      text
     end
     def close
-      ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
+      # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
     end
     private
@@ -185,13 +164,57 @@ module OcrFile
       image_processor.convert!
     end
+    def ocr_pdf_to_searchable_pdf
+      create_temp_folder
+      image_paths = extract_image_paths_from_pdf(@original_file_path)
+      pdfs_to_merge = []
+      image_paths.each do |image_path|
+        pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
+      end
+      merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
+      OcrFile::ImageEngines::PdfEngine
+        .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+    end
+    def text_to_pdf
+      text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
+      pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
+      OcrFile::ImageEngines::PdfEngine
+        .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+    end
     def ocr_image_to_pdf
+      find_best_image_processing if config[:automatic_reprocess]
       pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
       OcrFile::ImageEngines::PdfEngine
         .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
     end
-    def ocr_image_to_text(save: true)
+    def ocr_pdf_to_text(save:)
+      create_temp_folder
+      image_paths = extract_image_paths_from_pdf(@original_file_path)
+      text = ''
+      image_paths.each do |image_path|
+        text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
+      end
+      if save
+        ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
+      else
+        text
+      end
+    end
+    def ocr_image_to_text(save:)
+      create_temp_folder
       text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
       if save
@@ -201,6 +224,38 @@ module OcrFile
       end
     end
+    def ocr_file_to_text(save:)
+      if pdf? &&
+        ocr_pdf_to_text(save: save)
+      else # is an image
+        ocr_image_to_text(save: save)
+      end
+    end
+    def find_best_image_processing(save:)
+      ocr_file_to_text(save: save) if !config[:automatic_reprocess]
+      text = ''
+      effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
+      effects_to_test.each do |effect|
+        config[:effects] = config[:effects] - [effect]
+        text = ocr_file_to_text(save: false)
+        break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
+      end
+      # Adds in extra operations which is unfortunately inefficient
+      if save
+        ocr_file_to_text(save: save)
+      else
+        text
+      end
+    end
+    def print_time
+      puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
+    end
     def find_ocr_engine(engine_id)
       ocr_engine_constants
         .map { |c| ocr_module(c) }

data/lib/ocr-file/text_engines/result_processor.rb ADDED Viewed

@@ -0,0 +1,34 @@
+module OcrFile
+  module TextEngines
+    class ResultProcessor
+      MINIMUM_WORD_LENGTH = 3
+      attr_reader :text, :clear_text
+      def initialize(text)
+        @text = text
+        @clear_text = remove_lines
+      end
+      # This is a very naive way of determining if we should re-do OCR with
+      # shifted options
+      def valid_words?
+        word_size_average >= MINIMUM_WORD_LENGTH
+      end
+      def word_count
+        @_word_count ||= clear_text.split(' ').size
+      end
+      def word_size_average
+        @_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
+      end
+      private
+      def remove_lines
+        text.gsub("\n", ' ').gsub("\r", ' ').gsub('  ', '')
+      end
+    end
+  end
+end

data/lib/ocr-file/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OcrFile
-  VERSION = "0.0.3"
+  VERSION = "0.0.4"
 end

data/lib/ocr-file.rb CHANGED Viewed

@@ -10,6 +10,7 @@ require 'ocr-file/image_engines/image_magick'
 require 'ocr-file/image_engines/pdftoppm'
 require 'ocr-file/ocr_engines/tesseract'
 require 'ocr-file/ocr_engines/cloud_vision'
+require 'ocr-file/text_engines/result_processor'
 require 'ocr-file/file_helpers'
 require 'ocr-file/document'
 require 'ocr-file/cli'

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ocr-file
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - trex22
@@ -122,6 +122,7 @@ files:
 - lib/ocr-file/image_engines/pdftoppm.rb
 - lib/ocr-file/ocr_engines/cloud_vision.rb
 - lib/ocr-file/ocr_engines/tesseract.rb
+- lib/ocr-file/text_engines/result_processor.rb
 - lib/ocr-file/version.rb
 - ocr-file.gemspec
 homepage: https://github.com/TRex22/ocr-file