RubyGems - ocr-file - Versions diffs - 0.0.1 → 0.0.2 - Mend

ocr-file 0.0.1 → 0.0.2

Files changed (10) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +21 -5
data/bin/ocr-file +1 -1
data/lib/ocr-file/cli.rb +37 -1
data/lib/ocr-file/document.rb +40 -17
data/lib/ocr-file/image_engines/image_magick.rb +66 -7
data/lib/ocr-file/image_engines/pdftoppm.rb +1 -1
data/lib/ocr-file/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6b558ce36c35e74b410f42928eae1a987485d1bbd64da77750574062bc05b91e
-  data.tar.gz: d906c620a02c5a2d139b3d89d05e9b3872ee6c929b4aa661b20f9033d8f3605f
+  metadata.gz: f188bc0b29f4232b379e5e15d924c57a64a1758f04d8e168d2a44a744d20d1af
+  data.tar.gz: 5b54d844f01a5a5249572dd0abc270ae1fb37ff0070df9ad47eb84cf5f233fe7
 SHA512:
-  metadata.gz: 81049908609ba3d622be2b6f99dabeca2960a455fa3d56ee1fca4c177c2ee4365281421c1128ec3fa5476d068daa53b3d7f7600c5fd1c31fcb5834ca688f9747
-  data.tar.gz: 1a7dcd56a7196694371abf70633635545138bdc7bc0af2873fc5e7c22bdbfc97e9986ba1a18afc288b24e488caf999b644ec1f9d8889ce6e5efa6fcfe776c204
+  metadata.gz: c51ab724a77e8b22568dc0c7cefcf3ba28407f7050976d6900824954221d4f04e677b31b58ae644c87752e60024e1667194eda8b00c89dfab30f9a81d53ba1d5
+  data.tar.gz: 9b521be6e75808899398e77cf0c0b9dee842350a5c81c0ba513ad56125725607906c8c19e6b493201750ba331521db4ba247723a1c09d82dfb61e8caec857428

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ocr-file (0.0.1)
+    ocr-file (0.0.2)
       active_attr (~> 0.15.4)
       console-style (~> 0.0.1)
       hexapdf (~> 0.23.0)

data/README.md CHANGED Viewed

@@ -42,9 +42,8 @@ You will need to install `tesseract` with your desired language on your system,
     type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
     ocr_engine: 'tesseract', # 'cloud-vision'
     # Image Pre-Processing
-    image_pre_preprocess: true,
-    effects: ['bw', 'norm'],
-    threshold: 0.25,
+    image_preprocess: true,
+    effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'], # Applies effects as listed. 'norm' is also available
     # PDF to Image Processing
     optimise_pdf: true,
     extract_pdf_images: true, # if false will screenshot each PDF page
@@ -84,7 +83,21 @@ You will need to install `tesseract` with your desired language on your system,
 ### Notes / Tips
 Set `extract_pdf_images` to `false` for higher quality OCR. However this will consume more temporary space per PDF page and also be considerably slower.
-Image pre-processing is not yet implemented.
+Image pre-processing only thresholds (bw), normalises the colour space, removes speckles and tries to straighten the image. This makes the end result black and white, but gives far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary.
+### Simple CLI
+Once installed, you can use `ocr-file` as a CLI. It currently offers a reduced set of options, which are subject to change in future versions.
+```
+# Basic Usage with console output
+ocr-file input_file_path output_folder_path
+# Output to PDF
+ocr-file input_file_path output_folder_path pdf
+# Output to TXT
+ocr-file input_file_path output_folder_path txt
+```
 ## Development
@@ -94,7 +107,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 ### TODOs
 - input validation
-- CLI
+- Better CLI
 - image processing
 - password
 - Base64 encoding
@@ -102,6 +115,9 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 - Tests
 - Configurable temp folder cleanup
 - Improve console output
+- Fix spaces in file names
+- Better verbosity
+- Timing
 ### Tests
 To run tests execute:

data/bin/ocr-file CHANGED Viewed

@@ -2,4 +2,4 @@
 require 'ocr-file'
-puts "Hello, world!"
+OcrFile::Cli.new(ARGV).call

data/lib/ocr-file/cli.rb CHANGED Viewed

@@ -1,5 +1,41 @@
 module OcrFile
-  module Cli
+  class Cli
+    attr_reader :args
+    def initialize(args)
+      @args = args
+    end
+    def valid?
+      return true if args.size == 2 || args.size == 3
+      false
+    end
+    def invalid?
+      !valid?
+    end
+    def call
+      # TODO: Use ConsoleStyle::Functions
+      # TODO: Heading and better CLI interface
+      # Simple cli for now
+      puts "OCR Tool © Jason Chalom 2022, Version: #{OcrFile::VERSION}"
+      abort "File path, Save Folder Paths, and output type (pdf, txt) are required!" if invalid?
+      # Using default config for now
+      original_file_path = args[0]
+      save_file_path = args[1]
+      output_type = args[2]
+      document = OcrFile::Document.new(original_file_path: original_file_path, save_file_path: save_file_path)
+      if output_type.to_s.downcase.include?('pdf')
+        document.to_pdf
+      elsif output_type.to_s.downcase.include?('txt') || output_type.to_s.downcase.include?('text')
+        document.to_text
+      else # Display in console
+        puts document.to_s
+      end
+    end
   end
 end

data/lib/ocr-file/document.rb CHANGED Viewed

@@ -1,5 +1,8 @@
 module OcrFile
   class Document
+    # TODO: Skewness / text orientation detection
+    # TODO: Better handwriting analysis
     ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
     PAGE_BREAK = "\n\r\n" # TODO: Make configurable
     DEFAULT_CONFIG = {
@@ -18,9 +21,8 @@ module OcrFile
       type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
       ocr_engine: 'tesseract', # 'cloud-vision'
       # Image Pre-Processing
-      image_pre_preprocess: true,
-      effects: ['bw', 'norm'],
-      threshold: 0.25,
+      image_preprocess: true,
+      effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'],
       # PDF to Image Processing
       optimise_pdf: true,
       extract_pdf_images: true, # if false will screenshot each PDF page
@@ -52,12 +54,12 @@ module OcrFile
     end
     def pdf?
-      @original_file_path.include?('.pdf')
+      @original_file_path.downcase.include?('.pdf')
     end
     def image?
       return false if pdf?
-      ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.include?(".#{type}")}
+      ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
     end
     # Treat anything which isn't a PDF or image as text
@@ -65,6 +67,7 @@ module OcrFile
       !pdf? && !image?
     end
+    # Trigger OCR pipeline
     def to_pdf
       if pdf?
         create_temp_folder
@@ -73,15 +76,13 @@ module OcrFile
         pdfs_to_merge = []
         image_paths.each do |image_path|
-          pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config)
+          pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
         end
         merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
         OcrFile::ImageEngines::PdfEngine
           .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
-        close
       elsif text?
         text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
         pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
@@ -91,6 +92,8 @@ module OcrFile
       else # is an image
         ocr_image_to_pdf
       end
+      close
     end
     def to_text
@@ -99,16 +102,16 @@ module OcrFile
         image_paths = extract_image_paths_from_pdf(@original_file_path)
         image_paths.each do |image_path|
-          text = @ocr_engine.ocr_to_text(image_path, options: @config)
+          text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
           ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
         end
-        close
       elsif text?
         ::OcrFile::FileHelpers.open_text_file(@original_file_path)
       else # is an image
         ocr_image_to_text(save: true)
       end
+      close
     end
     def to_s
@@ -119,7 +122,7 @@ module OcrFile
         text = ''
         image_paths.each do |image_path|
-          text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
+          text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
         end
         close
@@ -127,7 +130,10 @@ module OcrFile
       elsif text?
         ::OcrFile::FileHelpers.open_text_file(@original_file_path)
       else # is an image
-        ocr_image_to_text(save: false)
+        text = ocr_image_to_text(save: false)
+        close
+        text
       end
     end
@@ -157,19 +163,36 @@ module OcrFile
     end
     def create_temp_folder
-      # TODO: Make this a bit more robust
-      @temp_folder_path = "#{save_file_path}/temp/".gsub(' ', '\ ')
+      date = Time.now.to_s.split(' ').first
+      @temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ')
       ::OcrFile::FileHelpers.make_directory(@temp_folder_path)
     end
+    def process_image(path)
+      return path unless @config[:image_preprocess]
+      create_temp_folder
+      save_file_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}"
+      image_processor = OcrFile::ImageEngines::ImageMagick.new(
+        image_path: path,
+        temp_path: @temp_folder_path,
+        save_file_path: save_file_path,
+        config: @config
+      )
+      image_processor.convert!
+    end
     def ocr_image_to_pdf
-      pdf_document = @ocr_engine.ocr_to_pdf(@original_file_path, options: @config)
+      pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
       OcrFile::ImageEngines::PdfEngine
         .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
     end
     def ocr_image_to_text(save: true)
-      text = @ocr_engine.ocr_to_text(@original_file_path, options: @config)
+      text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
       if save
         ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)

data/lib/ocr-file/image_engines/image_magick.rb CHANGED Viewed

@@ -1,14 +1,73 @@
 module OcrFile
   module ImageEngines
-    module ImageMagick
-      extend self
+    class ImageMagick
       # TODO:
-      # B/W
-      # Contrast
-      # Image Norm
-      # Threshold
       # Conversion of image types
+      # Rotation and detection of skew
+      attr_reader :image_path, :image, :temp_path, :save_file_path, :config
+      def initialize(image_path:, temp_path:, save_file_path:, config:)
+        @image_path = image_path
+        @config = config
+        @save_file_path = save_file_path
+        @temp_path = temp_path
+        # Will be available in the next version of MiniMagick > 4.11.0
+        # https://github.com/minimagick/minimagick/pull/541
+        # MiniMagick.configure do |config|
+        #   # cli_version  graphicsmagick?  imagemagick7?  imagemagick? version
+        #   config.tmpdir = File.join(Dir.tmpdir, @temp_path)
+        # end
+        @image = MiniMagick::Image.open(image_path)
+      end
+      def convert!
+        return @image_path unless @config[:image_preprocess]
+        @config[:effects].each do |effect|
+          self.send(effect.to_sym)
+        end
+        save!
+      end
+      def save!
+        image.write(@save_file_path)
+        @save_file_path
+      end
+      # Effects
+      # http://www.imagemagick.org/script/command-line-options.php
+      def bw
+        @image.alpha('off')
+        @image.auto_threshold("otsu")
+      end
+      def enhance
+        @image.enhance
+      end
+      def norm
+        @image.equalize
+      end
+      # Most likely not going to be configurable because
+      # these are aggressive parameters used to optimise OCR results
+      # and not the final results of the PDFs
+      def sharpen
+        @image.sharpen('0x4') # radiusXsigma
+      end
+      def deskew
+        @image.deskew('40%') # threshold recommended in the docs
+      end
+      def despeckle
+        @image.despeckle
+      end
     end
   end
 end

data/lib/ocr-file/image_engines/pdftoppm.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module OcrFile
         print 'Generating screenshots of each PDF page ... '
         if filetype == 'jpg'
-          `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
+          `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} "#{pdf_path}" "#{save_path}/#{filename}"`
         else
           `pdftoppm -#{filetype} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
         end

data/lib/ocr-file/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OcrFile
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ocr-file
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - trex22
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-06-19 00:00:00.000000000 Z
+date: 2022-06-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: console-style