ocr-file 0.0.1 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +24 -6
- data/bin/ocr-file +1 -1
- data/lib/ocr-file/cli.rb +37 -1
- data/lib/ocr-file/document.rb +137 -59
- data/lib/ocr-file/image_engines/image_magick.rb +73 -7
- data/lib/ocr-file/image_engines/pdftoppm.rb +1 -1
- data/lib/ocr-file/text_engines/result_processor.rb +34 -0
- data/lib/ocr-file/version.rb +1 -1
- data/lib/ocr-file.rb +1 -0
- metadata +3 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
         | 
| 4 | 
            +
              data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
         | 
| 7 | 
            +
              data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
         | 
    
        data/Gemfile.lock
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | @@ -42,15 +42,16 @@ You will need to install `tesseract` with your desired language on your system, | |
| 42 42 | 
             
                type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
         | 
| 43 43 | 
             
                ocr_engine: 'tesseract', # 'cloud-vision'
         | 
| 44 44 | 
             
                # Image Pre-Processing
         | 
| 45 | 
            -
                 | 
| 46 | 
            -
                effects: ['bw', 'norm' | 
| 47 | 
            -
                 | 
| 45 | 
            +
                image_preprocess: true,
         | 
| 46 | 
            +
                effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
         | 
| 47 | 
            +
                automatic_reprocess: true, # Will possibly do double + the operations but can produce better results automatically
         | 
| 48 48 | 
             
                # PDF to Image Processing
         | 
| 49 49 | 
             
                optimise_pdf: true,
         | 
| 50 50 | 
             
                extract_pdf_images: true, # if false will screenshot each PDF page
         | 
| 51 51 | 
             
                temp_filename_prefix: 'image',
         | 
| 52 52 | 
             
                # Console Output
         | 
| 53 53 | 
             
                verbose: true,
         | 
| 54 | 
            +
                timing: true,
         | 
| 54 55 | 
             
              }
         | 
| 55 56 |  | 
| 56 57 | 
             
              doc = OcrFile::Document.new(
         | 
| @@ -84,7 +85,23 @@ You will need to install `tesseract` with your desired language on your system, | |
| 84 85 | 
             
            ### Notes / Tips
         | 
| 85 86 | 
             
            Set `extract_pdf_images` to `false` for higher quality OCR. However this will consume more temporary space per PDF page and also be considerably slower.
         | 
| 86 87 |  | 
| 87 | 
            -
            Image pre-processing is  | 
| 88 | 
            +
            Image pre-processing only thresholds (`bw`), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. It will make the end result black and white but gives far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
         | 
| 89 | 
            +
             | 
| 90 | 
            +
            `automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
         | 
| 91 | 
            +
             | 
| 92 | 
            +
            ### Simple CLI
         | 
| 93 | 
            +
            Once installed you can use `ocr-file` as a CLI. It's currently a reduced set of options. These are subject to change in future versions.
         | 
| 94 | 
            +
             | 
| 95 | 
            +
            ```
         | 
| 96 | 
            +
            # Basic Usage with console output
         | 
| 97 | 
            +
            ocr-file input_file_path output_folder_path
         | 
| 98 | 
            +
             | 
| 99 | 
            +
            # Output to PDF
         | 
| 100 | 
            +
            ocr-file input_file_path output_folder_path pdf
         | 
| 101 | 
            +
             | 
| 102 | 
            +
            # Output to TXT
         | 
| 103 | 
            +
            ocr-file input_file_path output_folder_path txt
         | 
| 104 | 
            +
            ```
         | 
| 88 105 |  | 
| 89 106 | 
             
            ## Development
         | 
| 90 107 |  | 
| @@ -94,14 +111,15 @@ To install this gem onto your local machine, run `bundle exec rake install`. To | |
| 94 111 |  | 
| 95 112 | 
             
            ### TODOs
         | 
| 96 113 | 
             
            - input validation
         | 
| 97 | 
            -
            - CLI
         | 
| 98 | 
            -
            - image processing
         | 
| 114 | 
            +
            - Better CLI
         | 
| 99 115 | 
             
            - password
         | 
| 100 116 | 
             
            - Base64 encoding
         | 
| 101 117 | 
             
            - requirements checking (installed dependencies etc ...)
         | 
| 102 118 | 
             
            - Tests
         | 
| 103 119 | 
             
            - Configurable temp folder cleanup
         | 
| 104 120 | 
             
            - Improve console output
         | 
| 121 | 
            +
            - Fix spaces in file names
         | 
| 122 | 
            +
            - Better verbosity
         | 
| 105 123 |  | 
| 106 124 | 
             
            ### Tests
         | 
| 107 125 | 
             
            To run tests execute:
         | 
    
        data/bin/ocr-file
    CHANGED
    
    
    
        data/lib/ocr-file/cli.rb
    CHANGED
    
    | @@ -1,5 +1,41 @@ | |
| 1 1 | 
             
            module OcrFile
         | 
| 2 | 
            -
               | 
| 2 | 
            +
              class Cli
         | 
| 3 | 
            +
                attr_reader :args
         | 
| 3 4 |  | 
| 5 | 
            +
                def initialize(args)
         | 
| 6 | 
            +
                  @args = args
         | 
| 7 | 
            +
                end
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                def valid?
         | 
| 10 | 
            +
                  return true if args.size == 2 || args.size == 3
         | 
| 11 | 
            +
                  false
         | 
| 12 | 
            +
                end
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                def invalid?
         | 
| 15 | 
            +
                  !valid?
         | 
| 16 | 
            +
                end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                def call
         | 
| 19 | 
            +
                  # TODO: Use ConsoleStyle::Functions
         | 
| 20 | 
            +
                  # TODO: Heading and better CLI interface
         | 
| 21 | 
            +
                  # Simple cli for now
         | 
| 22 | 
            +
                  puts "OCR Tool © Jason Chalom 2022, Version: #{OcrFile::VERSION}"
         | 
| 23 | 
            +
                  abort "File path, Save Folder Paths, and output type (pdf, txt) are required!" if invalid?
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                  # Using default config for now
         | 
| 26 | 
            +
                  original_file_path = args[0]
         | 
| 27 | 
            +
                  save_file_path = args[1]
         | 
| 28 | 
            +
                  output_type = args[2]
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                  document = OcrFile::Document.new(original_file_path: original_file_path, save_file_path: save_file_path)
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                  if output_type.to_s.downcase.include?('pdf')
         | 
| 33 | 
            +
                    document.to_pdf
         | 
| 34 | 
            +
                  elsif output_type.to_s.downcase.include?('txt') || output_type.to_s.downcase.include?('text')
         | 
| 35 | 
            +
                    document.to_text
         | 
| 36 | 
            +
                  else # Display in console
         | 
| 37 | 
            +
                    puts document.to_s
         | 
| 38 | 
            +
                  end
         | 
| 39 | 
            +
                end
         | 
| 4 40 | 
             
              end
         | 
| 5 41 | 
             
            end
         | 
    
        data/lib/ocr-file/document.rb
    CHANGED
    
    | @@ -1,7 +1,11 @@ | |
| 1 1 | 
             
            module OcrFile
         | 
| 2 2 | 
             
              class Document
         | 
| 3 | 
            +
                # TODO: Skewness / text orientation detection
         | 
| 4 | 
            +
                # TODO: Better handwriting analysis
         | 
| 5 | 
            +
             | 
| 3 6 | 
             
                ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
         | 
| 4 7 | 
             
                PAGE_BREAK = "\n\r\n" # TODO: Make configurable
         | 
| 8 | 
            +
                EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
         | 
| 5 9 | 
             
                DEFAULT_CONFIG = {
         | 
| 6 10 | 
             
                  # Images from PDF
         | 
| 7 11 | 
             
                  filetype: 'png',
         | 
| @@ -18,15 +22,16 @@ module OcrFile | |
| 18 22 | 
             
                  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
         | 
| 19 23 | 
             
                  ocr_engine: 'tesseract', # 'cloud-vision'
         | 
| 20 24 | 
             
                  # Image Pre-Processing
         | 
| 21 | 
            -
                   | 
| 22 | 
            -
                  effects: [' | 
| 23 | 
            -
                   | 
| 25 | 
            +
                  image_preprocess: true,
         | 
| 26 | 
            +
                  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
         | 
| 27 | 
            +
                  automatic_reprocess: true,
         | 
| 24 28 | 
             
                  # PDF to Image Processing
         | 
| 25 29 | 
             
                  optimise_pdf: true,
         | 
| 26 30 | 
             
                  extract_pdf_images: true, # if false will screenshot each PDF page
         | 
| 27 31 | 
             
                  temp_filename_prefix: 'image',
         | 
| 28 32 | 
             
                  # Console Output
         | 
| 29 33 | 
             
                  verbose: true,
         | 
| 34 | 
            +
                  timing: true,
         | 
| 30 35 | 
             
                }
         | 
| 31 36 |  | 
| 32 37 | 
             
                attr_reader :original_file_path,
         | 
| @@ -34,7 +39,9 @@ module OcrFile | |
| 34 39 | 
             
                  :save_file_path,
         | 
| 35 40 | 
             
                  :final_save_file,
         | 
| 36 41 | 
             
                  :config,
         | 
| 37 | 
            -
                  :ocr_engine
         | 
| 42 | 
            +
                  :ocr_engine,
         | 
| 43 | 
            +
                  :start_time,
         | 
| 44 | 
            +
                  :end_time
         | 
| 38 45 |  | 
| 39 46 | 
             
                # save_file_path will also generate a tmp path for tmp files. Expected folder path
         | 
| 40 47 | 
             
                # TODO: Add in more input validation
         | 
| @@ -52,12 +59,12 @@ module OcrFile | |
| 52 59 | 
             
                end
         | 
| 53 60 |  | 
| 54 61 | 
             
                def pdf?
         | 
| 55 | 
            -
                  @original_file_path.include?('.pdf')
         | 
| 62 | 
            +
                  @original_file_path.downcase.include?('.pdf')
         | 
| 56 63 | 
             
                end
         | 
| 57 64 |  | 
| 58 65 | 
             
                def image?
         | 
| 59 66 | 
             
                  return false if pdf?
         | 
| 60 | 
            -
                  ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.include?(".#{type}")}
         | 
| 67 | 
            +
                  ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
         | 
| 61 68 | 
             
                end
         | 
| 62 69 |  | 
| 63 70 | 
             
                # Treat anything which isn't a PDF or image as text
         | 
| @@ -65,74 +72,52 @@ module OcrFile | |
| 65 72 | 
             
                  !pdf? && !image?
         | 
| 66 73 | 
             
                end
         | 
| 67 74 |  | 
| 75 | 
            +
                # Trigger OCR pipeline
         | 
| 68 76 | 
             
                def to_pdf
         | 
| 69 | 
            -
                   | 
| 70 | 
            -
             | 
| 71 | 
            -
                    image_paths = extract_image_paths_from_pdf(@original_file_path)
         | 
| 72 | 
            -
             | 
| 73 | 
            -
                    pdfs_to_merge = []
         | 
| 74 | 
            -
             | 
| 75 | 
            -
                    image_paths.each do |image_path|
         | 
| 76 | 
            -
                      pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config)
         | 
| 77 | 
            -
                    end
         | 
| 78 | 
            -
             | 
| 79 | 
            -
                    merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
         | 
| 77 | 
            +
                  @start_time = Time.now
         | 
| 78 | 
            +
                  find_best_image_processing if config[:automatic_reprocess] && !text?
         | 
| 80 79 |  | 
| 81 | 
            -
             | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
                    close
         | 
| 80 | 
            +
                  if pdf?
         | 
| 81 | 
            +
                    ocr_pdf_to_searchable_pdf
         | 
| 85 82 | 
             
                  elsif text?
         | 
| 86 | 
            -
                     | 
| 87 | 
            -
                    pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
         | 
| 88 | 
            -
             | 
| 89 | 
            -
                    OcrFile::ImageEngines::PdfEngine
         | 
| 90 | 
            -
                      .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
         | 
| 83 | 
            +
                    text_to_pdf
         | 
| 91 84 | 
             
                  else # is an image
         | 
| 92 85 | 
             
                    ocr_image_to_pdf
         | 
| 93 86 | 
             
                  end
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                  close
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                  @end_time = Time.now
         | 
| 91 | 
            +
                  print_time
         | 
| 94 92 | 
             
                end
         | 
| 95 93 |  | 
| 96 94 | 
             
                def to_text
         | 
| 97 | 
            -
                   | 
| 98 | 
            -
             | 
| 99 | 
            -
                    image_paths = extract_image_paths_from_pdf(@original_file_path)
         | 
| 95 | 
            +
                  @start_time = Time.now
         | 
| 96 | 
            +
                  return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
         | 
| 100 97 |  | 
| 101 | 
            -
             | 
| 102 | 
            -
             | 
| 103 | 
            -
                      ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
         | 
| 104 | 
            -
                    end
         | 
| 98 | 
            +
                  find_best_image_processing(save: true)
         | 
| 99 | 
            +
                  close
         | 
| 105 100 |  | 
| 106 | 
            -
             | 
| 107 | 
            -
                   | 
| 108 | 
            -
                    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
         | 
| 109 | 
            -
                  else # is an image
         | 
| 110 | 
            -
                    ocr_image_to_text(save: true)
         | 
| 111 | 
            -
                  end
         | 
| 101 | 
            +
                  @end_time = Time.now
         | 
| 102 | 
            +
                  print_time
         | 
| 112 103 | 
             
                end
         | 
| 113 104 |  | 
| 114 105 | 
             
                def to_s
         | 
| 115 | 
            -
                   | 
| 116 | 
            -
             | 
| 117 | 
            -
                    image_paths = extract_image_paths_from_pdf(@original_file_path)
         | 
| 106 | 
            +
                  @start_time = Time.now
         | 
| 107 | 
            +
                  return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
         | 
| 118 108 |  | 
| 119 | 
            -
             | 
| 109 | 
            +
                  text = find_best_image_processing(save: false)
         | 
| 120 110 |  | 
| 121 | 
            -
             | 
| 122 | 
            -
                      text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
         | 
| 123 | 
            -
                    end
         | 
| 111 | 
            +
                  close
         | 
| 124 112 |  | 
| 125 | 
            -
             | 
| 126 | 
            -
             | 
| 127 | 
            -
             | 
| 128 | 
            -
             | 
| 129 | 
            -
                  else # is an image
         | 
| 130 | 
            -
                    ocr_image_to_text(save: false)
         | 
| 131 | 
            -
                  end
         | 
| 113 | 
            +
                  @end_time = Time.now
         | 
| 114 | 
            +
                  print_time
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                  text
         | 
| 132 117 | 
             
                end
         | 
| 133 118 |  | 
| 134 119 | 
             
                def close
         | 
| 135 | 
            -
                  ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
         | 
| 120 | 
            +
                  # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
         | 
| 136 121 | 
             
                end
         | 
| 137 122 |  | 
| 138 123 | 
             
                private
         | 
| @@ -157,19 +142,80 @@ module OcrFile | |
| 157 142 | 
             
                end
         | 
| 158 143 |  | 
| 159 144 | 
             
                def create_temp_folder
         | 
| 160 | 
            -
                   | 
| 161 | 
            -
             | 
| 145 | 
            +
                  date = Time.now.to_s.split(' ').first
         | 
| 146 | 
            +
             | 
| 147 | 
            +
                  @temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ')
         | 
| 162 148 | 
             
                  ::OcrFile::FileHelpers.make_directory(@temp_folder_path)
         | 
| 163 149 | 
             
                end
         | 
| 164 150 |  | 
| 151 | 
            +
                def process_image(path)
         | 
| 152 | 
            +
                  return path unless @config[:image_preprocess]
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                  create_temp_folder
         | 
| 155 | 
            +
                  save_file_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}"
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                  image_processor = OcrFile::ImageEngines::ImageMagick.new(
         | 
| 158 | 
            +
                    image_path: path,
         | 
| 159 | 
            +
                    temp_path: @temp_folder_path,
         | 
| 160 | 
            +
                    save_file_path: save_file_path,
         | 
| 161 | 
            +
                    config: @config
         | 
| 162 | 
            +
                  )
         | 
| 163 | 
            +
             | 
| 164 | 
            +
                  image_processor.convert!
         | 
| 165 | 
            +
                end
         | 
| 166 | 
            +
             | 
| 167 | 
            +
                def ocr_pdf_to_searchable_pdf
         | 
| 168 | 
            +
                  create_temp_folder
         | 
| 169 | 
            +
                  image_paths = extract_image_paths_from_pdf(@original_file_path)
         | 
| 170 | 
            +
             | 
| 171 | 
            +
                  pdfs_to_merge = []
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                  image_paths.each do |image_path|
         | 
| 174 | 
            +
                    pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
         | 
| 175 | 
            +
                  end
         | 
| 176 | 
            +
             | 
| 177 | 
            +
                  merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
         | 
| 178 | 
            +
             | 
| 179 | 
            +
                  OcrFile::ImageEngines::PdfEngine
         | 
| 180 | 
            +
                    .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
         | 
| 181 | 
            +
                end
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                def text_to_pdf
         | 
| 184 | 
            +
                  text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
         | 
| 185 | 
            +
                  pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                  OcrFile::ImageEngines::PdfEngine
         | 
| 188 | 
            +
                    .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
         | 
| 189 | 
            +
                end
         | 
| 190 | 
            +
             | 
| 165 191 | 
             
                def ocr_image_to_pdf
         | 
| 166 | 
            -
                   | 
| 192 | 
            +
                  find_best_image_processing if config[:automatic_reprocess]
         | 
| 193 | 
            +
             | 
| 194 | 
            +
                  pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
         | 
| 167 195 | 
             
                  OcrFile::ImageEngines::PdfEngine
         | 
| 168 196 | 
             
                    .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
         | 
| 169 197 | 
             
                end
         | 
| 170 198 |  | 
| 171 | 
            -
                def  | 
| 172 | 
            -
                   | 
| 199 | 
            +
                def ocr_pdf_to_text(save:)
         | 
| 200 | 
            +
                  create_temp_folder
         | 
| 201 | 
            +
                  image_paths = extract_image_paths_from_pdf(@original_file_path)
         | 
| 202 | 
            +
             | 
| 203 | 
            +
                  text = ''
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                  image_paths.each do |image_path|
         | 
| 206 | 
            +
                    text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
         | 
| 207 | 
            +
                  end
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                  if save
         | 
| 210 | 
            +
                    ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
         | 
| 211 | 
            +
                  else
         | 
| 212 | 
            +
                    text
         | 
| 213 | 
            +
                  end
         | 
| 214 | 
            +
                end
         | 
| 215 | 
            +
             | 
| 216 | 
            +
                def ocr_image_to_text(save:)
         | 
| 217 | 
            +
                  create_temp_folder
         | 
| 218 | 
            +
                  text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
         | 
| 173 219 |  | 
| 174 220 | 
             
                  if save
         | 
| 175 221 | 
             
                    ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
         | 
| @@ -178,6 +224,38 @@ module OcrFile | |
| 178 224 | 
             
                  end
         | 
| 179 225 | 
             
                end
         | 
| 180 226 |  | 
| 227 | 
            +
                def ocr_file_to_text(save:)
         | 
| 228 | 
            +
                  if pdf? &&
         | 
| 229 | 
            +
                    ocr_pdf_to_text(save: save)
         | 
| 230 | 
            +
                  else # is an image
         | 
| 231 | 
            +
                    ocr_image_to_text(save: save)
         | 
| 232 | 
            +
                  end
         | 
| 233 | 
            +
                end
         | 
| 234 | 
            +
             | 
| 235 | 
            +
                def find_best_image_processing(save:)
         | 
| 236 | 
            +
                  ocr_file_to_text(save: save) if !config[:automatic_reprocess]
         | 
| 237 | 
            +
             | 
| 238 | 
            +
                  text = ''
         | 
| 239 | 
            +
                  effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
         | 
| 240 | 
            +
                  effects_to_test.each do |effect|
         | 
| 241 | 
            +
                    config[:effects] = config[:effects] - [effect]
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                    text = ocr_file_to_text(save: false)
         | 
| 244 | 
            +
                    break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
         | 
| 245 | 
            +
                  end
         | 
| 246 | 
            +
             | 
| 247 | 
            +
                  # Adds in extra operations which is unfortunately inefficient
         | 
| 248 | 
            +
                  if save
         | 
| 249 | 
            +
                    ocr_file_to_text(save: save)
         | 
| 250 | 
            +
                  else
         | 
| 251 | 
            +
                    text
         | 
| 252 | 
            +
                  end
         | 
| 253 | 
            +
                end
         | 
| 254 | 
            +
             | 
| 255 | 
            +
                def print_time
         | 
| 256 | 
            +
                  puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
         | 
| 257 | 
            +
                end
         | 
| 258 | 
            +
             | 
| 181 259 | 
             
                def find_ocr_engine(engine_id)
         | 
| 182 260 | 
             
                  ocr_engine_constants
         | 
| 183 261 | 
             
                    .map { |c| ocr_module(c) }
         | 
| @@ -1,14 +1,80 @@ | |
| 1 1 | 
             
            module OcrFile
         | 
| 2 2 | 
             
              module ImageEngines
         | 
| 3 | 
            -
                 | 
| 4 | 
            -
                  extend self
         | 
| 5 | 
            -
             | 
| 3 | 
            +
                class ImageMagick
         | 
| 6 4 | 
             
                  # TODO:
         | 
| 7 | 
            -
                  # B/W
         | 
| 8 | 
            -
                  # Contrast
         | 
| 9 | 
            -
                  # Image Norm
         | 
| 10 | 
            -
                  # Threshold
         | 
| 11 5 | 
             
                  # Conversion of image types
         | 
| 6 | 
            +
                  # Rotation and detection of skew
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                  attr_reader :image_path, :image, :temp_path, :save_file_path, :config
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                  def initialize(image_path:, temp_path:, save_file_path:, config:)
         | 
| 11 | 
            +
                    @image_path = image_path
         | 
| 12 | 
            +
                    @config = config
         | 
| 13 | 
            +
                    @save_file_path = save_file_path
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                    @temp_path = temp_path
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                    # Will be available in the next version of MiniMagick > 4.11.0
         | 
| 18 | 
            +
                    # https://github.com/minimagick/minimagick/pull/541
         | 
| 19 | 
            +
                    # MiniMagick.configure do |config|
         | 
| 20 | 
            +
                    #   # cli_version  graphicsmagick?  imagemagick7?  imagemagick? version
         | 
| 21 | 
            +
                    #   config.tmpdir = File.join(Dir.tmpdir, @temp_path)
         | 
| 22 | 
            +
                    # end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                    @image = MiniMagick::Image.open(image_path)
         | 
| 25 | 
            +
                  end
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                  # Applies each configured effect to the image, in order, then saves it.
                  #
                  # Returns @image_path unchanged when @config[:image_preprocess] is falsy,
                  # otherwise whatever #save! returns.
                  #
                  # Effect names come from configuration, so each one is checked against
                  # the effect methods of this class before it is dispatched. Anything
                  # else raises NoMethodError instead of reaching arbitrary methods
                  # (Kernel#exit, Kernel#system, ...) through an unrestricted `send`.
                  # New effect methods must be added to the list below.
                  def convert!
                    return @image_path unless @config[:image_preprocess]

                    allowed = %i[bw enhance norm sharpen remove_shadow deskew despeckle]

                    # Array() makes a missing :effects entry mean "no effects".
                    Array(@config[:effects]).each do |effect|
                      name = effect.to_sym
                      raise NoMethodError, "unknown image effect: #{effect}" unless allowed.include?(name)

                      public_send(name)
                    end

                    save!
                  end
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                  # Writes the current image to @save_file_path and returns that path.
                  def save!
                    destination = @save_file_path
                    @image.write(destination)
                    destination
                  end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                  # Effects
         | 
| 43 | 
            +
                  # http://www.imagemagick.org/script/command-line-options.php
         | 
| 44 | 
            +
                  # Black/white conversion: turns the alpha channel off, then applies
                  # auto-threshold with the "otsu" method.
                  #
                  # Both options go through combine_options so they are built into one
                  # command instead of one image-level call (and one rewrite of the
                  # working file) per option.
                  def bw
                    @image.combine_options do |cmd|
                      cmd.alpha('off')
                      cmd.auto_threshold('otsu')
                    end
                  end
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                  # Runs the `enhance` operation on the loaded image.
                  def enhance
                    image.enhance
                  end
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                  # "Normalises" the loaded image by running the `equalize` operation on it.
                  def norm
                    image.equalize
                  end
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                  # Most likely not going to be configurable because
         | 
| 58 | 
            +
                  # these are aggressive parameters used to optimise OCR results
         | 
| 59 | 
            +
                  # and not the final results of the PDFs
         | 
| 60 | 
            +
                  # Sharpens the loaded image with a fixed geometry.
                  def sharpen
                    radius_x_sigma = '0x4' # geometry format is radiusXsigma
                    image.sharpen(radius_x_sigma)
                  end
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                  # https://github.com/ImageMagick/ImageMagick/discussions/4145
         | 
| 65 | 
            +
                  # Shadow removal: negate, local adaptive threshold (-lat 20x20+10%),
                  # negate again. Approach taken from
                  # https://github.com/ImageMagick/ImageMagick/discussions/4145
                  #
                  # The three options go through combine_options so they are built into
                  # one command, in this order, instead of three separate image-level
                  # calls that each rewrite the working file.
                  def remove_shadow
                    @image.combine_options do |cmd|
                      cmd.negate
                      cmd.lat('20x20+10%')
                      cmd.negate
                    end
                  end
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                  # Straightens the loaded image via the `deskew` operation.
                  def deskew
                    threshold = '40%' # threshold recommended in the docs
                    image.deskew(threshold)
                  end
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                  # Runs the `despeckle` operation on the loaded image.
                  def despeckle
                    image.despeckle
                  end
         | 
| 12 78 | 
             
                end
         | 
| 13 79 | 
             
              end
         | 
| 14 80 | 
             
            end
         | 
| @@ -13,7 +13,7 @@ module OcrFile | |
| 13 13 | 
             
                    print 'Generating screenshots of each PDF page ... '
         | 
| 14 14 |  | 
| 15 15 | 
             
                    if filetype == 'jpg'
         | 
| 16 | 
            -
                      `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
         | 
| 16 | 
            +
                      `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} "#{pdf_path}" "#{save_path}/#{filename}"`
         | 
| 17 17 | 
             
                    else
         | 
| 18 18 | 
             
                      `pdftoppm -#{filetype} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
         | 
| 19 19 | 
             
                    end
         | 
| @@ -0,0 +1,34 @@ | |
| 1 | 
            +
            module OcrFile
              module TextEngines
                # Cheap sanity statistics over OCR output: word count and average word
                # length of the text with line breaks flattened away.
                class ResultProcessor
                  # Average word length below which the text is considered noise.
                  MINIMUM_WORD_LENGTH = 3

                  # text       - the value passed to the constructor, untouched
                  # clear_text - text as a single line, words separated by one space
                  attr_reader :text, :clear_text

                  # text - raw OCR output; nil is treated as empty text.
                  def initialize(text)
                    @text = text
                    @clear_text = remove_lines
                  end

                  # This is a very naive way of determining if we should re-do OCR with
                  # shifted options: true when the average word length reaches
                  # MINIMUM_WORD_LENGTH. Text without any words is never valid.
                  def valid_words?
                    word_size_average >= MINIMUM_WORD_LENGTH
                  end

                  # Number of whitespace-separated words in clear_text.
                  def word_count
                    @_word_count ||= words.size
                  end

                  # Average word length, truncated to an Integer. Returns 0 when there
                  # are no words rather than dividing by zero.
                  def word_size_average
                    @_word_size_average ||= word_count.zero? ? 0 : words.sum(&:size) / word_count
                  end

                  private

                  # Memoised word list shared by word_count and word_size_average.
                  def words
                    @_words ||= clear_text.split(' ')
                  end

                  # Collapses every run of whitespace (line feeds, carriage returns,
                  # repeated spaces) into ONE space. Deleting double spaces outright
                  # would glue the words on either side of a CRLF or blank line together
                  # and skew both statistics.
                  def remove_lines
                    text.to_s.split.join(' ')
                  end
                end
              end
            end
         | 
    
        data/lib/ocr-file/version.rb
    CHANGED
    
    
    
        data/lib/ocr-file.rb
    CHANGED
    
    | @@ -10,6 +10,7 @@ require 'ocr-file/image_engines/image_magick' | |
| 10 10 | 
             
            require 'ocr-file/image_engines/pdftoppm'
         | 
| 11 11 | 
             
            require 'ocr-file/ocr_engines/tesseract'
         | 
| 12 12 | 
             
            require 'ocr-file/ocr_engines/cloud_vision'
         | 
| 13 | 
            +
            require 'ocr-file/text_engines/result_processor'
         | 
| 13 14 | 
             
            require 'ocr-file/file_helpers'
         | 
| 14 15 | 
             
            require 'ocr-file/document'
         | 
| 15 16 | 
             
            require 'ocr-file/cli'
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: ocr-file
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.0. | 
| 4 | 
            +
              version: 0.0.4
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - trex22
         | 
| 8 8 | 
             
            autorequire:
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2022-06- | 
| 11 | 
            +
            date: 2022-06-20 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: console-style
         | 
| @@ -122,6 +122,7 @@ files: | |
| 122 122 | 
             
            - lib/ocr-file/image_engines/pdftoppm.rb
         | 
| 123 123 | 
             
            - lib/ocr-file/ocr_engines/cloud_vision.rb
         | 
| 124 124 | 
             
            - lib/ocr-file/ocr_engines/tesseract.rb
         | 
| 125 | 
            +
            - lib/ocr-file/text_engines/result_processor.rb
         | 
| 125 126 | 
             
            - lib/ocr-file/version.rb
         | 
| 126 127 | 
             
            - ocr-file.gemspec
         | 
| 127 128 | 
             
            homepage: https://github.com/TRex22/ocr-file
         |