RubyGems - pdf_ocr - Versions diffs - 0.1.1 → 0.1.2 - Mend

pdf_ocr 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f1e04f45ee3dbb8cc9703ad628daa7e1ca20020b8b3dc3b4419706e881bffec3
-  data.tar.gz: ce753f931e9dc2391c61e0f83feda1ea4f9ae19e8248f9c64a66a14f62544037
+  metadata.gz: 874671c2167c8e17c21b59d7805434644565d8d1292bb9fbc7d57383080ace48
+  data.tar.gz: 439ee65fbadd68192b60c48a1688fb4b23026be373a9375ee8ad2017d12a2cd1
 SHA512:
-  metadata.gz: 8dd14930fc50eed3e0a4aca7bad66318cac441cb906c6b578d347689a644ccbf3a82169b17a04c533e01f7183533ef2cb779c2799c40e7ae8f5b7e57df81af38
-  data.tar.gz: 9496ebd284e7f4660a3c200fdc08a78f041c515dfd5548421ddca40eac3059f14169a6fc75e284cd6dc5ae34891b33bab7499697bbb40acfc73bb432ed32c633
+  metadata.gz: da2067e94fbe2765248887edd40dac4d1587fff186520373dd8d21c889ff28b883770627fa0d926b8ccabd12b76ddb146bef57a3054c5e914d08b7a1ecefd13a
+  data.tar.gz: 46d3e8b4391e02bb3333dd92d6e2f3ca92aac0a73eaaa47c54614f65309f94ff36234a0a02c4b04c267da1217e04f9f93e32b1680484c885cc719e26392c71dc

data/lib/ocr/data_extractor.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require "mini_magick"
 require "pdf/reader"
 require "rtesseract"
@@ -6,17 +8,49 @@ require "shellwords"
 require "tmpdir"
 module Ocr
+  ##
+  # DataExtractor handles PDF text extraction.
+  # It can parse regular PDFs or scanned PDFs using OCR.
+  #
+  # @example Extract text from a PDF
+  #   extractor = Ocr::DataExtractor.new("example.pdf")
+  #   result = extractor.call
+  #   if result["success"]
+  #     puts result["raw_text"]
+  #   else
+  #     puts result["message"]
+  #   end
+  #
   class DataExtractor
+    ##
+    # Initializes a new DataExtractor.
+    #
+    # @param document [String, File, IO] Path to a PDF file, File object, or IO object.
+    #
     def initialize(document)
       # Stored as given; nothing is opened or validated at construction time.
       @document = document
     end
+    ##
+    # Main method to extract text from the PDF.
+    #
+    # @return [Hash] Result hash containing:
+    #   - "success" [Boolean]
+    #   - "raw_text" [String] if extraction succeeded
+    #   - "message" [String] if extraction failed
+    #
     def call
       # Hands the document supplied to the constructor to the private ocr_data step
       # and returns whatever that step returns.
       ocr_data(@document)
     end
     private
+    ##
+    # Handles parsing the PDF and determining if OCR is needed.
+    #
+    # @param document [String, File, IO] The PDF document
+    # @return [Hash]
+    #
     def ocr_data(document)
       extracted_text = ""
       is_scanned = false
@@ -48,6 +82,13 @@ module Ocr
       scanned_pdf_ocr(file)
     end
+    ##
+    # Returns a File object from the given document
+    #
+    # @param document [String, File, IO]
+    # @return [File]
+    # @raise [ArgumentError] if the type is unsupported
+    #
     def get_file_from(document)
       return document.tap(&:open) if document.respond_to?(:open)
       return document if document.is_a?(File)
@@ -57,23 +98,47 @@ module Ocr
       raise ArgumentError, "Unsupported document type: #{document.class}"
     end
+    ##
+    # Safely extract text from a PDF page
+    #
+    # @param page [PDF::Reader::Page]
+    # @return [String]
+    #
     def safe_page_text(page)
       begin
         # to_s turns a nil page text into "" before transcoding.
         raw_text = page.text.to_s
         # Invalid or unmappable characters are dropped rather than raising.
         raw_text.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
       rescue StandardError
         # Best effort: any failure while reading the page yields an empty string.
         ""
       end
     end
+    ##
+    # Determine if a PDF is likely scanned
+    #
+    # @param text [String]
+    # @return [Boolean]
+    #
     def scanned_pdf?(text)
       # No extractable text at all (nil or empty) is treated as a scanned document.
       return true if text.nil? || text.empty?
       # Count characters that are neither ASCII alphanumerics nor whitespace.
       # The whitespace characters are listed explicitly: inside a double-quoted
       # Ruby string "\s" is only a single space, so a "^A-Za-z0-9\s" selector
       # would count every newline and tab as junk and flag ordinary multi-line
       # text as scanned.
       junk_ratio = text.count("^A-Za-z0-9 \t\n\r\f\v").to_f / text.size
       # More than half junk, or fewer than 100 characters overall, counts as scanned.
       junk_ratio > 0.5 || text.size < 100
     end
+    ##
+    # Check if the text is mostly non-alphabetic (fewer than 20% of its characters are ASCII letters)
+    #
+    # @param text [String]
+    # @return [Boolean]
+    #
     def mostly_junk?(text)
       if text.empty?
         # Nothing to inspect, so there are no letters at all.
         true
       else
         letters = text.scan(/[A-Za-z]/)
         # Junk when ASCII letters make up less than 20% of the characters.
         minimum_letters = text.size * 0.2
         letters.count < minimum_letters
       end
     end
+    ##
+    # Perform OCR on scanned PDFs
+    #
+    # @param file [File, String]
+    # @return [Hash]
+    #
     def scanned_pdf_ocr(file)
       images = []
       full_text = ""
@@ -94,12 +159,24 @@ module Ocr
       cleanup(images)
     end
+    ##
+    # Convert PDF to PNG images
+    #
+    # @param pdf_path [String]
+    # @return [Array<String>] List of image paths
+    #
     def convert_pdf_to_images(pdf_path)
       # Each call gets its own random output prefix inside the system temp directory.
       random_suffix = SecureRandom.hex(4)
       prefix = File.join(Dir.tmpdir, "ocr_page_#{random_suffix}")
       # Both paths are shell-escaped because the command is run as a single string.
       escaped_pdf = Shellwords.escape(pdf_path)
       escaped_prefix = Shellwords.escape(prefix)
       command = "pdftoppm -png -r 300 #{escaped_pdf} #{escaped_prefix}"
       # The exit status is not inspected; a failed run simply yields no images below.
       system(command)
       # Return every PNG whose name starts with "<prefix>-".
       Dir.glob("#{prefix}-*.png")
     end
+    ##
+    # Extract text from an image using Tesseract
+    #
+    # @param image_path [String]
+    # @return [String]
+    #
     def extract_text(image_path)
       RTesseract.new(image_path, lang: "eng", processor: "mini_magick").to_s
     rescue => e
@@ -107,10 +184,20 @@ module Ocr
       ""
     end
+    ##
+    # Cleanup temporary images
+    #
+    # @param images [Array<String>]
+    #
     def cleanup(images)
       # A nil list means there is nothing to remove.
       return if images.nil?
       images.each do |image_path|
         # Skip paths that are already gone so File.delete does not raise.
         next unless File.exist?(image_path)
         File.delete(image_path)
       end
     end
+    ##
+    # Log warnings to Rails logger or stderr
+    #
+    # @param message [String]
+    #
     def log_warning(message)
       if defined?(Rails)
         Rails.logger.warn(message)

data/lib/ocr/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Ocr
-  VERSION = "0.1.1"
+  VERSION = "0.1.2"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf_ocr
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Ravi Shankar Singhal
@@ -106,7 +106,7 @@ metadata:
   homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
   source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
   changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
-  documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.1
+  documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.2
 post_install_message:
 rdoc_options: []
 require_paths: