RubyGems - pdf_ocr - Versions diffs - 0.1.0 → 0.1.2 - Mend

pdf_ocr 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 32727eeb24656d1fce7cb43f2f5192f29cfda53192ef161cfae047f2871f6bff
-  data.tar.gz: 558586ded2489faf79ce7f36ee1ab6df267d9dc30d67e6ba554be61bde959e19
+  metadata.gz: 874671c2167c8e17c21b59d7805434644565d8d1292bb9fbc7d57383080ace48
+  data.tar.gz: 439ee65fbadd68192b60c48a1688fb4b23026be373a9375ee8ad2017d12a2cd1
 SHA512:
-  metadata.gz: c02b99bb1e652fe8c26ad80ed8dc4652c8eab5cc9a8bb4699b656080066772f811ee66ced0faa584f9a526322620c9e628f3a47c194f54b900706f968274c4dc
-  data.tar.gz: 9d7fea0ffe63fb2c10825d906831fb70dce2f1ab3d3d0c02c814dbd499c81fa906f23bc27e794d5ed3670b381c86b9da1b6f1abc091392684f5c17f97be000b4
+  metadata.gz: da2067e94fbe2765248887edd40dac4d1587fff186520373dd8d21c889ff28b883770627fa0d926b8ccabd12b76ddb146bef57a3054c5e914d08b7a1ecefd13a
+  data.tar.gz: 46d3e8b4391e02bb3333dd92d6e2f3ca92aac0a73eaaa47c54614f65309f94ff36234a0a02c4b04c267da1217e04f9f93e32b1680484c885cc719e26392c71dc

data/README.md CHANGED Viewed

@@ -1,4 +1,4 @@
-# OCR
+# PDF OCR
 A lightweight Ruby gem for extracting text from PDFs, including scanned PDFs using OCR.
@@ -25,12 +25,12 @@ This gem supports:
 Add this line to your application's Gemfile:
 ```ruby
-gem 'ocr', git: 'https://github.com/your_username/ocr.git'
+gem 'pdf_ocr'
 ```
 Or install directly:
 ```bash
-gem install ocr
+gem install pdf_ocr
 ```
 ## Dependencies
@@ -46,7 +46,7 @@ gem install ocr
 ## ⚙️ Usage
 ```ruby
-require 'ocr'
+require 'pdf_ocr'
 require 'stringio'
 # From a File object
@@ -119,6 +119,15 @@ bundle exec rspec
 - Open a Pull Request
+## 🧑‍💼 Author
+```
+Ravi Shankar Singhal
+Senior Backend Developer — Ruby on Rails
+📧 ravi.singhal2308@gmail.com
+🌐 https://github.com/RaviShankarSinghal
+```
 ## 📝 License
 MIT License © RaviShankarSinghal
@@ -134,5 +143,5 @@ This version includes:
 - System dependencies
 - Test instructions
 - Contributing guidelines
----
+- The gem is available as open source under the terms of the MIT License.
+---

data/lib/ocr/data_extractor.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require "mini_magick"
 require "pdf/reader"
 require "rtesseract"
@@ -6,17 +8,49 @@ require "shellwords"
 require "tmpdir"
 module Ocr
+  ##
+  # DataExtractor handles PDF text extraction.
+  # It can parse regular PDFs or scanned PDFs using OCR.
+  #
+  # @example Extract text from a PDF
+  #   extractor = Ocr::DataExtractor.new("example.pdf")
+  #   result = extractor.call
+  #   if result["success"]
+  #     puts result["raw_text"]
+  #   else
+  #     puts result["message"]
+  #   end
+  #
   class DataExtractor
+    ##
+    # Initializes a new DataExtractor.
+    #
+    # @param document [String, File, IO] Path to a PDF file, File object, or IO object.
+    #
     def initialize(document)
       # Only stores the reference; nothing is opened, read, or validated here.
       @document = document
     end
+    ##
+    # Main method to extract text from the PDF.
+    #
+    # @return [Hash] Result hash containing:
+    #   - "success" [Boolean]
+    #   - "raw_text" [String] if extraction succeeded
+    #   - "message" [String] if extraction failed
+    #
     def call
       # Hands the stored document to the private #ocr_data method and returns its result.
       ocr_data(@document)
     end
     private
+    ##
+    # Handles parsing the PDF and determining if OCR is needed.
+    #
+    # @param document [String, File, IO] The PDF document
+    # @return [Hash]
+    #
     def ocr_data(document)
       extracted_text = ""
       is_scanned = false
@@ -48,6 +82,13 @@ module Ocr
       scanned_pdf_ocr(file)
     end
+    ##
+    # Returns a File object from the given document
+    #
+    # @param document [String, File, IO]
+    # @return [File]
+    # @raise [ArgumentError] if the type is unsupported
+    #
     def get_file_from(document)
       return document.tap(&:open) if document.respond_to?(:open)
       return document if document.is_a?(File)
@@ -57,23 +98,47 @@ module Ocr
       raise ArgumentError, "Unsupported document type: #{document.class}"
     end
+    ##
+    # Safely extract text from a PDF page
+    #
+    # @param page [PDF::Reader::Page]
+    # @return [String]
+    #
     def safe_page_text(page)
       begin
         # Coerce to String first so a nil page text becomes "" instead of raising.
         raw_text = page.text.to_s
         # Invalid or unmappable characters are dropped (replaced with nothing).
         raw_text.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
       rescue StandardError
         # Best effort: any failure while reading the page yields empty text.
         ""
       end
     end
+    ##
+    # Determine if a PDF is likely scanned
+    #
+    # @param text [String]
+    # @return [Boolean]
+    #
     def scanned_pdf?(text)
       # No extractable text at all: treat the document as scanned.
       return true if text.empty?
       # "Junk" is every character that is not an ASCII letter, digit, or
       # whitespace. The whitespace characters are listed explicitly because
       # "\s" inside a double-quoted string is only a literal space, which made
       # newlines and tabs count as junk and inflated the ratio.
       junk_ratio = text.count("^A-Za-z0-9 \t\r\n\f\v").to_f / text.size
       # More than half junk, or fewer than 100 characters overall, counts as scanned.
       junk_ratio > 0.5 || text.size < 100
     end
+    ##
+    # Check if the page is mostly non-text content
+    #
+    # @param text [String]
+    # @return [Boolean]
+    #
     def mostly_junk?(text)
       if text.empty?
         # Nothing to inspect: report the text as junk.
         true
       else
         # Junk when ASCII letters make up less than 20% of all characters.
         letter_total = text.scan(/[A-Za-z]/).length
         letter_total < (text.length * 0.2)
       end
     end
+    ##
+    # Perform OCR on scanned PDFs
+    #
+    # @param file [File, String]
+    # @return [Hash]
+    #
     def scanned_pdf_ocr(file)
       images = []
       full_text = ""
@@ -94,12 +159,24 @@ module Ocr
       cleanup(images)
     end
+    ##
+    # Convert PDF to PNG images
+    #
+    # @param pdf_path [String]
+    # @return [Array<String>] List of image paths
+    #
     def convert_pdf_to_images(pdf_path)
       # A random hex suffix gives each conversion its own file prefix in the temp dir.
       output_prefix = File.join(Dir.tmpdir, "ocr_page_#{SecureRandom.hex(4)}")
       # Argument-list form of system runs pdftoppm directly instead of through a
       # shell, so the paths need no quoting. to_s keeps accepting the non-String
       # values (e.g. Pathname) that Shellwords.escape used to accept.
       system("pdftoppm", "-png", "-r", "300", pdf_path.to_s, output_prefix)
       # Sort so the page order does not depend on the glob ordering of the running
       # Ruby version (assumes pdftoppm zero-pads page numbers — TODO confirm).
       Dir["#{output_prefix}-*.png"].sort
     end
+    ##
+    # Extract text from an image using Tesseract
+    #
+    # @param image_path [String]
+    # @return [String]
+    #
     def extract_text(image_path)
       RTesseract.new(image_path, lang: "eng", processor: "mini_magick").to_s
     rescue => e
@@ -107,10 +184,20 @@ module Ocr
       ""
     end
+    ##
+    # Cleanup temporary images
+    #
+    # @param images [Array<String>]
+    #
     def cleanup(images)
       # Nothing to remove when no image list was produced.
       return if images.nil?
       images.each do |image_path|
         # Skip paths that are already gone so File.delete does not raise.
         next unless File.exist?(image_path)
         File.delete(image_path)
       end
     end
+    ##
+    # Log warnings to Rails logger or stderr
+    #
+    # @param message [String]
+    #
     def log_warning(message)
       if defined?(Rails)
         Rails.logger.warn(message)

data/lib/ocr/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Ocr
-  VERSION = "0.1.0"
+  VERSION = "0.1.2"
 end

data/ocr.gemspec CHANGED Viewed

@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
     "homepage_uri"   => spec.homepage,
     "source_code_uri" => "https://github.com/RaviShankarSinghal/ocr_gem",
     "changelog_uri"   => "https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md",
-    "documentation_uri" => "https://rubydoc.info/gems/ocr"
+    "documentation_uri" => "https://rubydoc.info/gems/pdf_ocr/#{spec.version}"
   }
   spec.files = Dir.chdir(__dir__) do

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf_ocr
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - Ravi Shankar Singhal
@@ -106,7 +106,7 @@ metadata:
   homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
   source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
   changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
-  documentation_uri: https://rubydoc.info/gems/ocr
+  documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.2
 post_install_message:
 rdoc_options: []
 require_paths: