RubyGems - pdf_ocr - Versions diffs - 0.1.2 → 0.1.4 - Mend

pdf_ocr 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 874671c2167c8e17c21b59d7805434644565d8d1292bb9fbc7d57383080ace48
-  data.tar.gz: 439ee65fbadd68192b60c48a1688fb4b23026be373a9375ee8ad2017d12a2cd1
+  metadata.gz: 12660b4b9e443f82cbe79d38c1a9a27fca76fe910eb4cf0639027b5f1450a274
+  data.tar.gz: 9bad4ecc41836e964709bcb5c1abdebcee9baf58c46679b39c8afbceb7e673f6
 SHA512:
-  metadata.gz: da2067e94fbe2765248887edd40dac4d1587fff186520373dd8d21c889ff28b883770627fa0d926b8ccabd12b76ddb146bef57a3054c5e914d08b7a1ecefd13a
-  data.tar.gz: 46d3e8b4391e02bb3333dd92d6e2f3ca92aac0a73eaaa47c54614f65309f94ff36234a0a02c4b04c267da1217e04f9f93e32b1680484c885cc719e26392c71dc
+  metadata.gz: 82234b97c83ceb564b9693c91ebfd4d2ba5f15d860975c19b007066a57cf824824415968be8e73904bf69f7c015009ae49d12044ad1ad80c178d55123c5b2bd3
+  data.tar.gz: e361d9700efc051a09f868feeb70954a7f8937bcc094252f01770a76e2779da5db94f8c4411b703129a6e47d568e64bbb15bff79e9ee9c584b20d5138ab346e1

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    pdf_ocr (0.1.0)
+    pdf_ocr (0.1.3)
       mini_magick
       pdf-reader
       rtesseract

data/README.md CHANGED Viewed

@@ -46,7 +46,7 @@ gem install pdf_ocr
 ## ⚙️ Usage
 ```ruby
-require 'pdf_ocr'
+require 'ocr'
 require 'stringio'
 # From a File object
@@ -78,22 +78,30 @@ result = Ocr::DataExtractor.new(pdf_data).call
 ```
 ## 🔧 Notes
 1. Ensure Tesseract OCR is installed on your system:
-```
-# Ubuntu/Debian
-sudo apt install tesseract-ocr
-# MacOS (with Homebrew)
-brew install tesseract
-```
+   ```
+    # Ubuntu/Debian
+    sudo apt install tesseract-ocr
+    # MacOS (with Homebrew)
+    brew install tesseract
+   ```
 2. Ensure pdftoppm is installed (for PDF-to-image conversion):
-```
-# Ubuntu/Debian
-sudo apt install poppler-utils
-# MacOS (with Homebrew)
-brew install poppler
-```
-3. This gem does not require Rails, but it will work with Rails ActiveStorage objects that respond to .open.
+   ```
+    # Ubuntu/Debian
+    sudo apt install poppler-utils
+    # MacOS (with Homebrew)
+    brew install poppler
+   ```
+3. Ensure ImageMagick is installed (for images):
+   ```
+    # Ubuntu/Debian
+    sudo apt install imagemagick
+    # MacOS (with Homebrew)
+    brew install imagemagick
+   ```
+4. This gem does not require Rails, but it will work with Rails ActiveStorage objects that respond to .open.
 ## 🧪 Running Tests
 ```

data/lib/ocr/data_extractor.rb CHANGED Viewed

@@ -52,7 +52,7 @@ module Ocr
     # @return [Hash]
     #
     def ocr_data(document)
-      extracted_text = ""
+      extracted_text = String.new
       is_scanned = false
       file = get_file_from(document)
@@ -75,7 +75,7 @@ module Ocr
       if is_scanned || scanned_pdf?(extracted_text)
         scanned_pdf_ocr(file)
       else
-        { "success" => true, "raw_text" => extracted_text.strip }
+        { "success" => true, "raw_text" => clean(extracted_text) }
       end
     rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
       log_warning "PDF parsing failed: #{e.message}"
@@ -151,7 +151,7 @@ module Ocr
       full_text += images.map { |img| extract_text(img) }.join(" ")
       unless full_text.strip.empty?
-        { "success" => true, "raw_text" => full_text.strip }
+        { "success" => true, "raw_text" => clean(full_text) }
       else
         { "success" => false, "message" => "Unable to extract text using OCR" }
       end
@@ -205,5 +205,17 @@ module Ocr
         warn(message)
       end
     end
+    def clean(raw_text)
+      return "" if raw_text.empty?
+      raw_text
+        .gsub(/\n+/, " ")
+        .gsub(/\s+/, " ")
+        .gsub(/-\s+/, "")
+        .gsub(" . .", ".00")
+        .encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
+        .strip
+    end
   end
 end

data/lib/ocr/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Ocr
-  VERSION = "0.1.2"
+  VERSION = "0.1.4"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pdf_ocr
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.4
 platform: ruby
 authors:
 - Ravi Shankar Singhal
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-10-24 00:00:00.000000000 Z
+date: 2025-11-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pdf-reader
@@ -106,7 +106,7 @@ metadata:
   homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
   source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
   changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
-  documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.2
+  documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.4
 post_install_message:
 rdoc_options: []
 require_paths: