pdf_ocr 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 874671c2167c8e17c21b59d7805434644565d8d1292bb9fbc7d57383080ace48
4
- data.tar.gz: 439ee65fbadd68192b60c48a1688fb4b23026be373a9375ee8ad2017d12a2cd1
3
+ metadata.gz: 12660b4b9e443f82cbe79d38c1a9a27fca76fe910eb4cf0639027b5f1450a274
4
+ data.tar.gz: 9bad4ecc41836e964709bcb5c1abdebcee9baf58c46679b39c8afbceb7e673f6
5
5
  SHA512:
6
- metadata.gz: da2067e94fbe2765248887edd40dac4d1587fff186520373dd8d21c889ff28b883770627fa0d926b8ccabd12b76ddb146bef57a3054c5e914d08b7a1ecefd13a
7
- data.tar.gz: 46d3e8b4391e02bb3333dd92d6e2f3ca92aac0a73eaaa47c54614f65309f94ff36234a0a02c4b04c267da1217e04f9f93e32b1680484c885cc719e26392c71dc
6
+ metadata.gz: 82234b97c83ceb564b9693c91ebfd4d2ba5f15d860975c19b007066a57cf824824415968be8e73904bf69f7c015009ae49d12044ad1ad80c178d55123c5b2bd3
7
+ data.tar.gz: e361d9700efc051a09f868feeb70954a7f8937bcc094252f01770a76e2779da5db94f8c4411b703129a6e47d568e64bbb15bff79e9ee9c584b20d5138ab346e1
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- pdf_ocr (0.1.0)
4
+ pdf_ocr (0.1.3)
5
5
  mini_magick
6
6
  pdf-reader
7
7
  rtesseract
data/README.md CHANGED
@@ -46,7 +46,7 @@ gem install pdf_ocr
46
46
 
47
47
  ## ⚙️ Usage
48
48
  ```ruby
49
- require 'pdf_ocr'
49
+ require 'ocr'
50
50
  require 'stringio'
51
51
 
52
52
  # From a File object
@@ -78,22 +78,30 @@ result = Ocr::DataExtractor.new(pdf_data).call
78
78
  ```
79
79
  ## 🔧 Notes
80
80
  1. Ensure Tesseract OCR is installed on your system:
81
- ```
82
- # Ubuntu/Debian
83
- sudo apt install tesseract-ocr
84
-
85
- # MacOS (with Homebrew)
86
- brew install tesseract
87
- ```
81
+ ```
82
+ # Ubuntu/Debian
83
+ sudo apt install tesseract-ocr
84
+
85
+ # MacOS (with Homebrew)
86
+ brew install tesseract
87
+ ```
88
88
  2. Ensure pdftoppm is installed (for PDF-to-image conversion):
89
- ```
90
- # Ubuntu/Debian
91
- sudo apt install poppler-utils
92
-
93
- # MacOS (with Homebrew)
94
- brew install poppler
95
- ```
96
- 3. This gem does not require Rails, but it will work with Rails ActiveStorage objects that respond to .open.
89
+ ```
90
+ # Ubuntu/Debian
91
+ sudo apt install poppler-utils
92
+
93
+ # MacOS (with Homebrew)
94
+ brew install poppler
95
+ ```
96
+ 3. Ensure ImageMagick is installed (for image processing):
97
+ ```
98
+ # Ubuntu/Debian
99
+ sudo apt install imagemagick
100
+
101
+ # MacOS (with Homebrew)
102
+ brew install imagemagick
103
+ ```
104
+ 4. This gem does not require Rails, but it will work with Rails ActiveStorage objects that respond to .open.
97
105
 
98
106
  ## 🧪 Running Tests
99
107
  ```
@@ -52,7 +52,7 @@ module Ocr
52
52
  # @return [Hash]
53
53
  #
54
54
  def ocr_data(document)
55
- extracted_text = ""
55
+ extracted_text = String.new
56
56
  is_scanned = false
57
57
 
58
58
  file = get_file_from(document)
@@ -75,7 +75,7 @@ module Ocr
75
75
  if is_scanned || scanned_pdf?(extracted_text)
76
76
  scanned_pdf_ocr(file)
77
77
  else
78
- { "success" => true, "raw_text" => extracted_text.strip }
78
+ { "success" => true, "raw_text" => clean(extracted_text) }
79
79
  end
80
80
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
81
81
  log_warning "PDF parsing failed: #{e.message}"
@@ -151,7 +151,7 @@ module Ocr
151
151
  full_text += images.map { |img| extract_text(img) }.join(" ")
152
152
 
153
153
  unless full_text.strip.empty?
154
- { "success" => true, "raw_text" => full_text.strip }
154
+ { "success" => true, "raw_text" => clean(full_text) }
155
155
  else
156
156
  { "success" => false, "message" => "Unable to extract text using OCR" }
157
157
  end
@@ -205,5 +205,17 @@ module Ocr
205
205
  warn(message)
206
206
  end
207
207
  end
208
+
209
+ def clean(raw_text)
210
+ return "" if raw_text.empty?
211
+
212
+ raw_text
213
+ .gsub(/\n+/, " ")
214
+ .gsub(/\s+/, " ")
215
+ .gsub(/-\s+/, "")
216
+ .gsub(" . .", ".00")
217
+ .encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
218
+ .strip
219
+ end
208
220
  end
209
221
  end
data/lib/ocr/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ocr
4
- VERSION = "0.1.2"
4
+ VERSION = "0.1.4"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf_ocr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ravi Shankar Singhal
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-10-24 00:00:00.000000000 Z
11
+ date: 2025-11-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pdf-reader
@@ -106,7 +106,7 @@ metadata:
106
106
  homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
107
107
  source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
108
108
  changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
109
- documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.2
109
+ documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.4
110
110
  post_install_message:
111
111
  rdoc_options: []
112
112
  require_paths: