pdf_ocr 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 96003dcb66cc0427ddfa6241161b0b5c358b652f161bd2ff5fab5abca8e31236
4
- data.tar.gz: 431385258bee6950aa37ed494997f86003f75b215bb4ed0cd2480cfb116fae00
3
+ metadata.gz: 12660b4b9e443f82cbe79d38c1a9a27fca76fe910eb4cf0639027b5f1450a274
4
+ data.tar.gz: 9bad4ecc41836e964709bcb5c1abdebcee9baf58c46679b39c8afbceb7e673f6
5
5
  SHA512:
6
- metadata.gz: 7a69ad20675eebabb9db5f9c6ffc3a85a3adc1290bc42bdfbe0d45b2ece92d21b3618051a5193e447b7f3c36373952d8abe27bebd1e561807ad4663be4334aca
7
- data.tar.gz: 767e89448d6eea25a12a84f016d0baeb566c34af1157269a62b0af352cd3fbbd4a4adf40a731b8bd7b0a26bfe247c3d864eea843405731e13a9df431bf2a6d7c
6
+ metadata.gz: 82234b97c83ceb564b9693c91ebfd4d2ba5f15d860975c19b007066a57cf824824415968be8e73904bf69f7c015009ae49d12044ad1ad80c178d55123c5b2bd3
7
+ data.tar.gz: e361d9700efc051a09f868feeb70954a7f8937bcc094252f01770a76e2779da5db94f8c4411b703129a6e47d568e64bbb15bff79e9ee9c584b20d5138ab346e1
@@ -75,7 +75,7 @@ module Ocr
75
75
  if is_scanned || scanned_pdf?(extracted_text)
76
76
  scanned_pdf_ocr(file)
77
77
  else
78
- { "success" => true, "raw_text" => extracted_text.strip }
78
+ { "success" => true, "raw_text" => clean(extracted_text) }
79
79
  end
80
80
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
81
81
  log_warning "PDF parsing failed: #{e.message}"
@@ -151,7 +151,7 @@ module Ocr
151
151
  full_text += images.map { |img| extract_text(img) }.join(" ")
152
152
 
153
153
  unless full_text.strip.empty?
154
- { "success" => true, "raw_text" => full_text.strip }
154
+ { "success" => true, "raw_text" => clean(full_text) }
155
155
  else
156
156
  { "success" => false, "message" => "Unable to extract text using OCR" }
157
157
  end
@@ -205,5 +205,17 @@ module Ocr
205
205
  warn(message)
206
206
  end
207
207
  end
208
+
209
+ def clean(raw_text)
210
+ return "" if raw_text.empty?
211
+
212
+ raw_text
213
+ .gsub(/\n+/, " ")
214
+ .gsub(/\s+/, " ")
215
+ .gsub(/-\s+/, "")
216
+ .gsub(" . .", ".00")
217
+ .encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
218
+ .strip
219
+ end
208
220
  end
209
221
  end
data/lib/ocr/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ocr
4
- VERSION = "0.1.3"
4
+ VERSION = "0.1.4"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf_ocr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ravi Shankar Singhal
@@ -106,7 +106,7 @@ metadata:
106
106
  homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
107
107
  source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
108
108
  changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
109
- documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.3
109
+ documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.4
110
110
  post_install_message:
111
111
  rdoc_options: []
112
112
  require_paths: