pdf_ocr 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ocr/data_extractor.rb +14 -2
- data/lib/ocr/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 12660b4b9e443f82cbe79d38c1a9a27fca76fe910eb4cf0639027b5f1450a274
|
|
4
|
+
data.tar.gz: 9bad4ecc41836e964709bcb5c1abdebcee9baf58c46679b39c8afbceb7e673f6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 82234b97c83ceb564b9693c91ebfd4d2ba5f15d860975c19b007066a57cf824824415968be8e73904bf69f7c015009ae49d12044ad1ad80c178d55123c5b2bd3
|
|
7
|
+
data.tar.gz: e361d9700efc051a09f868feeb70954a7f8937bcc094252f01770a76e2779da5db94f8c4411b703129a6e47d568e64bbb15bff79e9ee9c584b20d5138ab346e1
|
data/lib/ocr/data_extractor.rb
CHANGED
|
@@ -75,7 +75,7 @@ module Ocr
|
|
|
75
75
|
if is_scanned || scanned_pdf?(extracted_text)
|
|
76
76
|
scanned_pdf_ocr(file)
|
|
77
77
|
else
|
|
78
|
-
{ "success" => true, "raw_text" => extracted_text
|
|
78
|
+
{ "success" => true, "raw_text" => clean(extracted_text) }
|
|
79
79
|
end
|
|
80
80
|
rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
|
|
81
81
|
log_warning "PDF parsing failed: #{e.message}"
|
|
@@ -151,7 +151,7 @@ module Ocr
|
|
|
151
151
|
full_text += images.map { |img| extract_text(img) }.join(" ")
|
|
152
152
|
|
|
153
153
|
unless full_text.strip.empty?
|
|
154
|
-
{ "success" => true, "raw_text" => full_text
|
|
154
|
+
{ "success" => true, "raw_text" => clean(full_text) }
|
|
155
155
|
else
|
|
156
156
|
{ "success" => false, "message" => "Unable to extract text using OCR" }
|
|
157
157
|
end
|
|
@@ -205,5 +205,17 @@ module Ocr
|
|
|
205
205
|
warn(message)
|
|
206
206
|
end
|
|
207
207
|
end
|
|
208
|
+
|
|
209
|
+
# Normalizes raw text extracted from a PDF into a single, whitespace-collapsed
# line.
#
# Processing order:
#   1. Re-encode to UTF-8, dropping invalid/undefined bytes. This must happen
#      first: running a regex against a string that contains invalid byte
#      sequences raises ArgumentError, so scrubbing after the gsub calls
#      would never be reached for such input.
#   2. Collapse every run of whitespace (newlines included) to one space.
#   3. Remove a hyphen followed by whitespace, re-joining words that were
#      hyphenated across a line break ("exam-\nple" -> "example").
#   4. Replace the literal " . ." with ".00".
#      NOTE(review): presumably repairs amounts whose decimals were mangled
#      during extraction -- confirm against real documents.
#   5. Strip leading/trailing whitespace.
#
# @param raw_text [String, nil] text to normalize
# @return [String] cleaned text; "" when raw_text is nil or empty
def clean(raw_text)
  return "" if raw_text.nil? || raw_text.empty?

  raw_text
    .encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
    .gsub(/\s+/, " ") # \s already matches "\n", so no separate newline pass
    .gsub(/-\s+/, "")
    .gsub(" . .", ".00")
    .strip
end
|
|
208
220
|
end
|
|
209
221
|
end
|
data/lib/ocr/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pdf_ocr
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.3
|
|
4
|
+
version: 0.1.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ravi Shankar Singhal
|
|
@@ -106,7 +106,7 @@ metadata:
|
|
|
106
106
|
homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
|
|
107
107
|
source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
|
|
108
108
|
changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
|
|
109
|
-
documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.3
|
|
109
|
+
documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.4
|
|
110
110
|
post_install_message:
|
|
111
111
|
rdoc_options: []
|
|
112
112
|
require_paths:
|