pdf_ocr 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ocr/data_extractor.rb +87 -0
- data/lib/ocr/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 874671c2167c8e17c21b59d7805434644565d8d1292bb9fbc7d57383080ace48
|
|
4
|
+
data.tar.gz: 439ee65fbadd68192b60c48a1688fb4b23026be373a9375ee8ad2017d12a2cd1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: da2067e94fbe2765248887edd40dac4d1587fff186520373dd8d21c889ff28b883770627fa0d926b8ccabd12b76ddb146bef57a3054c5e914d08b7a1ecefd13a
|
|
7
|
+
data.tar.gz: 46d3e8b4391e02bb3333dd92d6e2f3ca92aac0a73eaaa47c54614f65309f94ff36234a0a02c4b04c267da1217e04f9f93e32b1680484c885cc719e26392c71dc
|
data/lib/ocr/data_extractor.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require "mini_magick"
|
|
2
4
|
require "pdf/reader"
|
|
3
5
|
require "rtesseract"
|
|
@@ -6,17 +8,49 @@ require "shellwords"
|
|
|
6
8
|
require "tmpdir"
|
|
7
9
|
|
|
8
10
|
module Ocr
|
|
11
|
+
##
|
|
12
|
+
# DataExtractor handles PDF text extraction.
|
|
13
|
+
# It can parse regular PDFs or scanned PDFs using OCR.
|
|
14
|
+
#
|
|
15
|
+
# @example Extract text from a PDF
|
|
16
|
+
# extractor = Ocr::DataExtractor.new("example.pdf")
|
|
17
|
+
# result = extractor.call
|
|
18
|
+
# if result["success"]
|
|
19
|
+
# puts result["raw_text"]
|
|
20
|
+
# else
|
|
21
|
+
# puts result["message"]
|
|
22
|
+
# end
|
|
23
|
+
#
|
|
9
24
|
class DataExtractor
|
|
25
|
+
##
|
|
26
|
+
# Initializes a new DataExtractor.
|
|
27
|
+
#
|
|
28
|
+
# @param document [String, File, IO] Path to a PDF file, File object, or IO object.
|
|
29
|
+
#
|
|
10
30
|
def initialize(document)
|
|
11
31
|
@document = document
|
|
12
32
|
end
|
|
13
33
|
|
|
34
|
+
##
|
|
35
|
+
# Main method to extract text from the PDF.
|
|
36
|
+
#
|
|
37
|
+
# @return [Hash] Result hash containing:
|
|
38
|
+
# - "success" [Boolean]
|
|
39
|
+
# - "raw_text" [String] if extraction succeeded
|
|
40
|
+
# - "message" [String] if extraction failed
|
|
41
|
+
#
|
|
14
42
|
def call
|
|
15
43
|
ocr_data(@document)
|
|
16
44
|
end
|
|
17
45
|
|
|
18
46
|
private
|
|
19
47
|
|
|
48
|
+
##
|
|
49
|
+
# Handles parsing the PDF and determining if OCR is needed.
|
|
50
|
+
#
|
|
51
|
+
# @param document [String, File, IO] The PDF document
|
|
52
|
+
# @return [Hash]
|
|
53
|
+
#
|
|
20
54
|
def ocr_data(document)
|
|
21
55
|
extracted_text = ""
|
|
22
56
|
is_scanned = false
|
|
@@ -48,6 +82,13 @@ module Ocr
|
|
|
48
82
|
scanned_pdf_ocr(file)
|
|
49
83
|
end
|
|
50
84
|
|
|
85
|
+
##
|
|
86
|
+
# Returns a File object from the given document
|
|
87
|
+
#
|
|
88
|
+
# @param document [String, File, IO]
|
|
89
|
+
# @return [File]
|
|
90
|
+
# @raise [ArgumentError] if the type is unsupported
|
|
91
|
+
#
|
|
51
92
|
def get_file_from(document)
|
|
52
93
|
return document.tap(&:open) if document.respond_to?(:open)
|
|
53
94
|
return document if document.is_a?(File)
|
|
@@ -57,23 +98,47 @@ module Ocr
|
|
|
57
98
|
raise ArgumentError, "Unsupported document type: #{document.class}"
|
|
58
99
|
end
|
|
59
100
|
|
|
101
|
+
##
|
|
102
|
+
# Safely extract text from a PDF page
|
|
103
|
+
#
|
|
104
|
+
# @param page [PDF::Reader::Page]
|
|
105
|
+
# @return [String]
|
|
106
|
+
#
|
|
60
107
|
def safe_page_text(page)
|
|
61
108
|
page.text.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
|
|
62
109
|
rescue
|
|
63
110
|
""
|
|
64
111
|
end
|
|
65
112
|
|
|
113
|
+
##
|
|
114
|
+
# Determine if a PDF is likely scanned
|
|
115
|
+
#
|
|
116
|
+
# @param text [String]
|
|
117
|
+
# @return [Boolean]
|
|
118
|
+
#
|
|
66
119
|
def scanned_pdf?(text)
|
|
67
120
|
return true if text.empty?
|
|
68
121
|
junk_ratio = text.count("^A-Za-z0-9\s").to_f / text.size
|
|
69
122
|
junk_ratio > 0.5 || text.size < 100
|
|
70
123
|
end
|
|
71
124
|
|
|
125
|
+
##
|
|
126
|
+
# Check if the page is mostly non-text content
|
|
127
|
+
#
|
|
128
|
+
# @param text [String]
|
|
129
|
+
# @return [Boolean]
|
|
130
|
+
#
|
|
72
131
|
def mostly_junk?(text)
|
|
73
132
|
return true if text.empty?
|
|
74
133
|
text.scan(/[A-Za-z]/).count < (text.size * 0.2)
|
|
75
134
|
end
|
|
76
135
|
|
|
136
|
+
##
|
|
137
|
+
# Perform OCR on scanned PDFs
|
|
138
|
+
#
|
|
139
|
+
# @param file [File, String]
|
|
140
|
+
# @return [Hash]
|
|
141
|
+
#
|
|
77
142
|
def scanned_pdf_ocr(file)
|
|
78
143
|
images = []
|
|
79
144
|
full_text = ""
|
|
@@ -94,12 +159,24 @@ module Ocr
|
|
|
94
159
|
cleanup(images)
|
|
95
160
|
end
|
|
96
161
|
|
|
162
|
+
##
|
|
163
|
+
# Convert PDF to PNG images
|
|
164
|
+
#
|
|
165
|
+
# @param pdf_path [String]
|
|
166
|
+
# @return [Array<String>] List of image paths
|
|
167
|
+
#
|
|
97
168
|
def convert_pdf_to_images(pdf_path)
|
|
98
169
|
output_prefix = File.join(Dir.tmpdir, "ocr_page_#{SecureRandom.hex(4)}")
|
|
99
170
|
system("pdftoppm -png -r 300 #{Shellwords.escape(pdf_path)} #{Shellwords.escape(output_prefix)}")
|
|
100
171
|
Dir["#{output_prefix}-*.png"]
|
|
101
172
|
end
|
|
102
173
|
|
|
174
|
+
##
|
|
175
|
+
# Extract text from an image using Tesseract
|
|
176
|
+
#
|
|
177
|
+
# @param image_path [String]
|
|
178
|
+
# @return [String]
|
|
179
|
+
#
|
|
103
180
|
def extract_text(image_path)
|
|
104
181
|
RTesseract.new(image_path, lang: "eng", processor: "mini_magick").to_s
|
|
105
182
|
rescue => e
|
|
@@ -107,10 +184,20 @@ module Ocr
|
|
|
107
184
|
""
|
|
108
185
|
end
|
|
109
186
|
|
|
187
|
+
##
|
|
188
|
+
# Cleanup temporary images
|
|
189
|
+
#
|
|
190
|
+
# @param images [Array<String>]
|
|
191
|
+
#
|
|
110
192
|
def cleanup(images)
|
|
111
193
|
images&.each { |img| File.delete(img) if File.exist?(img) }
|
|
112
194
|
end
|
|
113
195
|
|
|
196
|
+
##
|
|
197
|
+
# Log warnings to Rails logger or stderr
|
|
198
|
+
#
|
|
199
|
+
# @param message [String]
|
|
200
|
+
#
|
|
114
201
|
def log_warning(message)
|
|
115
202
|
if defined?(Rails)
|
|
116
203
|
Rails.logger.warn(message)
|
data/lib/ocr/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pdf_ocr
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.1
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ravi Shankar Singhal
|
|
@@ -106,7 +106,7 @@ metadata:
|
|
|
106
106
|
homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
|
|
107
107
|
source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
|
|
108
108
|
changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
|
|
109
|
-
documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.1
|
|
109
|
+
documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.2
|
|
110
110
|
post_install_message:
|
|
111
111
|
rdoc_options: []
|
|
112
112
|
require_paths:
|