pdf_ocr 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -6
- data/lib/ocr/data_extractor.rb +87 -0
- data/lib/ocr/version.rb +1 -1
- data/ocr.gemspec +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 874671c2167c8e17c21b59d7805434644565d8d1292bb9fbc7d57383080ace48
|
|
4
|
+
data.tar.gz: 439ee65fbadd68192b60c48a1688fb4b23026be373a9375ee8ad2017d12a2cd1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: da2067e94fbe2765248887edd40dac4d1587fff186520373dd8d21c889ff28b883770627fa0d926b8ccabd12b76ddb146bef57a3054c5e914d08b7a1ecefd13a
|
|
7
|
+
data.tar.gz: 46d3e8b4391e02bb3333dd92d6e2f3ca92aac0a73eaaa47c54614f65309f94ff36234a0a02c4b04c267da1217e04f9f93e32b1680484c885cc719e26392c71dc
|
data/README.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# OCR
|
|
1
|
+
# PDF OCR
|
|
2
2
|
|
|
3
3
|
A lightweight Ruby gem for extracting text from PDFs, including scanned PDFs using OCR.
|
|
4
4
|
|
|
@@ -25,12 +25,12 @@ This gem supports:
|
|
|
25
25
|
Add this line to your application's Gemfile:
|
|
26
26
|
|
|
27
27
|
```ruby
|
|
28
|
-
gem 'ocr'
|
|
28
|
+
gem 'pdf_ocr'
|
|
29
29
|
```
|
|
30
30
|
|
|
31
31
|
Or install directly:
|
|
32
32
|
```ruby
|
|
33
|
-
gem install ocr
|
|
33
|
+
gem install pdf_ocr
|
|
34
34
|
```
|
|
35
35
|
|
|
36
36
|
## Dependencies
|
|
@@ -46,7 +46,7 @@ gem install ocr
|
|
|
46
46
|
|
|
47
47
|
## ⚙️ Usage
|
|
48
48
|
```ruby
|
|
49
|
-
require 'ocr'
|
|
49
|
+
require 'pdf_ocr'
|
|
50
50
|
require 'stringio'
|
|
51
51
|
|
|
52
52
|
# From a File object
|
|
@@ -119,6 +119,15 @@ bundle exec rspec
|
|
|
119
119
|
|
|
120
120
|
- Open a Pull Request
|
|
121
121
|
|
|
122
|
+
## 🧑‍💼 Author
|
|
123
|
+
```
|
|
124
|
+
Ravi Shankar Singhal
|
|
125
|
+
Senior Backend Developer — Ruby on Rails
|
|
126
|
+
📧 ravi.singhal2308@gmail.com
|
|
127
|
+
|
|
128
|
+
🌐 https://github.com/RaviShankarSinghal
|
|
129
|
+
```
|
|
130
|
+
|
|
122
131
|
## 📝 License
|
|
123
132
|
|
|
124
133
|
MIT License © RaviShankarSinghal
|
|
@@ -134,5 +143,5 @@ This version includes:
|
|
|
134
143
|
- System dependencies
|
|
135
144
|
- Test instructions
|
|
136
145
|
- Contributing guidelines
|
|
137
|
-
|
|
138
|
-
---
|
|
146
|
+
- The gem is available as open source under the terms of the MIT License.
|
|
147
|
+
---
|
data/lib/ocr/data_extractor.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require "mini_magick"
|
|
2
4
|
require "pdf/reader"
|
|
3
5
|
require "rtesseract"
|
|
@@ -6,17 +8,49 @@ require "shellwords"
|
|
|
6
8
|
require "tmpdir"
|
|
7
9
|
|
|
8
10
|
module Ocr
|
|
11
|
+
##
|
|
12
|
+
# DataExtractor handles PDF text extraction.
|
|
13
|
+
# It can parse regular PDFs or scanned PDFs using OCR.
|
|
14
|
+
#
|
|
15
|
+
# @example Extract text from a PDF
|
|
16
|
+
# extractor = Ocr::DataExtractor.new("example.pdf")
|
|
17
|
+
# result = extractor.call
|
|
18
|
+
# if result["success"]
|
|
19
|
+
# puts result["raw_text"]
|
|
20
|
+
# else
|
|
21
|
+
# puts result["message"]
|
|
22
|
+
# end
|
|
23
|
+
#
|
|
9
24
|
class DataExtractor
|
|
25
|
+
##
|
|
26
|
+
# Initializes a new DataExtractor.
|
|
27
|
+
#
|
|
28
|
+
# @param document [String, File, IO] Path to a PDF file, File object, or IO object.
|
|
29
|
+
#
|
|
10
30
|
def initialize(document)
|
|
11
31
|
@document = document
|
|
12
32
|
end
|
|
13
33
|
|
|
34
|
+
##
|
|
35
|
+
# Main method to extract text from the PDF.
|
|
36
|
+
#
|
|
37
|
+
# @return [Hash] Result hash containing:
|
|
38
|
+
# - "success" [Boolean]
|
|
39
|
+
# - "raw_text" [String] if extraction succeeded
|
|
40
|
+
# - "message" [String] if extraction failed
|
|
41
|
+
#
|
|
14
42
|
def call
|
|
15
43
|
ocr_data(@document)
|
|
16
44
|
end
|
|
17
45
|
|
|
18
46
|
private
|
|
19
47
|
|
|
48
|
+
##
|
|
49
|
+
# Handles parsing the PDF and determining if OCR is needed.
|
|
50
|
+
#
|
|
51
|
+
# @param document [String, File, IO] The PDF document
|
|
52
|
+
# @return [Hash]
|
|
53
|
+
#
|
|
20
54
|
def ocr_data(document)
|
|
21
55
|
extracted_text = ""
|
|
22
56
|
is_scanned = false
|
|
@@ -48,6 +82,13 @@ module Ocr
|
|
|
48
82
|
scanned_pdf_ocr(file)
|
|
49
83
|
end
|
|
50
84
|
|
|
85
|
+
##
|
|
86
|
+
# Returns a File object from the given document
|
|
87
|
+
#
|
|
88
|
+
# @param document [String, File, IO]
|
|
89
|
+
# @return [File]
|
|
90
|
+
# @raise [ArgumentError] if the type is unsupported
|
|
91
|
+
#
|
|
51
92
|
def get_file_from(document)
|
|
52
93
|
return document.tap(&:open) if document.respond_to?(:open)
|
|
53
94
|
return document if document.is_a?(File)
|
|
@@ -57,23 +98,47 @@ module Ocr
|
|
|
57
98
|
raise ArgumentError, "Unsupported document type: #{document.class}"
|
|
58
99
|
end
|
|
59
100
|
|
|
101
|
+
##
|
|
102
|
+
# Safely extract text from a PDF page
|
|
103
|
+
#
|
|
104
|
+
# @param page [PDF::Reader::Page]
|
|
105
|
+
# @return [String]
|
|
106
|
+
#
|
|
60
107
|
def safe_page_text(page)
|
|
61
108
|
page.text.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
|
|
62
109
|
rescue
|
|
63
110
|
""
|
|
64
111
|
end
|
|
65
112
|
|
|
113
|
+
##
|
|
114
|
+
# Determine if a PDF is likely scanned
|
|
115
|
+
#
|
|
116
|
+
# @param text [String]
|
|
117
|
+
# @return [Boolean]
|
|
118
|
+
#
|
|
66
119
|
def scanned_pdf?(text)
|
|
67
120
|
return true if text.empty?
|
|
68
121
|
junk_ratio = text.count("^A-Za-z0-9\s").to_f / text.size
|
|
69
122
|
junk_ratio > 0.5 || text.size < 100
|
|
70
123
|
end
|
|
71
124
|
|
|
125
|
+
##
|
|
126
|
+
# Check if the page is mostly non-text content
|
|
127
|
+
#
|
|
128
|
+
# @param text [String]
|
|
129
|
+
# @return [Boolean]
|
|
130
|
+
#
|
|
72
131
|
def mostly_junk?(text)
|
|
73
132
|
return true if text.empty?
|
|
74
133
|
text.scan(/[A-Za-z]/).count < (text.size * 0.2)
|
|
75
134
|
end
|
|
76
135
|
|
|
136
|
+
##
|
|
137
|
+
# Perform OCR on scanned PDFs
|
|
138
|
+
#
|
|
139
|
+
# @param file [File, String]
|
|
140
|
+
# @return [Hash]
|
|
141
|
+
#
|
|
77
142
|
def scanned_pdf_ocr(file)
|
|
78
143
|
images = []
|
|
79
144
|
full_text = ""
|
|
@@ -94,12 +159,24 @@ module Ocr
|
|
|
94
159
|
cleanup(images)
|
|
95
160
|
end
|
|
96
161
|
|
|
162
|
+
##
|
|
163
|
+
# Convert PDF to PNG images
|
|
164
|
+
#
|
|
165
|
+
# @param pdf_path [String]
|
|
166
|
+
# @return [Array<String>] List of image paths
|
|
167
|
+
#
|
|
97
168
|
def convert_pdf_to_images(pdf_path)
|
|
98
169
|
output_prefix = File.join(Dir.tmpdir, "ocr_page_#{SecureRandom.hex(4)}")
|
|
99
170
|
system("pdftoppm -png -r 300 #{Shellwords.escape(pdf_path)} #{Shellwords.escape(output_prefix)}")
|
|
100
171
|
Dir["#{output_prefix}-*.png"]
|
|
101
172
|
end
|
|
102
173
|
|
|
174
|
+
##
|
|
175
|
+
# Extract text from an image using Tesseract
|
|
176
|
+
#
|
|
177
|
+
# @param image_path [String]
|
|
178
|
+
# @return [String]
|
|
179
|
+
#
|
|
103
180
|
def extract_text(image_path)
|
|
104
181
|
RTesseract.new(image_path, lang: "eng", processor: "mini_magick").to_s
|
|
105
182
|
rescue => e
|
|
@@ -107,10 +184,20 @@ module Ocr
|
|
|
107
184
|
""
|
|
108
185
|
end
|
|
109
186
|
|
|
187
|
+
##
|
|
188
|
+
# Cleanup temporary images
|
|
189
|
+
#
|
|
190
|
+
# @param images [Array<String>]
|
|
191
|
+
#
|
|
110
192
|
def cleanup(images)
|
|
111
193
|
images&.each { |img| File.delete(img) if File.exist?(img) }
|
|
112
194
|
end
|
|
113
195
|
|
|
196
|
+
##
|
|
197
|
+
# Log warnings to Rails logger or stderr
|
|
198
|
+
#
|
|
199
|
+
# @param message [String]
|
|
200
|
+
#
|
|
114
201
|
def log_warning(message)
|
|
115
202
|
if defined?(Rails)
|
|
116
203
|
Rails.logger.warn(message)
|
data/lib/ocr/version.rb
CHANGED
data/ocr.gemspec
CHANGED
|
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
|
|
|
19
19
|
"homepage_uri" => spec.homepage,
|
|
20
20
|
"source_code_uri" => "https://github.com/RaviShankarSinghal/ocr_gem",
|
|
21
21
|
"changelog_uri" => "https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md",
|
|
22
|
-
"documentation_uri" => "https://rubydoc.info/gems/
|
|
22
|
+
"documentation_uri" => "https://rubydoc.info/gems/pdf_ocr/#{spec.version}"
|
|
23
23
|
}
|
|
24
24
|
|
|
25
25
|
spec.files = Dir.chdir(__dir__) do
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pdf_ocr
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ravi Shankar Singhal
|
|
@@ -106,7 +106,7 @@ metadata:
|
|
|
106
106
|
homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
|
|
107
107
|
source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
|
|
108
108
|
changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
|
|
109
|
-
documentation_uri: https://rubydoc.info/gems/
|
|
109
|
+
documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.2
|
|
110
110
|
post_install_message:
|
|
111
111
|
rdoc_options: []
|
|
112
112
|
require_paths:
|