pdf_ocr 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f1e04f45ee3dbb8cc9703ad628daa7e1ca20020b8b3dc3b4419706e881bffec3
4
- data.tar.gz: ce753f931e9dc2391c61e0f83feda1ea4f9ae19e8248f9c64a66a14f62544037
3
+ metadata.gz: 874671c2167c8e17c21b59d7805434644565d8d1292bb9fbc7d57383080ace48
4
+ data.tar.gz: 439ee65fbadd68192b60c48a1688fb4b23026be373a9375ee8ad2017d12a2cd1
5
5
  SHA512:
6
- metadata.gz: 8dd14930fc50eed3e0a4aca7bad66318cac441cb906c6b578d347689a644ccbf3a82169b17a04c533e01f7183533ef2cb779c2799c40e7ae8f5b7e57df81af38
7
- data.tar.gz: 9496ebd284e7f4660a3c200fdc08a78f041c515dfd5548421ddca40eac3059f14169a6fc75e284cd6dc5ae34891b33bab7499697bbb40acfc73bb432ed32c633
6
+ metadata.gz: da2067e94fbe2765248887edd40dac4d1587fff186520373dd8d21c889ff28b883770627fa0d926b8ccabd12b76ddb146bef57a3054c5e914d08b7a1ecefd13a
7
+ data.tar.gz: 46d3e8b4391e02bb3333dd92d6e2f3ca92aac0a73eaaa47c54614f65309f94ff36234a0a02c4b04c267da1217e04f9f93e32b1680484c885cc719e26392c71dc
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "mini_magick"
2
4
  require "pdf/reader"
3
5
  require "rtesseract"
@@ -6,17 +8,49 @@ require "shellwords"
6
8
  require "tmpdir"
7
9
 
8
10
  module Ocr
11
+ ##
12
+ # DataExtractor handles PDF text extraction.
13
+ # It can parse regular PDFs or scanned PDFs using OCR.
14
+ #
15
+ # @example Extract text from a PDF
16
+ # extractor = Ocr::DataExtractor.new("example.pdf")
17
+ # result = extractor.call
18
+ # if result["success"]
19
+ # puts result["raw_text"]
20
+ # else
21
+ # puts result["message"]
22
+ # end
23
+ #
9
24
  class DataExtractor
25
+ ##
26
+ # Initializes a new DataExtractor.
27
+ #
28
+ # @param document [String, File, IO] Path to a PDF file, File object, or IO object.
29
+ #
10
30
  def initialize(document)
11
31
  @document = document
12
32
  end
13
33
 
34
+ ##
35
+ # Main method to extract text from the PDF.
36
+ #
37
+ # @return [Hash] Result hash containing:
38
+ # - "success" [Boolean]
39
+ # - "raw_text" [String] if extraction succeeded
40
+ # - "message" [String] if extraction failed
41
+ #
14
42
  def call
15
43
  ocr_data(@document)
16
44
  end
17
45
 
18
46
  private
19
47
 
48
+ ##
49
+ # Handles parsing the PDF and determining if OCR is needed.
50
+ #
51
+ # @param document [String, File, IO] The PDF document
52
+ # @return [Hash]
53
+ #
20
54
  def ocr_data(document)
21
55
  extracted_text = ""
22
56
  is_scanned = false
@@ -48,6 +82,13 @@ module Ocr
48
82
  scanned_pdf_ocr(file)
49
83
  end
50
84
 
85
+ ##
86
+ # Returns a File object from the given document
87
+ #
88
+ # @param document [String, File, IO]
89
+ # @return [File]
90
+ # @raise [ArgumentError] if the type is unsupported
91
+ #
51
92
  def get_file_from(document)
52
93
  return document.tap(&:open) if document.respond_to?(:open)
53
94
  return document if document.is_a?(File)
@@ -57,23 +98,47 @@ module Ocr
57
98
  raise ArgumentError, "Unsupported document type: #{document.class}"
58
99
  end
59
100
 
101
+ ##
102
+ # Safely extract text from a PDF page
103
+ #
104
+ # @param page [PDF::Reader::Page]
105
+ # @return [String]
106
+ #
60
107
  def safe_page_text(page)
61
108
  page.text.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
62
109
  rescue
63
110
  ""
64
111
  end
65
112
 
113
+ ##
114
+ # Determine if a PDF is likely scanned
115
+ #
116
+ # @param text [String]
117
+ # @return [Boolean]
118
+ #
66
119
  def scanned_pdf?(text)
67
120
  return true if text.empty?
68
121
  junk_ratio = text.count("^A-Za-z0-9\s").to_f / text.size
69
122
  junk_ratio > 0.5 || text.size < 100
70
123
  end
71
124
 
125
+ ##
126
+ # Check if the page is mostly non-text content
127
+ #
128
+ # @param text [String]
129
+ # @return [Boolean]
130
+ #
72
131
  def mostly_junk?(text)
73
132
  return true if text.empty?
74
133
  text.scan(/[A-Za-z]/).count < (text.size * 0.2)
75
134
  end
76
135
 
136
+ ##
137
+ # Perform OCR on scanned PDFs
138
+ #
139
+ # @param file [File, String]
140
+ # @return [Hash]
141
+ #
77
142
  def scanned_pdf_ocr(file)
78
143
  images = []
79
144
  full_text = ""
@@ -94,12 +159,24 @@ module Ocr
94
159
  cleanup(images)
95
160
  end
96
161
 
162
+ ##
163
+ # Convert PDF to PNG images
164
+ #
165
+ # @param pdf_path [String]
166
+ # @return [Array<String>] List of image paths
167
+ #
97
168
  def convert_pdf_to_images(pdf_path)
98
169
  output_prefix = File.join(Dir.tmpdir, "ocr_page_#{SecureRandom.hex(4)}")
99
170
  system("pdftoppm -png -r 300 #{Shellwords.escape(pdf_path)} #{Shellwords.escape(output_prefix)}")
100
171
  Dir["#{output_prefix}-*.png"]
101
172
  end
102
173
 
174
+ ##
175
+ # Extract text from an image using Tesseract
176
+ #
177
+ # @param image_path [String]
178
+ # @return [String]
179
+ #
103
180
  def extract_text(image_path)
104
181
  RTesseract.new(image_path, lang: "eng", processor: "mini_magick").to_s
105
182
  rescue => e
@@ -107,10 +184,20 @@ module Ocr
107
184
  ""
108
185
  end
109
186
 
187
+ ##
188
+ # Cleanup temporary images
189
+ #
190
+ # @param images [Array<String>]
191
+ #
110
192
  def cleanup(images)
111
193
  images&.each { |img| File.delete(img) if File.exist?(img) }
112
194
  end
113
195
 
196
+ ##
197
+ # Log warnings to Rails logger or stderr
198
+ #
199
+ # @param message [String]
200
+ #
114
201
  def log_warning(message)
115
202
  if defined?(Rails)
116
203
  Rails.logger.warn(message)
data/lib/ocr/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ocr
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.2"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf_ocr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ravi Shankar Singhal
@@ -106,7 +106,7 @@ metadata:
106
106
  homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
107
107
  source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
108
108
  changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
109
- documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.1
109
+ documentation_uri: https://rubydoc.info/gems/pdf_ocr/0.1.2
110
110
  post_install_message:
111
111
  rdoc_options: []
112
112
  require_paths: