pdf_ocr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 32727eeb24656d1fce7cb43f2f5192f29cfda53192ef161cfae047f2871f6bff
4
+ data.tar.gz: 558586ded2489faf79ce7f36ee1ab6df267d9dc30d67e6ba554be61bde959e19
5
+ SHA512:
6
+ metadata.gz: c02b99bb1e652fe8c26ad80ed8dc4652c8eab5cc9a8bb4699b656080066772f811ee66ced0faa584f9a526322620c9e628f3a47c194f54b900706f968274c4dc
7
+ data.tar.gz: 9d7fea0ffe63fb2c10825d906831fb70dce2f1ab3d3d0c02c814dbd499c81fa906f23bc27e794d5ed3670b381c86b9da1b6f1abc091392684f5c17f97be000b4
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in ocr.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+ gem "rspec"
10
+ gem "pdf-reader"
11
+ gem "mini_magick"
12
+ gem "byebug"
13
+ gem "rtesseract"
data/Gemfile.lock ADDED
@@ -0,0 +1,63 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pdf_ocr (0.1.0)
5
+ mini_magick
6
+ pdf-reader
7
+ rtesseract
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ Ascii85 (2.0.1)
13
+ afm (0.2.2)
14
+ bigdecimal (3.3.1)
15
+ byebug (12.0.0)
16
+ diff-lcs (1.6.2)
17
+ hashery (2.1.2)
18
+ mini_magick (4.13.2)
19
+ mini_portile2 (2.8.9)
20
+ nokogiri (1.18.10)
21
+ mini_portile2 (~> 2.8.2)
22
+ racc (~> 1.4)
23
+ pdf-reader (2.15.0)
24
+ Ascii85 (>= 1.0, < 3.0, != 2.0.0)
25
+ afm (>= 0.2.1, < 2)
26
+ hashery (~> 2.0)
27
+ ruby-rc4
28
+ ttfunk
29
+ racc (1.8.1)
30
+ rake (13.3.0)
31
+ rspec (3.13.1)
32
+ rspec-core (~> 3.13.0)
33
+ rspec-expectations (~> 3.13.0)
34
+ rspec-mocks (~> 3.13.0)
35
+ rspec-core (3.13.5)
36
+ rspec-support (~> 3.13.0)
37
+ rspec-expectations (3.13.5)
38
+ diff-lcs (>= 1.2.0, < 2.0)
39
+ rspec-support (~> 3.13.0)
40
+ rspec-mocks (3.13.6)
41
+ diff-lcs (>= 1.2.0, < 2.0)
42
+ rspec-support (~> 3.13.0)
43
+ rspec-support (3.13.6)
44
+ rtesseract (2.2.0)
45
+ nokogiri
46
+ ruby-rc4 (0.1.5)
47
+ ttfunk (1.8.0)
48
+ bigdecimal (~> 3.1)
49
+
50
+ PLATFORMS
51
+ x86_64-linux
52
+
53
+ DEPENDENCIES
54
+ byebug
55
+ mini_magick
56
+ pdf-reader
57
+ pdf_ocr!
58
+ rake (~> 13.0)
59
+ rspec
60
+ rtesseract
61
+
62
+ BUNDLED WITH
63
+ 2.4.12
data/README.md ADDED
@@ -0,0 +1,138 @@
1
+ # OCR
2
+
3
+ A lightweight Ruby gem for extracting text from PDFs, including scanned PDFs using OCR.
4
+
5
+ This gem supports:
6
+
7
+ - PDFs with readable text
8
+ - Scanned PDFs using Tesseract OCR
9
+ - File objects, file paths, StringIO, and Rails/ActiveStorage uploads
10
+ - Fully Rails-independent
11
+
12
+ ---
13
+
14
+ ## 🚀 Features
15
+
16
+ - Detect if PDF is scanned or text-based
17
+ - Extract text from normal PDFs using `PDF::Reader`
18
+ - Extract text from scanned PDFs using `RTesseract` and `MiniMagick`
19
+ - Automatic cleanup of temporary images
20
+
21
+ ---
22
+
23
+ ## 💻 Installation
24
+
25
+ Add this line to your application's Gemfile:
26
+
27
+ ```ruby
28
+ gem 'pdf_ocr', require: 'ocr', git: 'https://github.com/RaviShankarSinghal/ocr_gem.git'
29
+ ```
30
+
31
+ Or install directly:
32
+ ```bash
33
+ gem install pdf_ocr
34
+ ```
35
+
36
+ ## Dependencies
37
+ - PDF::Reader
38
+
39
+ - RTesseract
40
+
41
+ - MiniMagick
42
+
43
+ - Tesseract OCR (system-level executable)
44
+
45
+ - pdftoppm from Poppler utils (for converting PDF pages to images)
46
+
47
+ ## ⚙️ Usage
48
+ ```ruby
49
+ require 'ocr'
50
+ require 'stringio'
51
+
52
+ # From a File object
53
+ file = File.open("path/to/document.pdf")
54
+ result = Ocr::DataExtractor.new(file).call
55
+ puts result["raw_text"] if result["success"]
56
+
57
+ # From a file path string
58
+ result = Ocr::DataExtractor.new("path/to/document.pdf").call
59
+
60
+ # From a StringIO object (in-memory PDF)
61
+ pdf_data = StringIO.new(File.read("path/to/document.pdf"))
62
+ result = Ocr::DataExtractor.new(pdf_data).call
63
+ ```
64
+
65
+ ## Example Result
66
+ ```ruby
67
+ {
68
+ "success" => true,
69
+ "raw_text" => "Extracted text content from PDF ..."
70
+ }
71
+ ```
72
+ - If OCR fails for a scanned PDF:
73
+ ```ruby
74
+ {
75
+ "success" => false,
76
+ "message" => "Unable to extract text using OCR"
77
+ }
78
+ ```
79
+ ## 🔧 Notes
80
+ 1. Ensure Tesseract OCR is installed on your system:
81
+ ```
82
+ # Ubuntu/Debian
83
+ sudo apt install tesseract-ocr
84
+
85
+ # MacOS (with Homebrew)
86
+ brew install tesseract
87
+ ```
88
+ 2. Ensure pdftoppm is installed (for PDF-to-image conversion):
89
+ ```
90
+ # Ubuntu/Debian
91
+ sudo apt install poppler-utils
92
+
93
+ # MacOS (with Homebrew)
94
+ brew install poppler
95
+ ```
96
+ 3. This gem does not require Rails, but it also accepts Rails ActiveStorage objects (or any other object) that respond to `open`.
97
+
98
+ ## 🧪 Running Tests
99
+ ```
100
+ bundle install
101
+ bundle exec rspec
102
+ ```
103
+
104
+ - PDFs with selectable text
105
+
106
+ - Scanned PDFs
107
+
108
+ - Malformed PDFs (fallback to OCR)
109
+
110
+ ## 📝 Contributing
111
+
112
+ - Fork the repository
113
+
114
+ - Create your feature branch (git checkout -b your-feature)
115
+
116
+ - Commit your changes (git commit -am 'Add new feature')
117
+
118
+ - Push to the branch (git push origin your-feature)
119
+
120
+ - Open a Pull Request
121
+
122
+ ## 📝 License
123
+
124
+ MIT License © RaviShankarSinghal
125
+
126
+
127
+ ---
128
+
129
+ This version includes:
130
+
131
+ - Version and build badges (replace with your repo info)
132
+ - Clear installation instructions
133
+ - Usage examples for File, path, and StringIO
134
+ - System dependencies
135
+ - Test instructions
136
+ - Contributing guidelines
137
+
138
+ ---
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ task default: %i[]
data/lib/ocr/data_extractor.rb ADDED
@@ -0,0 +1,122 @@
1
+ require "mini_magick"
2
+ require "pdf/reader"
3
+ require "rtesseract"
4
+ require "securerandom"
5
+ require "shellwords"
6
+ require "tmpdir"
7
+
8
+ module Ocr
9
+ class DataExtractor
10
+ def initialize(document)
11
+ @document = document
12
+ end
13
+
14
+ def call
15
+ ocr_data(@document)
16
+ end
17
+
18
+ private
19
+
20
+ def ocr_data(document)
21
+ extracted_text = ""
22
+ is_scanned = false
23
+
24
+ file = get_file_from(document)
25
+ reader = if file.respond_to?(:path)
26
+ PDF::Reader.new(file.path)
27
+ else
28
+ PDF::Reader.new(file)
29
+ end
30
+
31
+ reader.pages.each do |page|
32
+ page_text = safe_page_text(page)
33
+ extracted_text << " " << page_text
34
+
35
+ if page_text.strip.empty? || mostly_junk?(page_text)
36
+ is_scanned = true
37
+ break
38
+ end
39
+ end
40
+
41
+ if is_scanned || scanned_pdf?(extracted_text)
42
+ scanned_pdf_ocr(file)
43
+ else
44
+ { "success" => true, "raw_text" => extracted_text.strip }
45
+ end
46
+ rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
47
+ log_warning "PDF parsing failed: #{e.message}"
48
+ scanned_pdf_ocr(file)
49
+ end
50
+
51
+ def get_file_from(document)
52
+ return document.tap(&:open) if document.respond_to?(:open)
53
+ return document if document.is_a?(File)
54
+ return document if document.respond_to?(:read)
55
+ return File.open(document) if document.is_a?(String)
56
+
57
+ raise ArgumentError, "Unsupported document type: #{document.class}"
58
+ end
59
+
60
+ def safe_page_text(page)
61
+ page.text.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
62
+ rescue
63
+ ""
64
+ end
65
+
66
+ def scanned_pdf?(text)
67
+ return true if text.empty?
68
+ junk_ratio = text.count("^A-Za-z0-9\s").to_f / text.size
69
+ junk_ratio > 0.5 || text.size < 100
70
+ end
71
+
72
+ def mostly_junk?(text)
73
+ return true if text.empty?
74
+ text.scan(/[A-Za-z]/).count < (text.size * 0.2)
75
+ end
76
+
77
+ def scanned_pdf_ocr(file)
78
+ images = []
79
+ full_text = ""
80
+
81
+ images = if file.respond_to?(:path)
82
+ convert_pdf_to_images(file.path)
83
+ else
84
+ convert_pdf_to_images(file)
85
+ end
86
+ full_text += images.map { |img| extract_text(img) }.join(" ")
87
+
88
+ unless full_text.strip.empty?
89
+ { "success" => true, "raw_text" => full_text.strip }
90
+ else
91
+ { "success" => false, "message" => "Unable to extract text using OCR" }
92
+ end
93
+ ensure
94
+ cleanup(images)
95
+ end
96
+
97
+ def convert_pdf_to_images(pdf_path)
98
+ output_prefix = File.join(Dir.tmpdir, "ocr_page_#{SecureRandom.hex(4)}")
99
+ system("pdftoppm -png -r 300 #{Shellwords.escape(pdf_path)} #{Shellwords.escape(output_prefix)}")
100
+ Dir["#{output_prefix}-*.png"]
101
+ end
102
+
103
+ def extract_text(image_path)
104
+ RTesseract.new(image_path, lang: "eng", processor: "mini_magick").to_s
105
+ rescue => e
106
+ log_warning "OCR failed on #{image_path}: #{e.message}"
107
+ ""
108
+ end
109
+
110
+ def cleanup(images)
111
+ images&.each { |img| File.delete(img) if File.exist?(img) }
112
+ end
113
+
114
+ def log_warning(message)
115
+ if defined?(Rails)
116
+ Rails.logger.warn(message)
117
+ else
118
+ warn(message)
119
+ end
120
+ end
121
+ end
122
+ end
data/lib/ocr/version.rb ADDED
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ocr
4
+ VERSION = "0.1.0"
5
+ end
data/lib/ocr.rb ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "ocr/version"
4
+ require_relative "ocr/data_extractor"
5
+
6
+ module Ocr
7
+ class Error < StandardError; end
8
+ # The public API is Ocr::DataExtractor, defined in ocr/data_extractor.rb.
9
+ end
data/ocr.gemspec ADDED
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/ocr/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "pdf_ocr"
7
+ spec.version = Ocr::VERSION
8
+ spec.authors = ["Ravi Shankar Singhal"]
9
+ spec.email = ["ravi.singhal2308@gmail.com"]
10
+
11
+ spec.summary = "A lightweight Ruby gem for extracting text from images using OCR."
12
+ spec.description = "OCR is a Ruby gem that allows you to easily extract text from image files (JPG, PNG, PDF) using Tesseract OCR engine. It provides a simple, intuitive interface for integrating OCR capabilities into your Ruby or Rails applications."
13
+ spec.homepage = "https://github.com/RaviShankarSinghal/ocr_gem"
14
+ spec.license = "MIT"
15
+
16
+ spec.required_ruby_version = ">= 2.6.0"
17
+
18
+ spec.metadata = {
19
+ "homepage_uri" => spec.homepage,
20
+ "source_code_uri" => "https://github.com/RaviShankarSinghal/ocr_gem",
21
+ "changelog_uri" => "https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md",
22
+ "documentation_uri" => "https://rubydoc.info/gems/ocr"
23
+ }
24
+
25
+ spec.files = Dir.chdir(__dir__) do
26
+ `git ls-files -z`.split("\x0").reject do |f|
27
+ (File.expand_path(f) == __FILE__) ||
28
+ f.start_with?(*%w[bin/ test/ spec/ features/ .git .circleci appveyor])
29
+ end
30
+ end
31
+
32
+ spec.bindir = "exe"
33
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
34
+ spec.require_paths = ["lib"]
35
+
36
+ # Common dependencies for OCR-based Ruby gems
37
+ # Runtime dependencies
38
+ spec.add_runtime_dependency "pdf-reader"
39
+ spec.add_runtime_dependency "mini_magick"
40
+ spec.add_runtime_dependency "rtesseract"
41
+
42
+ # Development dependencies
43
+ spec.add_development_dependency "rspec"
44
+ spec.add_development_dependency "byebug"
45
+
46
+ end
data/sig/ocr.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module Ocr
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,129 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf_ocr
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ravi Shankar Singhal
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2025-10-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pdf-reader
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: mini_magick
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rtesseract
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: byebug
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: OCR is a Ruby gem that allows you to easily extract text from image files
84
+ (JPG, PNG, PDF) using Tesseract OCR engine. It provides a simple, intuitive interface
85
+ for integrating OCR capabilities into your Ruby or Rails applications.
86
+ email:
87
+ - ravi.singhal2308@gmail.com
88
+ executables: []
89
+ extensions: []
90
+ extra_rdoc_files: []
91
+ files:
92
+ - ".rspec"
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - README.md
96
+ - Rakefile
97
+ - lib/ocr.rb
98
+ - lib/ocr/data_extractor.rb
99
+ - lib/ocr/version.rb
100
+ - ocr.gemspec
101
+ - sig/ocr.rbs
102
+ homepage: https://github.com/RaviShankarSinghal/ocr_gem
103
+ licenses:
104
+ - MIT
105
+ metadata:
106
+ homepage_uri: https://github.com/RaviShankarSinghal/ocr_gem
107
+ source_code_uri: https://github.com/RaviShankarSinghal/ocr_gem
108
+ changelog_uri: https://github.com/RaviShankarSinghal/ocr_gem/blob/main/CHANGELOG.md
109
+ documentation_uri: https://rubydoc.info/gems/ocr
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: 2.6.0
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubygems_version: 3.3.7
126
+ signing_key:
127
+ specification_version: 4
128
+ summary: A lightweight Ruby gem for extracting text from images using OCR.
129
+ test_files: []