ocr-file 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +56 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +83 -0
- data/LICENSE +21 -0
- data/README.md +121 -0
- data/Rakefile +10 -0
- data/bin/console +11 -0
- data/bin/ocr-file +5 -0
- data/bin/setup +8 -0
- data/lib/ocr-file/cli.rb +5 -0
- data/lib/ocr-file/document.rb +195 -0
- data/lib/ocr-file/file_helpers.rb +40 -0
- data/lib/ocr-file/image_engines/image_magick.rb +14 -0
- data/lib/ocr-file/image_engines/pdf_engine.rb +75 -0
- data/lib/ocr-file/image_engines/pdftoppm.rb +27 -0
- data/lib/ocr-file/ocr_engines/cloud_vision.rb +59 -0
- data/lib/ocr-file/ocr_engines/tesseract.rb +22 -0
- data/lib/ocr-file/version.rb +3 -0
- data/lib/ocr-file.rb +19 -0
- data/ocr-file.gemspec +38 -0
- metadata +151 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6b558ce36c35e74b410f42928eae1a987485d1bbd64da77750574062bc05b91e
|
4
|
+
data.tar.gz: d906c620a02c5a2d139b3d89d05e9b3872ee6c929b4aa661b20f9033d8f3605f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 81049908609ba3d622be2b6f99dabeca2960a455fa3d56ee1fca4c177c2ee4365281421c1128ec3fa5476d068daa53b3d7f7600c5fd1c31fcb5834ca688f9747
|
7
|
+
data.tar.gz: 1a7dcd56a7196694371abf70633635545138bdc7bc0af2873fc5e7c22bdbfc97e9986ba1a18afc288b24e488caf999b644ec1f9d8889ce6e5efa6fcfe776c204
|
data/.gitignore
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
# Used by dotenv library to load environment variables.
|
14
|
+
# .env
|
15
|
+
|
16
|
+
# Ignore Byebug command history file.
|
17
|
+
.byebug_history
|
18
|
+
|
19
|
+
## Specific to RubyMotion:
|
20
|
+
.dat*
|
21
|
+
.repl_history
|
22
|
+
build/
|
23
|
+
*.bridgesupport
|
24
|
+
build-iPhoneOS/
|
25
|
+
build-iPhoneSimulator/
|
26
|
+
|
27
|
+
## Specific to RubyMotion (use of CocoaPods):
|
28
|
+
#
|
29
|
+
# We recommend against adding the Pods directory to your .gitignore. However
|
30
|
+
# you should judge for yourself, the pros and cons are mentioned at:
|
31
|
+
# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
|
32
|
+
#
|
33
|
+
# vendor/Pods/
|
34
|
+
|
35
|
+
## Documentation cache and generated files:
|
36
|
+
/.yardoc/
|
37
|
+
/_yardoc/
|
38
|
+
/doc/
|
39
|
+
/rdoc/
|
40
|
+
|
41
|
+
## Environment normalization:
|
42
|
+
/.bundle/
|
43
|
+
/vendor/bundle
|
44
|
+
/lib/bundler/man/
|
45
|
+
|
46
|
+
# for a library or gem, you might want to ignore these files since the code is
|
47
|
+
# intended to run in multiple environments; otherwise, check them in:
|
48
|
+
# Gemfile.lock
|
49
|
+
# .ruby-version
|
50
|
+
# .ruby-gemset
|
51
|
+
|
52
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
53
|
+
.rvmrc
|
54
|
+
|
55
|
+
# Used by RuboCop. Remote config files pulled in from inherit_from directive.
|
56
|
+
# .rubocop-https?--*
|
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
2
|
+
|
3
|
+
## Our Pledge
|
4
|
+
|
5
|
+
In the interest of fostering an open and welcoming environment, we as
|
6
|
+
contributors and maintainers pledge to making participation in our project and
|
7
|
+
our community a harassment-free experience for everyone, regardless of age, body
|
8
|
+
size, disability, ethnicity, gender identity and expression, level of experience,
|
9
|
+
nationality, personal appearance, race, religion, or sexual identity and
|
10
|
+
orientation.
|
11
|
+
|
12
|
+
## Our Standards
|
13
|
+
|
14
|
+
Examples of behaviour that contributes to creating a positive environment
|
15
|
+
include:
|
16
|
+
|
17
|
+
* Using welcoming and inclusive language
|
18
|
+
* Being respectful of differing viewpoints and experiences
|
19
|
+
* Gracefully accepting constructive criticism
|
20
|
+
* Focusing on what is best for the community
|
21
|
+
* Showing empathy towards other community members
|
22
|
+
|
23
|
+
Examples of unacceptable behaviour by participants include:
|
24
|
+
|
25
|
+
* The use of sexualised language or imagery and unwelcome sexual attention or
|
26
|
+
advances
|
27
|
+
* Trolling, insulting/derogatory comments, and personal or political attacks
|
28
|
+
* Public or private harassment
|
29
|
+
* Publishing others' private information, such as a physical or electronic
|
30
|
+
address, without explicit permission
|
31
|
+
* Other conduct which could reasonably be considered inappropriate in a
|
32
|
+
professional setting
|
33
|
+
|
34
|
+
## Our Responsibilities
|
35
|
+
|
36
|
+
Project maintainers are responsible for clarifying the standards of acceptable
|
37
|
+
behaviour and are expected to take appropriate and fair corrective action in
|
38
|
+
response to any instances of unacceptable behaviour.
|
39
|
+
|
40
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
41
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
42
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
43
|
+
permanently any contributor for other behaviours that they deem inappropriate,
|
44
|
+
threatening, offensive, or harmful.
|
45
|
+
|
46
|
+
## Scope
|
47
|
+
|
48
|
+
This Code of Conduct applies both within project spaces and in public spaces
|
49
|
+
when an individual is representing the project or its community. Examples of
|
50
|
+
representing a project or community include using an official project e-mail
|
51
|
+
address, posting via an official social media account, or acting as an appointed
|
52
|
+
representative at an online or offline event. Representation of a project may be
|
53
|
+
further defined and clarified by project maintainers.
|
54
|
+
|
55
|
+
## Enforcement
|
56
|
+
|
57
|
+
Instances of abusive, harassing, or otherwise unacceptable behaviour may be
|
58
|
+
reported by contacting the project team at contact@jasonchalom.com. All
|
59
|
+
complaints will be reviewed and investigated and will result in a response that
|
60
|
+
is deemed necessary and appropriate to the circumstances. The project team is
|
61
|
+
obligated to maintain confidentiality with regard to the reporter of an incident.
|
62
|
+
Further details of specific enforcement policies may be posted separately.
|
63
|
+
|
64
|
+
Project maintainers who do not follow or enforce the Code of Conduct in good
|
65
|
+
faith may face temporary or permanent repercussions as determined by other
|
66
|
+
members of the project's leadership.
|
67
|
+
|
68
|
+
## Attribution
|
69
|
+
|
70
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
71
|
+
available at [http://contributor-covenant.org/version/1/4][version]
|
72
|
+
|
73
|
+
[homepage]: http://contributor-covenant.org
|
74
|
+
[version]: http://contributor-covenant.org/version/1/4/
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
ocr-file (0.0.1)
|
5
|
+
active_attr (~> 0.15.4)
|
6
|
+
console-style (~> 0.0.1)
|
7
|
+
hexapdf (~> 0.23.0)
|
8
|
+
mini_magick (~> 4.11.0)
|
9
|
+
rtesseract (~> 3.1.2)
|
10
|
+
|
11
|
+
GEM
|
12
|
+
remote: https://rubygems.org/
|
13
|
+
specs:
|
14
|
+
actionpack (7.0.3)
|
15
|
+
actionview (= 7.0.3)
|
16
|
+
activesupport (= 7.0.3)
|
17
|
+
rack (~> 2.0, >= 2.2.0)
|
18
|
+
rack-test (>= 0.6.3)
|
19
|
+
rails-dom-testing (~> 2.0)
|
20
|
+
rails-html-sanitizer (~> 1.0, >= 1.2.0)
|
21
|
+
actionview (7.0.3)
|
22
|
+
activesupport (= 7.0.3)
|
23
|
+
builder (~> 3.1)
|
24
|
+
erubi (~> 1.4)
|
25
|
+
rails-dom-testing (~> 2.0)
|
26
|
+
rails-html-sanitizer (~> 1.1, >= 1.2.0)
|
27
|
+
active_attr (0.15.4)
|
28
|
+
actionpack (>= 3.0.2, < 7.1)
|
29
|
+
activemodel (>= 3.0.2, < 7.1)
|
30
|
+
activesupport (>= 3.0.2, < 7.1)
|
31
|
+
activemodel (7.0.3)
|
32
|
+
activesupport (= 7.0.3)
|
33
|
+
activesupport (7.0.3)
|
34
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
35
|
+
i18n (>= 1.6, < 2)
|
36
|
+
minitest (>= 5.1)
|
37
|
+
tzinfo (~> 2.0)
|
38
|
+
builder (3.2.4)
|
39
|
+
cmdparse (3.0.7)
|
40
|
+
coderay (1.1.3)
|
41
|
+
concurrent-ruby (1.1.10)
|
42
|
+
console-style (0.0.1)
|
43
|
+
crass (1.0.6)
|
44
|
+
erubi (1.10.0)
|
45
|
+
geom2d (0.3.1)
|
46
|
+
hexapdf (0.23.0)
|
47
|
+
cmdparse (~> 3.0, >= 3.0.3)
|
48
|
+
geom2d (~> 0.3)
|
49
|
+
i18n (1.10.0)
|
50
|
+
concurrent-ruby (~> 1.0)
|
51
|
+
loofah (2.18.0)
|
52
|
+
crass (~> 1.0.2)
|
53
|
+
nokogiri (>= 1.5.9)
|
54
|
+
method_source (1.0.0)
|
55
|
+
mini_magick (4.11.0)
|
56
|
+
minitest (5.16.0)
|
57
|
+
nokogiri (1.13.6-arm64-darwin)
|
58
|
+
racc (~> 1.4)
|
59
|
+
pry (0.14.1)
|
60
|
+
coderay (~> 1.1)
|
61
|
+
method_source (~> 1.0)
|
62
|
+
racc (1.6.0)
|
63
|
+
rack (2.2.3.1)
|
64
|
+
rack-test (1.1.0)
|
65
|
+
rack (>= 1.0, < 3)
|
66
|
+
rails-dom-testing (2.0.3)
|
67
|
+
activesupport (>= 4.2.0)
|
68
|
+
nokogiri (>= 1.6)
|
69
|
+
rails-html-sanitizer (1.4.3)
|
70
|
+
loofah (~> 2.3)
|
71
|
+
rtesseract (3.1.2)
|
72
|
+
tzinfo (2.0.4)
|
73
|
+
concurrent-ruby (~> 1.0)
|
74
|
+
|
75
|
+
PLATFORMS
|
76
|
+
arm64-darwin-20
|
77
|
+
|
78
|
+
DEPENDENCIES
|
79
|
+
ocr-file!
|
80
|
+
pry (~> 0.14.1)
|
81
|
+
|
82
|
+
BUNDLED WITH
|
83
|
+
2.3.5
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2022 Jason Chalom
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
# OCR-File
|
2
|
+
A tool to combine PDF tools, OCR tools and image processing into a
|
3
|
+
single interface as both a CLI and a library.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'ocr-file'
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install ocr-file
|
20
|
+
|
21
|
+
### Other required dependencies
|
22
|
+
You will need to install `tesseract` with your desired language on your system,
|
23
|
+
`pdftoppm` needs to be available and also `image-magick`.
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
```ruby
|
27
|
+
require 'ocr-file'
|
28
|
+
|
29
|
+
config = {
|
30
|
+
# Images from PDF
|
31
|
+
filetype: 'png',
|
32
|
+
quality: 100,
|
33
|
+
dpi: 300,
|
34
|
+
# Text to PDF
|
35
|
+
font: 'Helvetica',
|
36
|
+
font_size: 5, #8 # 12
|
37
|
+
text_x: 20,
|
38
|
+
text_y: 800,
|
39
|
+
minimum_word: 5,
|
40
|
+
# Cloud-Vision OCR
|
41
|
+
image_annotator: nil, # Needed for Cloud-Vision
|
42
|
+
type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
|
43
|
+
ocr_engine: 'tesseract', # 'cloud-vision'
|
44
|
+
# Image Pre-Processing
|
45
|
+
image_pre_preprocess: true,
|
46
|
+
effects: ['bw', 'norm'],
|
47
|
+
threshold: 0.25,
|
48
|
+
# PDF to Image Processing
|
49
|
+
optimise_pdf: true,
|
50
|
+
extract_pdf_images: true, # if false will screenshot each PDF page
|
51
|
+
temp_filename_prefix: 'image',
|
52
|
+
# Console Output
|
53
|
+
verbose: true,
|
54
|
+
}
|
55
|
+
|
56
|
+
doc = OcrFile::Document.new(
|
57
|
+
original_file_path: '/path-to-original-file/', # supports PDFs and images
|
58
|
+
save_file_path: '/folder-to-save-to/',
|
59
|
+
config: config # Not needed as defaults are used when not provided
|
60
|
+
)
|
61
|
+
|
62
|
+
doc.to_s # Returns text, removes temp files and won't save
|
63
|
+
doc.to_pdf # Saves a PDF (either searchable over the images or dumped text)
|
64
|
+
doc.to_text # Saves a text file with OCR text
|
65
|
+
|
66
|
+
# How to generate PDFs of images or text files:
|
67
|
+
original_file_path = 'file.txt' # or 'file.png'
|
68
|
+
|
69
|
+
doc = OcrFile::Document.new(
|
70
|
+
original_file_path: original_file_path, # supports PDFs and images
|
71
|
+
save_file_path: '/folder-to-save-to/',
|
72
|
+
config: config # Not needed as defaults are used when not provided
|
73
|
+
)
|
74
|
+
|
75
|
+
doc.to_pdf
|
76
|
+
|
77
|
+
# How to merge files into a single PDF:
|
78
|
+
file_paths = []
|
79
|
+
documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
|
80
|
+
merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
|
81
|
+
OcrFile::ImageEngines::PdfEngine.save_pdf(merged_document, save_file_path, optimise: true)
|
82
|
+
```
|
83
|
+
|
84
|
+
### Notes / Tips
|
85
|
+
Set `extract_pdf_images` to `false` for higher quality OCR. However, this will consume more temporary space per PDF page and will also be considerably slower.
|
86
|
+
|
87
|
+
Image pre-processing is not yet implemented.
|
88
|
+
|
89
|
+
## Development
|
90
|
+
|
91
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
92
|
+
|
93
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
94
|
+
|
95
|
+
### TODOs
|
96
|
+
- input validation
|
97
|
+
- CLI
|
98
|
+
- image processing
|
99
|
+
- password
|
100
|
+
- Base64 encoding
|
101
|
+
- requirements checking (installed dependencies etc ...)
|
102
|
+
- Tests
|
103
|
+
- Configurable temp folder cleanup
|
104
|
+
- Improve console output
|
105
|
+
|
106
|
+
### Tests
|
107
|
+
To run tests execute:
|
108
|
+
|
109
|
+
$ rake test
|
110
|
+
|
111
|
+
## Contributing
|
112
|
+
|
113
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/trex22/ocr-file. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
114
|
+
|
115
|
+
## License
|
116
|
+
|
117
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
118
|
+
|
119
|
+
## Code of Conduct
|
120
|
+
|
121
|
+
Everyone interacting in the OCR-File project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/trex22/ocr-file/blob/master/CODE_OF_CONDUCT.md).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "ocr-file"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
require "pry"
|
11
|
+
Pry.start
|
data/bin/ocr-file
ADDED
data/bin/setup
ADDED
data/lib/ocr-file/cli.rb
ADDED
@@ -0,0 +1,195 @@
|
|
1
|
+
module OcrFile
  # Wraps one source file (PDF, image or plain text) and converts it to a PDF,
  # a text file, or an in-memory string using the configured OCR engine.
  class Document
    ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
    PAGE_BREAK = "\n\r\n" # TODO: Make configurable
    DEFAULT_CONFIG = {
      # Images from PDF
      filetype: 'png',
      quality: 100,
      dpi: 300,
      # Text to PDF
      font: 'Helvetica',
      font_size: 5, #8 # 12
      text_x: 20,
      text_y: 800,
      minimum_word: 5,
      # Cloud-Vision OCR
      image_annotator: nil, # Needed for Cloud-Vision
      type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
      ocr_engine: 'tesseract', # 'cloud-vision'
      # Image Pre-Processing
      image_pre_preprocess: true,
      effects: ['bw', 'norm'],
      threshold: 0.25,
      # PDF to Image Processing
      optimise_pdf: true,
      extract_pdf_images: true, # if false will screenshot each PDF page
      temp_filename_prefix: 'image',
      # Console Output
      verbose: true,
    }

    attr_reader :original_file_path,
      :filename,
      :save_file_path,
      :final_save_file,
      :config,
      :ocr_engine

    # original_file_path - path of the PDF, image or text file to process.
    # save_file_path     - folder that receives the output; a "temp" folder
    #                      for intermediate images is created beneath it.
    # config             - Hash of overrides; any key that is not supplied
    #                      falls back to the value in DEFAULT_CONFIG.
    #
    # TODO: Add in more input validation
    def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
      @original_file_path = original_file_path
      @filename = original_file_path.split('/').last.split('.').first

      date = Time.now.to_s.split(' ').first

      @save_file_path = save_file_path
      @final_save_file = "#{@save_file_path}/#{@filename}-#{date}-#{Time.now.to_i}"

      # Merge over the defaults so a partial config never yields nil settings.
      @config = DEFAULT_CONFIG.merge(config || {})
      @ocr_engine = find_ocr_engine(@config[:ocr_engine])
    end

    # True when the file extension is "pdf" (case-insensitive).
    def pdf?
      file_extension == 'pdf'
    end

    # True when the file extension is one of ACCEPTED_IMAGE_TYPES
    # (case-insensitive).
    def image?
      ACCEPTED_IMAGE_TYPES.include?(file_extension)
    end

    # Treat anything which isn't a PDF or image as text
    def text?
      !pdf? && !image?
    end

    # Writes "<final_save_file>.pdf". PDFs are split into page images, each
    # image is OCRed into a PDF and the results are merged; text files are
    # rendered as text pages; images are OCRed directly.
    def to_pdf
      if pdf?
        with_pdf_page_images do |image_paths|
          pdfs_to_merge = image_paths.map do |image_path|
            @ocr_engine.ocr_to_pdf(image_path, options: @config)
          end

          merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)

          OcrFile::ImageEngines::PdfEngine
            .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
        end
      elsif text?
        text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
        pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)

        OcrFile::ImageEngines::PdfEngine
          .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
      else # is an image
        ocr_image_to_pdf
      end
    end

    # Writes the OCR text to "<final_save_file>.txt" for PDFs and images.
    # For a text source the file contents are returned and nothing is written.
    def to_text
      if pdf?
        with_pdf_page_images do |image_paths|
          image_paths.each do |image_path|
            text = @ocr_engine.ocr_to_text(image_path, options: @config)
            ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
          end
        end
      elsif text?
        ::OcrFile::FileHelpers.open_text_file(@original_file_path)
      else # is an image
        ocr_image_to_text(save: true)
      end
    end

    # Returns the text of the document as a String without saving it.
    # For PDFs every page's text is preceded by PAGE_BREAK.
    def to_s
      if pdf?
        with_pdf_page_images do |image_paths|
          text = ''

          image_paths.each do |image_path|
            text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
          end

          text
        end
      elsif text?
        ::OcrFile::FileHelpers.open_text_file(@original_file_path)
      else # is an image
        ocr_image_to_text(save: false)
      end
    end

    # Removes the temp folder. Safe to call when no temp folder was created.
    def close
      return if @temp_folder_path.nil?

      ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
    end

    private

    # Lower-cased extension of the source file without the leading dot.
    def file_extension
      File.extname(@original_file_path.to_s).delete_prefix('.').downcase
    end

    # Creates the temp folder, yields the page image paths extracted from the
    # source PDF and always removes the temp folder afterwards, even when the
    # block raises. Returns the block's value.
    def with_pdf_page_images
      create_temp_folder
      yield extract_image_paths_from_pdf(@original_file_path)
    ensure
      close
    end

    def extract_image_paths_from_pdf(file_path)
      document = OcrFile::ImageEngines::PdfEngine.open_pdf(file_path, password: '')

      if @config[:extract_pdf_images]
        OcrFile::ImageEngines::PdfEngine
          .extract_images(document, @temp_folder_path, verbose: @config[:verbose])
      else # Generate screenshots of each image
        OcrFile::ImageEngines::Pdftoppm.images_from_pdf(
          file_path,
          @temp_folder_path,
          filename: @config[:temp_filename_prefix],
          filetype: @config[:filetype],
          quality: @config[:quality],
          dpi: @config[:dpi],
          verbose: @config[:verbose]
        )
      end
    end

    def create_temp_folder
      # TODO: Make this a bit more robust
      # Spaces are backslash-escaped because the path is interpolated into
      # shell commands by FileHelpers and Pdftoppm.
      @temp_folder_path = "#{save_file_path}/temp/".gsub(' ', '\ ')
      ::OcrFile::FileHelpers.make_directory(@temp_folder_path)
    end

    def ocr_image_to_pdf
      pdf_document = @ocr_engine.ocr_to_pdf(@original_file_path, options: @config)
      OcrFile::ImageEngines::PdfEngine
        .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
    end

    # OCRs the source image; appends the text to "<final_save_file>.txt" when
    # save is true, otherwise returns the text.
    def ocr_image_to_text(save: true)
      text = @ocr_engine.ocr_to_text(@original_file_path, options: @config)

      if save
        ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
      else
        text
      end
    end

    # Returns the module under OcrFile::OcrEngines whose `id` equals
    # engine_id, or nil when none matches.
    def find_ocr_engine(engine_id)
      ocr_engine_constants
        .map { |c| ocr_module(c) }
        .find { |selected_module| selected_module.id == engine_id }
    end

    def ocr_module(constant)
      OcrFile::OcrEngines.const_get(constant)
    end

    def ocr_engine_constants
      OcrFile::OcrEngines.constants
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'json' # open_json relies on JSON.parse

module OcrFile
  # Small file-system and shell helpers shared by the engines and Document.
  module FileHelpers
    extend self

    # Opens every PDF in file_paths, merges them in order and writes the
    # result to save_file_path (optimised).
    def merge_pdfs(file_paths, save_file_path)
      documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path) }
      merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
      # save_pdf lives on PdfEngine, not on this module.
      OcrFile::ImageEngines::PdfEngine.save_pdf(merged_document, save_file_path, optimise: true)
    end

    # Beware this is dangerous!
    # Recursively deletes path via `rm -rf`. Does nothing (returns nil) when
    # path is nil or does not contain "/temp".
    def clear_folder(path)
      return unless path.to_s.include?('/temp') # Small hacky safeguard
      `rm -rf #{path}` # Cleanup
    end

    # Creates path (and missing parents) via `mkdir -p`. The path is
    # interpolated into a shell command unescaped.
    def make_directory(path)
      `mkdir -p #{path}`
    end

    # Reads the file at path and returns the parsed JSON.
    def open_json(path)
      JSON.parse(File.read(path))
    end

    # Appends text to the file at path, creating the file if needed.
    def append_file(path, text)
      File.open(path, 'a') { |file| file.write(text) }
    end

    # Returns the entire contents of the file at path.
    def open_text_file(path)
      File.read(path)
    end

    # Lists save_path and returns "<save_path>/<name>" for every entry that
    # matches the grep pattern ".<filetype>".
    # NOTE(review): temp_filename is accepted but not used for filtering, and
    # the leading "." in the grep pattern matches any character.
    def fetch_temp_image_paths(save_path, temp_filename, filetype)
      filenames = `ls #{save_path} | grep .#{filetype}`.split("\n")
      filenames.map do |filename|
        "#{save_path}/#{filename}"
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module OcrFile
  module ImageEngines
    # HexaPDF-backed helpers for creating, opening, merging and saving PDFs
    # and for extracting the images embedded in a PDF.
    module PdfEngine
      extend self

      PAGE_BREAK = "\n\r\n"

      DEFAULT_PAGE_OPTIONS = {
        font: 'Helvetica',
        font_size: 5, #8 # 12
        text_x: 20,
        text_y: 800,
        minimum_word: 5,
      }

      # Builds a new PDF with one page per PAGE_BREAK-separated chunk of text.
      # Chunks shorter than options[:minimum_word] characters are skipped.
      # Keys missing from options fall back to DEFAULT_PAGE_OPTIONS.
      def pdf_from_text(text, options = DEFAULT_PAGE_OPTIONS)
        page_options = DEFAULT_PAGE_OPTIONS.merge(options || {})
        document = ::HexaPDF::Document.new

        text
          .split(PAGE_BREAK)
          .reject { |line| line.size < page_options[:minimum_word] }
          .each { |page_text| document = add_page(document, page_text, page_options) }

        document
      end

      # Adds a page to document and draws text on it at
      # (options[:text_x], options[:text_y]) using options[:font] and
      # options[:font_size]. Returns the document.
      def add_page(document, text, options)
        canvas = document.pages.add.canvas
        canvas.font(options[:font], size: options[:font_size])
        canvas.text(text, at: [options[:text_x], options[:text_y]])

        document
      end

      # Writes document to save_file_path, passing the caller's optimise
      # choice through to the writer's optimize option.
      def save_pdf(document, save_file_path, optimise: true)
        document.write(save_file_path, optimize: optimise)
      end

      # Opens the PDF at file, using password for decryption.
      def open_pdf(file, password: '')
        ::HexaPDF::Document.open(file, decryption_opts: { password: password })
      end

      # Writes every writable image found in document to
      # "<save_path>/<index>.<extension>" and returns the written paths.
      # Progress and unsupported-format warnings are printed only when
      # verbose is true.
      def extract_images(document, save_path, verbose: false)
        image_paths = []

        # each_image is not public on HexaPDF::CLI::Images, hence `send`.
        ::HexaPDF::CLI::Images.new.send(:each_image, document) do |image, index, pindex, (_x_ppi, _y_ppi)|
          puts "Processing page: #{pindex} ..." if verbose
          info = image.info

          if info.writable
            image_path = "#{save_path}/#{index}.#{info.extension}"
            image.write(image_path)

            image_paths << image_path
          elsif verbose
            warn "Warning (image #{index}, page #{pindex}): PDF image format not supported for writing"
          end
        end

        image_paths
      end

      # Returns a new document containing the pages of all documents, in order.
      def merge(documents)
        target = ::HexaPDF::Document.new

        documents.each do |document|
          document.pages.each { |page| target.pages << target.import(page) }
        end

        target
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module OcrFile
  module ImageEngines
    # Renders each page of a PDF to an image by shelling out to `pdftoppm`.
    module Pdftoppm
      extend self

      # TODO: other options
      # https://www.xpdfreader.com/pdftoppm-man.html
      # password
      # -mono Generate a monochrome PBM file (instead of an RGB PPM file).
      # -gray Generate a grayscale PGM file (instead of an RGB PPM file).
      # -cmyk Generate a CMYK PAM file (instead of an RGB PPM file).

      # Runs pdftoppm on pdf_path, writing images named "<filename>..." into
      # save_path at the given dpi, then returns the image paths reported by
      # FileHelpers.fetch_temp_image_paths.
      #
      # filetype 'jpg' uses the -jpeg flag together with the quality setting;
      # any other filetype is passed straight through as "-<filetype>".
      # Progress output is printed only when verbose is true.
      #
      # NOTE(review): pdf_path and save_path are interpolated into the shell
      # command unescaped; callers must escape spaces themselves.
      def images_from_pdf(pdf_path, save_path, filename: 'image', filetype: 'png', quality: 100, dpi: 300, verbose: true)
        print 'Generating screenshots of each PDF page ... ' if verbose

        if filetype == 'jpg'
          `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
        else
          `pdftoppm -#{filetype} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
        end

        puts 'Complete!' if verbose

        OcrFile::FileHelpers.fetch_temp_image_paths(save_path, filename, filetype)
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module OcrFile
  module OcrEngines
    # OCR engine that delegates to a caller-supplied Cloud Vision image
    # annotator (passed in as options[:image_annotator]).
    module CloudVision
      extend self

      DEFAULT_LANGUAGE = 'en'

      # Available Types: https://github.com/googleapis/google-cloud-ruby/blob/master/google-cloud-vision/lib/google/cloud/vision/v1/image_annotator_pb.rb
      TEXT_DETECTION = 'TEXT_DETECTION' # Used for low-quality images
      DOCUMENT_TEXT_DETECTION = 'DOCUMENT_TEXT_DETECTION' # Used for dense text documents

      # Identifier matched against config[:ocr_engine].
      def id
        'cloud-vision'
      end

      # Runs text detection on file_path and returns the recognised text.
      #
      # options[:type_of_ocr]     - DOCUMENT_TEXT_DETECTION selects document
      #                             detection; anything else uses text detection.
      # options[:image_annotator] - object responding to
      #                             document_text_detection / text_detection.
      def ocr_to_text(file_path, options: { type_of_ocr: '', image_annotator: nil })
        type_of_ocr = options[:type_of_ocr]
        image_annotator = options[:image_annotator]

        response = detect_text(type_of_ocr, file_path, image_annotator)
        extract_text(response)
      end

      # OCRs file_path and renders the text into a PDF document. The caller's
      # options are used for both the OCR request and the page layout.
      def ocr_to_pdf(file_path, options: { type_of_ocr: '', image_annotator: nil })
        text = ocr_to_text(file_path, options: options)
        OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, options)
      end

      private

      def detect_text(type_of_ocr, image_path, image_annotator)
        if type_of_ocr == DOCUMENT_TEXT_DETECTION
          image_annotator.document_text_detection(image: image_path)
        else
          image_annotator.text_detection(image: image_path)
        end
      end

      # Concatenates every annotation description in the response, then drops
      # the final line of the combined text.
      def extract_text(response)
        raw_text = String.new

        response.responses.each do |section|
          section.text_annotations.each do |annotation|
            raw_text << annotation.description
          end
        end

        lines = raw_text.split("\n")
        lines.pop # Remove the last line
        lines.join("\n")
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module OcrFile
  module OcrEngines
    # OCR engine backed by the RTesseract wrapper.
    module Tesseract
      extend self

      # Identifier matched against config[:ocr_engine].
      def id
        'tesseract'
      end

      # Returns the text RTesseract recognises in the image at file_path.
      # The options argument is accepted but not used.
      def ocr_to_text(file_path, options: {})
        ::RTesseract.new(file_path).to_s
      end

      # Has RTesseract produce a PDF for the image at file_path and opens
      # that output through PdfEngine. The options argument is accepted but
      # not used.
      def ocr_to_pdf(file_path, options: {})
        pdf_output = ::RTesseract.new(file_path).to_pdf
        OcrFile::ImageEngines::PdfEngine.open_pdf(pdf_output, password: '')
      end
    end
  end
end
|
data/lib/ocr-file.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'hexapdf'
|
2
|
+
require 'hexapdf/cli/images'
|
3
|
+
require 'rtesseract'
|
4
|
+
require 'mini_magick'
|
5
|
+
|
6
|
+
require 'ocr-file/version'
|
7
|
+
|
8
|
+
require 'ocr-file/image_engines/pdf_engine'
|
9
|
+
require 'ocr-file/image_engines/image_magick'
|
10
|
+
require 'ocr-file/image_engines/pdftoppm'
|
11
|
+
require 'ocr-file/ocr_engines/tesseract'
|
12
|
+
require 'ocr-file/ocr_engines/cloud_vision'
|
13
|
+
require 'ocr-file/file_helpers'
|
14
|
+
require 'ocr-file/document'
|
15
|
+
require 'ocr-file/cli'
|
16
|
+
|
17
|
+
module OcrFile
  # Error class for this gem; a direct subclass of StandardError.
  class Error < StandardError
  end
end
|
data/ocr-file.gemspec
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# Put the gem's lib/ folder on the load path so the version file can be required.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "ocr-file/version"

Gem::Specification.new do |spec|
  spec.name = "ocr-file"
  spec.version = OcrFile::VERSION
  spec.authors = ["trex22"]
  spec.email = ["contact@jasonchalom.com"]

  spec.summary = "A tool to combine PDF tools, OCR tools and image processing into a single interface as both a CLI and a library."
  spec.description = "A tool to combine PDF tools, OCR tools and image processing into a single interface as both a CLI and a library."
  spec.homepage = "https://github.com/TRex22/ocr-file"

  spec.license = "MIT"

  # Package every file tracked by git, except anything under test/, spec/ or
  # features/.
  test_path_pattern = %r{^(test|spec|features)/}
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    tracked_files = `git ls-files -z`.split("\x0")
    tracked_files.reject { |path| path.match?(test_path_pattern) }
  end

  spec.bindir = "bin"
  spec.executables = ["ocr-file"] #spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.select { |path| path.match?(test_path_pattern) }
  spec.require_paths = ["lib"]

  # Dependencies
  {
    "console-style" => "~> 0.0.1",
    "active_attr" => "~> 0.15.4",
    "hexapdf" => "~> 0.23.0",
    "rtesseract" => "~> 3.1.2",
    "mini_magick" => "~> 4.11.0",
  }.each do |gem_name, requirement|
    spec.add_dependency gem_name, requirement
  end

  # Development Dependencies
  spec.add_development_dependency "pry", "~> 0.14.1"
end
|
metadata
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ocr-file
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- trex22
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-06-19 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: console-style
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.0.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.0.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: active_attr
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.15.4
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.15.4
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: hexapdf
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.23.0
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.23.0
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rtesseract
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 3.1.2
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 3.1.2
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: mini_magick
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 4.11.0
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 4.11.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: pry
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.14.1
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.14.1
|
97
|
+
description: A tool to combine PDF tools, OCR tools and image processing into a single
|
98
|
+
interface as both a CLI and a library.
|
99
|
+
email:
|
100
|
+
- contact@jasonchalom.com
|
101
|
+
executables:
|
102
|
+
- ocr-file
|
103
|
+
extensions: []
|
104
|
+
extra_rdoc_files: []
|
105
|
+
files:
|
106
|
+
- ".gitignore"
|
107
|
+
- CODE_OF_CONDUCT.md
|
108
|
+
- Gemfile
|
109
|
+
- Gemfile.lock
|
110
|
+
- LICENSE
|
111
|
+
- README.md
|
112
|
+
- Rakefile
|
113
|
+
- bin/console
|
114
|
+
- bin/ocr-file
|
115
|
+
- bin/setup
|
116
|
+
- lib/ocr-file.rb
|
117
|
+
- lib/ocr-file/cli.rb
|
118
|
+
- lib/ocr-file/document.rb
|
119
|
+
- lib/ocr-file/file_helpers.rb
|
120
|
+
- lib/ocr-file/image_engines/image_magick.rb
|
121
|
+
- lib/ocr-file/image_engines/pdf_engine.rb
|
122
|
+
- lib/ocr-file/image_engines/pdftoppm.rb
|
123
|
+
- lib/ocr-file/ocr_engines/cloud_vision.rb
|
124
|
+
- lib/ocr-file/ocr_engines/tesseract.rb
|
125
|
+
- lib/ocr-file/version.rb
|
126
|
+
- ocr-file.gemspec
|
127
|
+
homepage: https://github.com/TRex22/ocr-file
|
128
|
+
licenses:
|
129
|
+
- MIT
|
130
|
+
metadata: {}
|
131
|
+
post_install_message:
|
132
|
+
rdoc_options: []
|
133
|
+
require_paths:
|
134
|
+
- lib
|
135
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
136
|
+
requirements:
|
137
|
+
- - ">="
|
138
|
+
- !ruby/object:Gem::Version
|
139
|
+
version: '0'
|
140
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - ">="
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '0'
|
145
|
+
requirements: []
|
146
|
+
rubygems_version: 3.3.4
|
147
|
+
signing_key:
|
148
|
+
specification_version: 4
|
149
|
+
summary: A tool to combine PDF tools, OCR tools and image processing into a single
|
150
|
+
interface as both a CLI and a library.
|
151
|
+
test_files: []
|