ocr-file 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +21 -5
- data/bin/ocr-file +1 -1
- data/lib/ocr-file/cli.rb +37 -1
- data/lib/ocr-file/document.rb +40 -17
- data/lib/ocr-file/image_engines/image_magick.rb +66 -7
- data/lib/ocr-file/image_engines/pdftoppm.rb +1 -1
- data/lib/ocr-file/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f188bc0b29f4232b379e5e15d924c57a64a1758f04d8e168d2a44a744d20d1af
|
4
|
+
data.tar.gz: 5b54d844f01a5a5249572dd0abc270ae1fb37ff0070df9ad47eb84cf5f233fe7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c51ab724a77e8b22568dc0c7cefcf3ba28407f7050976d6900824954221d4f04e677b31b58ae644c87752e60024e1667194eda8b00c89dfab30f9a81d53ba1d5
|
7
|
+
data.tar.gz: 9b521be6e75808899398e77cf0c0b9dee842350a5c81c0ba513ad56125725607906c8c19e6b493201750ba331521db4ba247723a1c09d82dfb61e8caec857428
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -42,9 +42,8 @@ You will need to install `tesseract` with your desired language on your system,
|
|
42
42
|
type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
|
43
43
|
ocr_engine: 'tesseract', # 'cloud-vision'
|
44
44
|
# Image Pre-Processing
|
45
|
-
|
46
|
-
effects: ['bw', 'norm'
|
47
|
-
threshold: 0.25,
|
45
|
+
image_preprocess: true,
|
46
|
+
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'], # Applies effects as listed. 'norm' is also available
|
48
47
|
# PDF to Image Processing
|
49
48
|
optimise_pdf: true,
|
50
49
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
@@ -84,7 +83,21 @@ You will need to install `tesseract` with your desired language on your system,
|
|
84
83
|
### Notes / Tips
|
85
84
|
Set `extract_pdf_images` to `false` for higher quality OCR. However this will consume more temporary space per PDF page and also be considerably slower.
|
86
85
|
|
87
|
-
Image pre-processing is
|
86
|
+
Image pre-processing only thresholds the image (`bw`), normalises the colour space, removes speckles and tries to straighten the image. It makes the end result black and white, but gives far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary.
|
87
|
+
|
88
|
+
### Simple CLI
|
89
|
+
Once installed, you can use `ocr-file` as a CLI. It currently supports a reduced set of options, which are subject to change in future versions.
|
90
|
+
|
91
|
+
```
|
92
|
+
# Basic Usage with console output
|
93
|
+
ocr-file input_file_path output_folder_path
|
94
|
+
|
95
|
+
# Output to PDF
|
96
|
+
ocr-file input_file_path output_folder_path pdf
|
97
|
+
|
98
|
+
# Output to TXT
|
99
|
+
ocr-file input_file_path output_folder_path txt
|
100
|
+
```
|
88
101
|
|
89
102
|
## Development
|
90
103
|
|
@@ -94,7 +107,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
94
107
|
|
95
108
|
### TODOs
|
96
109
|
- input validation
|
97
|
-
- CLI
|
110
|
+
- Better CLI
|
98
111
|
- image processing
|
99
112
|
- password
|
100
113
|
- Base64 encoding
|
@@ -102,6 +115,9 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
102
115
|
- Tests
|
103
116
|
- Configurable temp folder cleanup
|
104
117
|
- Improve console output
|
118
|
+
- Fix spaces in file names
|
119
|
+
- Better verbosity
|
120
|
+
- Timing
|
105
121
|
|
106
122
|
### Tests
|
107
123
|
To run tests execute:
|
data/bin/ocr-file
CHANGED
data/lib/ocr-file/cli.rb
CHANGED
@@ -1,5 +1,41 @@
|
|
1
1
|
module OcrFile
|
2
|
-
|
2
|
+
class Cli
|
3
|
+
attr_reader :args
|
3
4
|
|
5
|
+
def initialize(args)
|
6
|
+
@args = args
|
7
|
+
end
|
8
|
+
|
9
|
+
def valid?
|
10
|
+
return true if args.size == 2 || args.size == 3
|
11
|
+
false
|
12
|
+
end
|
13
|
+
|
14
|
+
def invalid?
|
15
|
+
!valid?
|
16
|
+
end
|
17
|
+
|
18
|
+
def call
|
19
|
+
# TODO: Use ConsoleStyle::Functions
|
20
|
+
# TODO: Heading and better CLI interface
|
21
|
+
# Simple cli for now
|
22
|
+
puts "OCR Tool © Jason Chalom 2022, Version: #{OcrFile::VERSION}"
|
23
|
+
abort "File path, Save Folder Paths, and output type (pdf, txt) are required!" if invalid?
|
24
|
+
|
25
|
+
# Using default config for now
|
26
|
+
original_file_path = args[0]
|
27
|
+
save_file_path = args[1]
|
28
|
+
output_type = args[2]
|
29
|
+
|
30
|
+
document = OcrFile::Document.new(original_file_path: original_file_path, save_file_path: save_file_path)
|
31
|
+
|
32
|
+
if output_type.to_s.downcase.include?('pdf')
|
33
|
+
document.to_pdf
|
34
|
+
elsif output_type.to_s.downcase.include?('txt') || output_type.to_s.downcase.include?('text')
|
35
|
+
document.to_text
|
36
|
+
else # Display in console
|
37
|
+
puts document.to_s
|
38
|
+
end
|
39
|
+
end
|
4
40
|
end
|
5
41
|
end
|
data/lib/ocr-file/document.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
module OcrFile
|
2
2
|
class Document
|
3
|
+
# TODO: Skewness / text orientation detection
|
4
|
+
# TODO: Better handwriting analysis
|
5
|
+
|
3
6
|
ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
|
4
7
|
PAGE_BREAK = "\n\r\n" # TODO: Make configurable
|
5
8
|
DEFAULT_CONFIG = {
|
@@ -18,9 +21,8 @@ module OcrFile
|
|
18
21
|
type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
|
19
22
|
ocr_engine: 'tesseract', # 'cloud-vision'
|
20
23
|
# Image Pre-Processing
|
21
|
-
|
22
|
-
effects: ['
|
23
|
-
threshold: 0.25,
|
24
|
+
image_preprocess: true,
|
25
|
+
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'],
|
24
26
|
# PDF to Image Processing
|
25
27
|
optimise_pdf: true,
|
26
28
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
@@ -52,12 +54,12 @@ module OcrFile
|
|
52
54
|
end
|
53
55
|
|
54
56
|
def pdf?
|
55
|
-
@original_file_path.include?('.pdf')
|
57
|
+
@original_file_path.downcase.include?('.pdf')
|
56
58
|
end
|
57
59
|
|
58
60
|
def image?
|
59
61
|
return false if pdf?
|
60
|
-
ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.include?(".#{type}")}
|
62
|
+
ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
|
61
63
|
end
|
62
64
|
|
63
65
|
# Treat anything which isn't a PDF or image as text
|
@@ -65,6 +67,7 @@ module OcrFile
|
|
65
67
|
!pdf? && !image?
|
66
68
|
end
|
67
69
|
|
70
|
+
# Trigger OCR pipeline
|
68
71
|
def to_pdf
|
69
72
|
if pdf?
|
70
73
|
create_temp_folder
|
@@ -73,15 +76,13 @@ module OcrFile
|
|
73
76
|
pdfs_to_merge = []
|
74
77
|
|
75
78
|
image_paths.each do |image_path|
|
76
|
-
pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config)
|
79
|
+
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
77
80
|
end
|
78
81
|
|
79
82
|
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
|
80
83
|
|
81
84
|
OcrFile::ImageEngines::PdfEngine
|
82
85
|
.save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
83
|
-
|
84
|
-
close
|
85
86
|
elsif text?
|
86
87
|
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
87
88
|
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
@@ -91,6 +92,8 @@ module OcrFile
|
|
91
92
|
else # is an image
|
92
93
|
ocr_image_to_pdf
|
93
94
|
end
|
95
|
+
|
96
|
+
close
|
94
97
|
end
|
95
98
|
|
96
99
|
def to_text
|
@@ -99,16 +102,16 @@ module OcrFile
|
|
99
102
|
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
100
103
|
|
101
104
|
image_paths.each do |image_path|
|
102
|
-
text = @ocr_engine.ocr_to_text(image_path, options: @config)
|
105
|
+
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
|
103
106
|
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
|
104
107
|
end
|
105
|
-
|
106
|
-
close
|
107
108
|
elsif text?
|
108
109
|
::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
109
110
|
else # is an image
|
110
111
|
ocr_image_to_text(save: true)
|
111
112
|
end
|
113
|
+
|
114
|
+
close
|
112
115
|
end
|
113
116
|
|
114
117
|
def to_s
|
@@ -119,7 +122,7 @@ module OcrFile
|
|
119
122
|
text = ''
|
120
123
|
|
121
124
|
image_paths.each do |image_path|
|
122
|
-
text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
|
125
|
+
text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
|
123
126
|
end
|
124
127
|
|
125
128
|
close
|
@@ -127,7 +130,10 @@ module OcrFile
|
|
127
130
|
elsif text?
|
128
131
|
::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
129
132
|
else # is an image
|
130
|
-
ocr_image_to_text(save: false)
|
133
|
+
text = ocr_image_to_text(save: false)
|
134
|
+
|
135
|
+
close
|
136
|
+
text
|
131
137
|
end
|
132
138
|
end
|
133
139
|
|
@@ -157,19 +163,36 @@ module OcrFile
|
|
157
163
|
end
|
158
164
|
|
159
165
|
def create_temp_folder
|
160
|
-
|
161
|
-
|
166
|
+
date = Time.now.to_s.split(' ').first
|
167
|
+
|
168
|
+
@temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ')
|
162
169
|
::OcrFile::FileHelpers.make_directory(@temp_folder_path)
|
163
170
|
end
|
164
171
|
|
172
|
+
def process_image(path)
|
173
|
+
return path unless @config[:image_preprocess]
|
174
|
+
|
175
|
+
create_temp_folder
|
176
|
+
save_file_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}"
|
177
|
+
|
178
|
+
image_processor = OcrFile::ImageEngines::ImageMagick.new(
|
179
|
+
image_path: path,
|
180
|
+
temp_path: @temp_folder_path,
|
181
|
+
save_file_path: save_file_path,
|
182
|
+
config: @config
|
183
|
+
)
|
184
|
+
|
185
|
+
image_processor.convert!
|
186
|
+
end
|
187
|
+
|
165
188
|
def ocr_image_to_pdf
|
166
|
-
pdf_document = @ocr_engine.ocr_to_pdf(@original_file_path, options: @config)
|
189
|
+
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
|
167
190
|
OcrFile::ImageEngines::PdfEngine
|
168
191
|
.save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
169
192
|
end
|
170
193
|
|
171
194
|
def ocr_image_to_text(save: true)
|
172
|
-
text = @ocr_engine.ocr_to_text(@original_file_path, options: @config)
|
195
|
+
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
|
173
196
|
|
174
197
|
if save
|
175
198
|
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
|
@@ -1,14 +1,73 @@
|
|
1
1
|
module OcrFile
|
2
2
|
module ImageEngines
|
3
|
-
|
4
|
-
extend self
|
5
|
-
|
3
|
+
class ImageMagick
|
6
4
|
# TODO:
|
7
|
-
# B/W
|
8
|
-
# Contrast
|
9
|
-
# Image Norm
|
10
|
-
# Threshold
|
11
5
|
# Conversion of image types
|
6
|
+
# Rotation and detection of skew
|
7
|
+
|
8
|
+
attr_reader :image_path, :image, :temp_path, :save_file_path, :config
|
9
|
+
|
10
|
+
def initialize(image_path:, temp_path:, save_file_path:, config:)
|
11
|
+
@image_path = image_path
|
12
|
+
@config = config
|
13
|
+
@save_file_path = save_file_path
|
14
|
+
|
15
|
+
@temp_path = temp_path
|
16
|
+
|
17
|
+
# Will be available in the next version of MiniMagick > 4.11.0
|
18
|
+
# https://github.com/minimagick/minimagick/pull/541
|
19
|
+
# MiniMagick.configure do |config|
|
20
|
+
# # cli_version graphicsmagick? imagemagick7? imagemagick? version
|
21
|
+
# config.tmpdir = File.join(Dir.tmpdir, @temp_path)
|
22
|
+
# end
|
23
|
+
|
24
|
+
@image = MiniMagick::Image.open(image_path)
|
25
|
+
end
|
26
|
+
|
27
|
+
def convert!
|
28
|
+
return @image_path unless @config[:image_preprocess]
|
29
|
+
|
30
|
+
@config[:effects].each do |effect|
|
31
|
+
self.send(effect.to_sym)
|
32
|
+
end
|
33
|
+
|
34
|
+
save!
|
35
|
+
end
|
36
|
+
|
37
|
+
def save!
|
38
|
+
image.write(@save_file_path)
|
39
|
+
@save_file_path
|
40
|
+
end
|
41
|
+
|
42
|
+
# Effects
|
43
|
+
# http://www.imagemagick.org/script/command-line-options.php
|
44
|
+
def bw
|
45
|
+
@image.alpha('off')
|
46
|
+
@image.auto_threshold("otsu")
|
47
|
+
end
|
48
|
+
|
49
|
+
def enhance
|
50
|
+
@image.enhance
|
51
|
+
end
|
52
|
+
|
53
|
+
def norm
|
54
|
+
@image.equalize
|
55
|
+
end
|
56
|
+
|
57
|
+
# Most likely not going to be configurable because
|
58
|
+
# these are aggressive parameters used to optimise OCR results
|
59
|
+
# and not the final results of the PDFs
|
60
|
+
def sharpen
|
61
|
+
@image.sharpen('0x4') # radiusXsigma
|
62
|
+
end
|
63
|
+
|
64
|
+
def deskew
|
65
|
+
@image.deskew('40%') # threshold recommended in the docs
|
66
|
+
end
|
67
|
+
|
68
|
+
def despeckle
|
69
|
+
@image.despeckle
|
70
|
+
end
|
12
71
|
end
|
13
72
|
end
|
14
73
|
end
|
@@ -13,7 +13,7 @@ module OcrFile
|
|
13
13
|
print 'Generating screenshots of each PDF page ... '
|
14
14
|
|
15
15
|
if filetype == 'jpg'
|
16
|
-
`pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
|
16
|
+
`pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} "#{pdf_path}" "#{save_path}/#{filename}"`
|
17
17
|
else
|
18
18
|
`pdftoppm -#{filetype} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
|
19
19
|
end
|
data/lib/ocr-file/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-06-
|
11
|
+
date: 2022-06-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: console-style
|