ocr-file 0.0.1 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +24 -6
- data/bin/ocr-file +1 -1
- data/lib/ocr-file/cli.rb +37 -1
- data/lib/ocr-file/document.rb +137 -59
- data/lib/ocr-file/image_engines/image_magick.rb +73 -7
- data/lib/ocr-file/image_engines/pdftoppm.rb +1 -1
- data/lib/ocr-file/text_engines/result_processor.rb +34 -0
- data/lib/ocr-file/version.rb +1 -1
- data/lib/ocr-file.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
|
4
|
+
data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
|
7
|
+
data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -42,15 +42,16 @@ You will need to install `tesseract` with your desired language on your system,
|
|
42
42
|
type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
|
43
43
|
ocr_engine: 'tesseract', # 'cloud-vision'
|
44
44
|
# Image Pre-Processing
|
45
|
-
|
46
|
-
effects: ['bw', 'norm'
|
47
|
-
|
45
|
+
image_preprocess: true,
|
46
|
+
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
|
47
|
+
automatic_reprocess: true, # Will possibly do double + the operations but can produce better results automatically
|
48
48
|
# PDF to Image Processing
|
49
49
|
optimise_pdf: true,
|
50
50
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
51
51
|
temp_filename_prefix: 'image',
|
52
52
|
# Console Output
|
53
53
|
verbose: true,
|
54
|
+
timing: true,
|
54
55
|
}
|
55
56
|
|
56
57
|
doc = OcrFile::Document.new(
|
@@ -84,7 +85,23 @@ You will need to install `tesseract` with your desired language on your system,
|
|
84
85
|
### Notes / Tips
|
85
86
|
Set `extract_pdf_images` to `false` for higher quality OCR. However this will consume more temporary space per PDF page and also be considerably slower.
|
86
87
|
|
87
|
-
Image pre-processing is
|
88
|
+
Image pre-processing is limited to thresholding (`bw`), normalising the colour space, removing speckles, removing shadows and trying to straighten the image. It will make the end result black and white but gives far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
|
89
|
+
|
90
|
+
`automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
|
91
|
+
|
92
|
+
### Simple CLI
|
93
|
+
Once installed, you can use `ocr-file` as a CLI. It currently supports a reduced set of options, which are subject to change in future versions.
|
94
|
+
|
95
|
+
```
|
96
|
+
# Basic Usage with console output
|
97
|
+
ocr-file input_file_path output_folder_path
|
98
|
+
|
99
|
+
# Output to PDF
|
100
|
+
ocr-file input_file_path output_folder_path pdf
|
101
|
+
|
102
|
+
# Output to TXT
|
103
|
+
ocr-file input_file_path output_folder_path txt
|
104
|
+
```
|
88
105
|
|
89
106
|
## Development
|
90
107
|
|
@@ -94,14 +111,15 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
94
111
|
|
95
112
|
### TODOs
|
96
113
|
- input validation
|
97
|
-
- CLI
|
98
|
-
- image processing
|
114
|
+
- Better CLI
|
99
115
|
- password
|
100
116
|
- Base64 encoding
|
101
117
|
- requirements checking (installed dependencies etc ...)
|
102
118
|
- Tests
|
103
119
|
- Configurable temp folder cleanup
|
104
120
|
- Improve console output
|
121
|
+
- Fix spaces in file names
|
122
|
+
- Better verbosity
|
105
123
|
|
106
124
|
### Tests
|
107
125
|
To run tests execute:
|
data/bin/ocr-file
CHANGED
data/lib/ocr-file/cli.rb
CHANGED
@@ -1,5 +1,41 @@
|
|
1
1
|
module OcrFile
  # Minimal command-line front end. Takes an input file path, an output folder
  # path and an optional output type, and drives OcrFile::Document with its
  # default configuration.
  class Cli
    # Shown when the argument count is wrong. The output type is optional:
    # without it the recognised text is printed to the console.
    USAGE = 'File path and save folder path are required! Output type (pdf, txt) is optional.'

    attr_reader :args

    # @param args [Array<String>] input file path, save folder path and an
    #   optional output type; nil is treated as an empty argument list
    def initialize(args)
      @args = Array(args)
    end

    # @return [Boolean] true when two or three arguments were supplied
    def valid?
      args.size.between?(2, 3)
    end

    # @return [Boolean] the negation of #valid?
    def invalid?
      !valid?
    end

    # Prints the banner, aborts with USAGE when the argument count is invalid,
    # then produces the requested output. The output type is matched by
    # substring ('pdf', 'txt' or 'text', case-insensitive); anything else
    # prints the document text to the console.
    def call
      # TODO: Use ConsoleStyle::Functions
      # TODO: Heading and better CLI interface
      # Simple cli for now
      puts "OCR Tool © Jason Chalom 2022, Version: #{OcrFile::VERSION}"
      abort USAGE if invalid?

      # Using default config for now
      original_file_path, save_file_path, output_type = args
      document = OcrFile::Document.new(original_file_path: original_file_path, save_file_path: save_file_path)

      requested_type = output_type.to_s.downcase

      if requested_type.include?('pdf')
        document.to_pdf
      elsif requested_type.include?('txt') || requested_type.include?('text')
        document.to_text
      else # Display in console
        puts document.to_s
      end
    end
  end
end
|
data/lib/ocr-file/document.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
module OcrFile
|
2
2
|
class Document
|
3
|
+
# TODO: Skewness / text orientation detection
|
4
|
+
# TODO: Better handwriting analysis
|
5
|
+
|
3
6
|
ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
|
4
7
|
PAGE_BREAK = "\n\r\n" # TODO: Make configurable
|
8
|
+
EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
|
5
9
|
DEFAULT_CONFIG = {
|
6
10
|
# Images from PDF
|
7
11
|
filetype: 'png',
|
@@ -18,15 +22,16 @@ module OcrFile
|
|
18
22
|
type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
|
19
23
|
ocr_engine: 'tesseract', # 'cloud-vision'
|
20
24
|
# Image Pre-Processing
|
21
|
-
|
22
|
-
effects: ['
|
23
|
-
|
25
|
+
image_preprocess: true,
|
26
|
+
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
|
27
|
+
automatic_reprocess: true,
|
24
28
|
# PDF to Image Processing
|
25
29
|
optimise_pdf: true,
|
26
30
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
27
31
|
temp_filename_prefix: 'image',
|
28
32
|
# Console Output
|
29
33
|
verbose: true,
|
34
|
+
timing: true,
|
30
35
|
}
|
31
36
|
|
32
37
|
attr_reader :original_file_path,
|
@@ -34,7 +39,9 @@ module OcrFile
|
|
34
39
|
:save_file_path,
|
35
40
|
:final_save_file,
|
36
41
|
:config,
|
37
|
-
:ocr_engine
|
42
|
+
:ocr_engine,
|
43
|
+
:start_time,
|
44
|
+
:end_time
|
38
45
|
|
39
46
|
# save_file_path will also generate a tmp path for tmp files. Expected folder path
|
40
47
|
# TODO: Add in more input validation
|
@@ -52,12 +59,12 @@ module OcrFile
|
|
52
59
|
end
|
53
60
|
|
54
61
|
# True when the original file's extension is ".pdf" (case-insensitive).
# Only the extension of the final path component is checked, so a ".pdf"
# appearing in a directory name or in the middle of a file name does not
# make an image or text file look like a PDF.
def pdf?
  File.extname(@original_file_path.to_s).downcase == '.pdf'
end
|
57
64
|
|
58
65
|
# True when the original file is not a PDF and its extension
# (case-insensitive) is one of ACCEPTED_IMAGE_TYPES. Only the extension of
# the final path component is compared, so an image extension elsewhere in
# the path does not count.
def image?
  return false if pdf?

  extension = File.extname(@original_file_path.to_s).downcase.sub(/\A\./, '')
  ACCEPTED_IMAGE_TYPES.include?(extension)
end
|
62
69
|
|
63
70
|
# Treat anything which isn't a PDF or an image as text
|
@@ -65,74 +72,52 @@ module OcrFile
|
|
65
72
|
!pdf? && !image?
|
66
73
|
end
|
67
74
|
|
75
|
+
# Trigger OCR pipeline
|
68
76
|
def to_pdf
|
69
|
-
|
70
|
-
|
71
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
72
|
-
|
73
|
-
pdfs_to_merge = []
|
74
|
-
|
75
|
-
image_paths.each do |image_path|
|
76
|
-
pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config)
|
77
|
-
end
|
78
|
-
|
79
|
-
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
|
77
|
+
@start_time = Time.now
|
78
|
+
find_best_image_processing if config[:automatic_reprocess] && !text?
|
80
79
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
close
|
80
|
+
if pdf?
|
81
|
+
ocr_pdf_to_searchable_pdf
|
85
82
|
elsif text?
|
86
|
-
|
87
|
-
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
88
|
-
|
89
|
-
OcrFile::ImageEngines::PdfEngine
|
90
|
-
.save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
83
|
+
text_to_pdf
|
91
84
|
else # is an image
|
92
85
|
ocr_image_to_pdf
|
93
86
|
end
|
87
|
+
|
88
|
+
close
|
89
|
+
|
90
|
+
@end_time = Time.now
|
91
|
+
print_time
|
94
92
|
end
|
95
93
|
|
96
94
|
def to_text
|
97
|
-
|
98
|
-
|
99
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
95
|
+
@start_time = Time.now
|
96
|
+
return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
|
100
97
|
|
101
|
-
|
102
|
-
|
103
|
-
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
|
104
|
-
end
|
98
|
+
find_best_image_processing(save: true)
|
99
|
+
close
|
105
100
|
|
106
|
-
|
107
|
-
|
108
|
-
::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
109
|
-
else # is an image
|
110
|
-
ocr_image_to_text(save: true)
|
111
|
-
end
|
101
|
+
@end_time = Time.now
|
102
|
+
print_time
|
112
103
|
end
|
113
104
|
|
114
105
|
def to_s
|
115
|
-
|
116
|
-
|
117
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
106
|
+
@start_time = Time.now
|
107
|
+
return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
|
118
108
|
|
119
|
-
|
109
|
+
text = find_best_image_processing(save: false)
|
120
110
|
|
121
|
-
|
122
|
-
text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
|
123
|
-
end
|
111
|
+
close
|
124
112
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
else # is an image
|
130
|
-
ocr_image_to_text(save: false)
|
131
|
-
end
|
113
|
+
@end_time = Time.now
|
114
|
+
print_time
|
115
|
+
|
116
|
+
text
|
132
117
|
end
|
133
118
|
|
134
119
|
def close
|
135
|
-
::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
|
120
|
+
# ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
|
136
121
|
end
|
137
122
|
|
138
123
|
private
|
@@ -157,19 +142,80 @@ module OcrFile
|
|
157
142
|
end
|
158
143
|
|
159
144
|
def create_temp_folder
|
160
|
-
|
161
|
-
|
145
|
+
date = Time.now.to_s.split(' ').first
|
146
|
+
|
147
|
+
@temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ')
|
162
148
|
::OcrFile::FileHelpers.make_directory(@temp_folder_path)
|
163
149
|
end
|
164
150
|
|
151
|
+
# Runs the configured image pre-processing on one image.
# Returns +path+ untouched when config[:image_preprocess] is falsy; otherwise
# makes sure the temp folder exists and returns whatever
# OcrFile::ImageEngines::ImageMagick#convert! returns for a target file inside
# that folder, named after the current epoch second and the configured filetype.
def process_image(path)
  return path unless @config[:image_preprocess]

  create_temp_folder
  processed_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}"

  OcrFile::ImageEngines::ImageMagick
    .new(image_path: path, temp_path: @temp_folder_path, save_file_path: processed_path, config: @config)
    .convert!
end
|
166
|
+
|
167
|
+
# OCRs every page image extracted from the original PDF, merges the per-page
# results with PdfEngine.merge and saves the merged document as
# "<final_save_file>.pdf", passing config[:optimise_pdf] as the optimise flag.
def ocr_pdf_to_searchable_pdf
  create_temp_folder

  page_pdfs = extract_image_paths_from_pdf(@original_file_path).map do |page_image|
    @ocr_engine.ocr_to_pdf(process_image(page_image), options: @config)
  end

  combined_pdf = OcrFile::ImageEngines::PdfEngine.merge(page_pdfs)
  target = "#{@final_save_file}.pdf"

  OcrFile::ImageEngines::PdfEngine.save_pdf(combined_pdf, target, optimise: @config[:optimise_pdf])
end
|
182
|
+
|
183
|
+
# Reads the original file as text, builds a PDF from it with
# PdfEngine.pdf_from_text and saves it as "<final_save_file>.pdf".
def text_to_pdf
  contents = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  generated_pdf = OcrFile::ImageEngines::PdfEngine.pdf_from_text(contents, @config)
  target = "#{@final_save_file}.pdf"

  OcrFile::ImageEngines::PdfEngine.save_pdf(generated_pdf, target, optimise: @config[:optimise_pdf])
end
|
190
|
+
|
165
191
|
def ocr_image_to_pdf
|
166
|
-
|
192
|
+
find_best_image_processing if config[:automatic_reprocess]
|
193
|
+
|
194
|
+
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
|
167
195
|
OcrFile::ImageEngines::PdfEngine
|
168
196
|
.save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
169
197
|
end
|
170
198
|
|
171
|
-
def
|
172
|
-
|
199
|
+
# OCRs every page image extracted from the original PDF and joins the page
# texts, each one preceded by PAGE_BREAK.
#
# When +save+ is truthy the combined text plus a trailing PAGE_BREAK is
# appended to "<final_save_file>.txt" and the result of that append call is
# returned; otherwise the combined text itself is returned.
def ocr_pdf_to_text(save:)
  create_temp_folder

  combined = extract_image_paths_from_pdf(@original_file_path).reduce('') do |so_far, page_image|
    page_text = @ocr_engine.ocr_to_text(process_image(page_image), options: @config)
    "#{so_far}#{PAGE_BREAK}#{page_text}"
  end

  return combined unless save

  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{combined}#{PAGE_BREAK}")
end
|
215
|
+
|
216
|
+
def ocr_image_to_text(save:)
|
217
|
+
create_temp_folder
|
218
|
+
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
|
173
219
|
|
174
220
|
if save
|
175
221
|
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
|
@@ -178,6 +224,38 @@ module OcrFile
|
|
178
224
|
end
|
179
225
|
end
|
180
226
|
|
227
|
+
# Runs text OCR on the original file, using the PDF pipeline for PDFs and
# the single-image pipeline for everything else.
#
# The PDF call must live in the branch body, not in the condition: as part of
# the condition its result was discarded (the method returned nil for PDFs)
# and a falsy result sent the PDF down the image path as well.
#
# @param save [Boolean] forwarded unchanged to the selected helper
# @return whatever the selected helper returns
def ocr_file_to_text(save:)
  if pdf?
    ocr_pdf_to_text(save: save)
  else # is an image
    ocr_image_to_text(save: save)
  end
end
|
234
|
+
|
235
|
+
# Runs text OCR, optionally retrying with fewer pre-processing effects until
# the result looks like real words.
#
# With config[:automatic_reprocess] disabled this is a single
# ocr_file_to_text call. Otherwise the effects listed in EFFECTS_TO_REMOVE
# that are present in config[:effects] are removed one at a time (after a
# first pass with the effects unchanged) until ResultProcessor#valid_words?
# accepts the text. The reduced effect list is left in config[:effects], so
# OCR calls made afterwards use the selected effects.
#
# @param save [Boolean] when true, the final result is written by a last
#   ocr_file_to_text(save: true) pass; defaults to false so callers that only
#   want the effects tuned can call this without arguments
# @return the result of ocr_file_to_text for the final pass (the text itself
#   when +save+ is false)
def find_best_image_processing(save: false)
  # Without the early return the retry loop ran anyway and a saving caller
  # got its output written twice.
  return ocr_file_to_text(save: save) unless config[:automatic_reprocess]

  text = ''
  effects_to_test = [''] + (EFFECTS_TO_REMOVE & config[:effects])

  effects_to_test.each do |effect|
    config[:effects] = config[:effects] - [effect]

    text = ocr_file_to_text(save: false)
    break if OcrFile::TextEngines::ResultProcessor.new(text.to_s).valid_words?
  end

  # Adds in an extra OCR pass when saving, which is unfortunately inefficient
  save ? ocr_file_to_text(save: save) : text
end
|
254
|
+
|
255
|
+
# Prints the elapsed time between start_time and end_time when
# config[:timing] is enabled; does nothing otherwise.
def print_time
  return unless config[:timing]

  elapsed = end_time - start_time
  puts "Total Time: #{elapsed} secs.\n\n"
end
|
258
|
+
|
181
259
|
def find_ocr_engine(engine_id)
|
182
260
|
ocr_engine_constants
|
183
261
|
.map { |c| ocr_module(c) }
|
@@ -1,14 +1,80 @@
|
|
1
1
|
module OcrFile
  module ImageEngines
    # Applies the pre-processing effects listed in config[:effects] to a single
    # image opened through MiniMagick and writes the result to save_file_path.
    class ImageMagick
      # TODO:
      # Conversion of image types
      # Rotation and detection of skew

      # Effect names accepted in config[:effects]. Each one maps to the public
      # method of the same name below; anything else is rejected so that a
      # config value can never invoke an arbitrary method on this object.
      SUPPORTED_EFFECTS = %w[bw enhance norm sharpen remove_shadow deskew despeckle].freeze

      attr_reader :image_path, :image, :temp_path, :save_file_path, :config

      # @param image_path [String] image to open
      # @param temp_path [String] temp folder (stored; only referenced by the
      #   commented-out MiniMagick configuration below)
      # @param save_file_path [String] where #save! writes the processed image
      # @param config [Hash] reads :image_preprocess and :effects
      def initialize(image_path:, temp_path:, save_file_path:, config:)
        @image_path = image_path
        @config = config
        @save_file_path = save_file_path

        @temp_path = temp_path

        # Will be available in the next version of MiniMagick > 4.11.0
        # https://github.com/minimagick/minimagick/pull/541
        # MiniMagick.configure do |config|
        #   # cli_version graphicsmagick? imagemagick7? imagemagick? version
        #   config.tmpdir = File.join(Dir.tmpdir, @temp_path)
        # end

        @image = MiniMagick::Image.open(image_path)
      end

      # Applies every configured effect in order and saves the image.
      #
      # @return [String] the original image_path when pre-processing is
      #   disabled, otherwise save_file_path
      # @raise [ArgumentError] when config[:effects] names an unsupported effect
      def convert!
        return @image_path unless @config[:image_preprocess]

        Array(@config[:effects]).each { |effect| apply_effect(effect) }

        save!
      end

      # Writes the image to save_file_path and returns that path.
      def save!
        image.write(@save_file_path)
        @save_file_path
      end

      # Effects
      # http://www.imagemagick.org/script/command-line-options.php
      def bw
        @image.alpha('off')
        @image.auto_threshold('otsu')
      end

      def enhance
        @image.enhance
      end

      def norm
        @image.equalize
      end

      # Most likely not going to be configurable because
      # these are aggressive parameters used to optimise OCR results
      # and not the final results of the PDFs
      def sharpen
        @image.sharpen('0x4') # radiusXsigma
      end

      # https://github.com/ImageMagick/ImageMagick/discussions/4145
      def remove_shadow
        @image.negate
        @image.lat('20x20+10%')
        @image.negate
      end

      def deskew
        @image.deskew('40%') # threshold recommended in the docs
      end

      def despeckle
        @image.despeckle
      end

      private

      # Dispatches one effect name to its method after checking it against
      # SUPPORTED_EFFECTS.
      def apply_effect(effect)
        effect_name = effect.to_s

        unless SUPPORTED_EFFECTS.include?(effect_name)
          raise ArgumentError, "Unsupported image effect: #{effect.inspect}"
        end

        public_send(effect_name)
      end
    end
  end
end
|
@@ -13,7 +13,7 @@ module OcrFile
|
|
13
13
|
print 'Generating screenshots of each PDF page ... '
|
14
14
|
|
15
15
|
if filetype == 'jpg'
|
16
|
-
`pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
|
16
|
+
`pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} "#{pdf_path}" "#{save_path}/#{filename}"`
|
17
17
|
else
|
18
18
|
`pdftoppm -#{filetype} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
|
19
19
|
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module OcrFile
  module TextEngines
    # Scores OCR output with a simple average-word-length heuristic.
    class ResultProcessor
      MINIMUM_WORD_LENGTH = 3

      attr_reader :text, :clear_text

      # @param text [String, nil] raw OCR output; nil is treated as empty
      def initialize(text)
        @text = text
        @clear_text = remove_lines
      end

      # This is a very naive way of determining if we should re-do OCR with
      # shifted options
      #
      # @return [Boolean] true when the average word length is at least
      #   MINIMUM_WORD_LENGTH; false for empty or whitespace-only text
      def valid_words?
        word_size_average >= MINIMUM_WORD_LENGTH
      end

      # @return [Integer] number of whitespace-separated words
      def word_count
        @_word_count ||= words.size
      end

      # @return [Integer] average word length (integer division), or 0 when
      #   there are no words, so blank OCR output does not raise ZeroDivisionError
      def word_size_average
        @_word_size_average ||= word_count.zero? ? 0 : words.sum(&:size) / word_count
      end

      private

      def words
        @_words ||= clear_text.split(' ')
      end

      # Flattens the text onto one line: every run of whitespace (including
      # line breaks) becomes a single space. Word boundaries must survive
      # here, otherwise the whole text counts as one long "word" and the
      # heuristic can never reject a result.
      def remove_lines
        text.to_s.gsub(/\s+/, ' ').strip
      end
    end
  end
end
|
data/lib/ocr-file/version.rb
CHANGED
data/lib/ocr-file.rb
CHANGED
@@ -10,6 +10,7 @@ require 'ocr-file/image_engines/image_magick'
|
|
10
10
|
require 'ocr-file/image_engines/pdftoppm'
|
11
11
|
require 'ocr-file/ocr_engines/tesseract'
|
12
12
|
require 'ocr-file/ocr_engines/cloud_vision'
|
13
|
+
require 'ocr-file/text_engines/result_processor'
|
13
14
|
require 'ocr-file/file_helpers'
|
14
15
|
require 'ocr-file/document'
|
15
16
|
require 'ocr-file/cli'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-06-
|
11
|
+
date: 2022-06-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: console-style
|
@@ -122,6 +122,7 @@ files:
|
|
122
122
|
- lib/ocr-file/image_engines/pdftoppm.rb
|
123
123
|
- lib/ocr-file/ocr_engines/cloud_vision.rb
|
124
124
|
- lib/ocr-file/ocr_engines/tesseract.rb
|
125
|
+
- lib/ocr-file/text_engines/result_processor.rb
|
125
126
|
- lib/ocr-file/version.rb
|
126
127
|
- ocr-file.gemspec
|
127
128
|
homepage: https://github.com/TRex22/ocr-file
|