ocr-file 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +4 -2
- data/lib/ocr-file/document.rb +105 -50
- data/lib/ocr-file/text_engines/result_processor.rb +34 -0
- data/lib/ocr-file/version.rb +1 -1
- data/lib/ocr-file.rb +1 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
|
4
|
+
data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
|
7
|
+
data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -44,12 +44,14 @@ You will need to install `tesseract` with your desired language on your system,
|
|
44
44
|
# Image Pre-Processing
|
45
45
|
image_preprocess: true,
|
46
46
|
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
|
47
|
+
automatic_reprocess: true, # May more than double the number of operations, but can automatically produce better results
|
47
48
|
# PDF to Image Processing
|
48
49
|
optimise_pdf: true,
|
49
50
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
50
51
|
temp_filename_prefix: 'image',
|
51
52
|
# Console Output
|
52
53
|
verbose: true,
|
54
|
+
timing: true,
|
53
55
|
}
|
54
56
|
|
55
57
|
doc = OcrFile::Document.new(
|
@@ -85,6 +87,8 @@ Set `extract_pdf_images` to `false` for higher quality OCR. However this will co
|
|
85
87
|
|
86
88
|
Image pre-processing only thresholds (bw), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. Will make the end result Black and White but have far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
|
87
89
|
|
90
|
+
`automatic_reprocess` is much slower, as it has to redo operations per image (in some cases), but it will select the best result for each page.
|
91
|
+
|
88
92
|
### Simple CLI
|
89
93
|
Once installed, you can use `ocr-file` as a CLI. It's currently a reduced set of options, which are subject to change in future versions.
|
90
94
|
|
@@ -108,7 +112,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
108
112
|
### TODOs
|
109
113
|
- input validation
|
110
114
|
- Better CLI
|
111
|
-
- image processing
|
112
115
|
- password
|
113
116
|
- Base64 encoding
|
114
117
|
- requirements checking (installed dependencies etc ...)
|
@@ -117,7 +120,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
117
120
|
- Improve console output
|
118
121
|
- Fix spaces in file names
|
119
122
|
- Better verbosity
|
120
|
-
- Timing
|
121
123
|
|
122
124
|
### Tests
|
123
125
|
To run tests execute:
|
data/lib/ocr-file/document.rb
CHANGED
@@ -5,6 +5,7 @@ module OcrFile
|
|
5
5
|
|
6
6
|
ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
|
7
7
|
PAGE_BREAK = "\n\r\n" # TODO: Make configurable
|
8
|
+
EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
|
8
9
|
DEFAULT_CONFIG = {
|
9
10
|
# Images from PDF
|
10
11
|
filetype: 'png',
|
@@ -23,12 +24,14 @@ module OcrFile
|
|
23
24
|
# Image Pre-Processing
|
24
25
|
image_preprocess: true,
|
25
26
|
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
|
27
|
+
automatic_reprocess: true,
|
26
28
|
# PDF to Image Processing
|
27
29
|
optimise_pdf: true,
|
28
30
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
29
31
|
temp_filename_prefix: 'image',
|
30
32
|
# Console Output
|
31
33
|
verbose: true,
|
34
|
+
timing: true,
|
32
35
|
}
|
33
36
|
|
34
37
|
attr_reader :original_file_path,
|
@@ -36,7 +39,9 @@ module OcrFile
|
|
36
39
|
:save_file_path,
|
37
40
|
:final_save_file,
|
38
41
|
:config,
|
39
|
-
:ocr_engine
|
42
|
+
:ocr_engine,
|
43
|
+
:start_time,
|
44
|
+
:end_time
|
40
45
|
|
41
46
|
# save_file_path will also generate a tmp path for tmp files. Expected folder path
|
42
47
|
# TODO: Add in more input validation
|
@@ -69,76 +74,50 @@ module OcrFile
|
|
69
74
|
|
70
75
|
# Trigger OCR pipeline
|
71
76
|
def to_pdf
|
72
|
-
|
73
|
-
|
74
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
75
|
-
|
76
|
-
pdfs_to_merge = []
|
77
|
-
|
78
|
-
image_paths.each do |image_path|
|
79
|
-
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
80
|
-
end
|
81
|
-
|
82
|
-
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
|
77
|
+
@start_time = Time.now
|
78
|
+
find_best_image_processing if config[:automatic_reprocess] && !text?
|
83
79
|
|
84
|
-
|
85
|
-
|
80
|
+
if pdf?
|
81
|
+
ocr_pdf_to_searchable_pdf
|
86
82
|
elsif text?
|
87
|
-
|
88
|
-
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
89
|
-
|
90
|
-
OcrFile::ImageEngines::PdfEngine
|
91
|
-
.save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
83
|
+
text_to_pdf
|
92
84
|
else # is an image
|
93
85
|
ocr_image_to_pdf
|
94
86
|
end
|
95
87
|
|
96
88
|
close
|
89
|
+
|
90
|
+
@end_time = Time.now
|
91
|
+
print_time
|
97
92
|
end
|
98
93
|
|
99
94
|
def to_text
|
100
|
-
|
101
|
-
|
102
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
103
|
-
|
104
|
-
image_paths.each do |image_path|
|
105
|
-
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
|
106
|
-
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
|
107
|
-
end
|
108
|
-
elsif text?
|
109
|
-
::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
110
|
-
else # is an image
|
111
|
-
ocr_image_to_text(save: true)
|
112
|
-
end
|
95
|
+
@start_time = Time.now
|
96
|
+
return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
|
113
97
|
|
98
|
+
find_best_image_processing(save: true)
|
114
99
|
close
|
100
|
+
|
101
|
+
@end_time = Time.now
|
102
|
+
print_time
|
115
103
|
end
|
116
104
|
|
117
105
|
def to_s
|
118
|
-
|
119
|
-
|
120
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
106
|
+
@start_time = Time.now
|
107
|
+
return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
|
121
108
|
|
122
|
-
|
109
|
+
text = find_best_image_processing(save: false)
|
123
110
|
|
124
|
-
|
125
|
-
text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
|
126
|
-
end
|
111
|
+
close
|
127
112
|
|
128
|
-
|
129
|
-
|
130
|
-
elsif text?
|
131
|
-
::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
132
|
-
else # is an image
|
133
|
-
text = ocr_image_to_text(save: false)
|
113
|
+
@end_time = Time.now
|
114
|
+
print_time
|
134
115
|
|
135
|
-
|
136
|
-
text
|
137
|
-
end
|
116
|
+
text
|
138
117
|
end
|
139
118
|
|
140
119
|
def close
|
141
|
-
::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
|
120
|
+
# ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
|
142
121
|
end
|
143
122
|
|
144
123
|
private
|
@@ -185,13 +164,57 @@ module OcrFile
|
|
185
164
|
image_processor.convert!
|
186
165
|
end
|
187
166
|
|
167
|
+
def ocr_pdf_to_searchable_pdf
|
168
|
+
create_temp_folder
|
169
|
+
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
170
|
+
|
171
|
+
pdfs_to_merge = []
|
172
|
+
|
173
|
+
image_paths.each do |image_path|
|
174
|
+
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
175
|
+
end
|
176
|
+
|
177
|
+
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
|
178
|
+
|
179
|
+
OcrFile::ImageEngines::PdfEngine
|
180
|
+
.save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
181
|
+
end
|
182
|
+
|
183
|
+
def text_to_pdf
|
184
|
+
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
185
|
+
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
186
|
+
|
187
|
+
OcrFile::ImageEngines::PdfEngine
|
188
|
+
.save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
189
|
+
end
|
190
|
+
|
188
191
|
def ocr_image_to_pdf
|
192
|
+
find_best_image_processing if config[:automatic_reprocess]
|
193
|
+
|
189
194
|
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
|
190
195
|
OcrFile::ImageEngines::PdfEngine
|
191
196
|
.save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
192
197
|
end
|
193
198
|
|
194
|
-
def
|
199
|
+
def ocr_pdf_to_text(save:)
|
200
|
+
create_temp_folder
|
201
|
+
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
202
|
+
|
203
|
+
text = ''
|
204
|
+
|
205
|
+
image_paths.each do |image_path|
|
206
|
+
text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
|
207
|
+
end
|
208
|
+
|
209
|
+
if save
|
210
|
+
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
|
211
|
+
else
|
212
|
+
text
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
def ocr_image_to_text(save:)
|
217
|
+
create_temp_folder
|
195
218
|
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
|
196
219
|
|
197
220
|
if save
|
@@ -201,6 +224,38 @@ module OcrFile
|
|
201
224
|
end
|
202
225
|
end
|
203
226
|
|
227
|
+
def ocr_file_to_text(save:)
|
228
|
+
if pdf? &&
|
229
|
+
ocr_pdf_to_text(save: save)
|
230
|
+
else # is an image
|
231
|
+
ocr_image_to_text(save: save)
|
232
|
+
end
|
233
|
+
end
|
234
|
+
|
235
|
+
def find_best_image_processing(save:)
|
236
|
+
ocr_file_to_text(save: save) if !config[:automatic_reprocess]
|
237
|
+
|
238
|
+
text = ''
|
239
|
+
effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
|
240
|
+
effects_to_test.each do |effect|
|
241
|
+
config[:effects] = config[:effects] - [effect]
|
242
|
+
|
243
|
+
text = ocr_file_to_text(save: false)
|
244
|
+
break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
|
245
|
+
end
|
246
|
+
|
247
|
+
# Adds in extra operations which is unfortunately inefficient
|
248
|
+
if save
|
249
|
+
ocr_file_to_text(save: save)
|
250
|
+
else
|
251
|
+
text
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
255
|
+
def print_time
|
256
|
+
puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
|
257
|
+
end
|
258
|
+
|
204
259
|
def find_ocr_engine(engine_id)
|
205
260
|
ocr_engine_constants
|
206
261
|
.map { |c| ocr_module(c) }
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module OcrFile
|
2
|
+
module TextEngines
|
3
|
+
class ResultProcessor
|
4
|
+
MINIMUM_WORD_LENGTH = 3
|
5
|
+
|
6
|
+
attr_reader :text, :clear_text
|
7
|
+
|
8
|
+
def initialize(text)
|
9
|
+
@text = text
|
10
|
+
@clear_text = remove_lines
|
11
|
+
end
|
12
|
+
|
13
|
+
# This is a very naive way of determining if we should re-do OCR with
|
14
|
+
# shifted options
|
15
|
+
def valid_words?
|
16
|
+
word_size_average >= MINIMUM_WORD_LENGTH
|
17
|
+
end
|
18
|
+
|
19
|
+
def word_count
|
20
|
+
@_word_count ||= clear_text.split(' ').size
|
21
|
+
end
|
22
|
+
|
23
|
+
def word_size_average
|
24
|
+
@_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
def remove_lines
|
30
|
+
text.gsub("\n", ' ').gsub("\r", ' ').gsub(' ', '')
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/lib/ocr-file/version.rb
CHANGED
data/lib/ocr-file.rb
CHANGED
@@ -10,6 +10,7 @@ require 'ocr-file/image_engines/image_magick'
|
|
10
10
|
require 'ocr-file/image_engines/pdftoppm'
|
11
11
|
require 'ocr-file/ocr_engines/tesseract'
|
12
12
|
require 'ocr-file/ocr_engines/cloud_vision'
|
13
|
+
require 'ocr-file/text_engines/result_processor'
|
13
14
|
require 'ocr-file/file_helpers'
|
14
15
|
require 'ocr-file/document'
|
15
16
|
require 'ocr-file/cli'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
@@ -122,6 +122,7 @@ files:
|
|
122
122
|
- lib/ocr-file/image_engines/pdftoppm.rb
|
123
123
|
- lib/ocr-file/ocr_engines/cloud_vision.rb
|
124
124
|
- lib/ocr-file/ocr_engines/tesseract.rb
|
125
|
+
- lib/ocr-file/text_engines/result_processor.rb
|
125
126
|
- lib/ocr-file/version.rb
|
126
127
|
- ocr-file.gemspec
|
127
128
|
homepage: https://github.com/TRex22/ocr-file
|