ocr-file 0.0.2 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -2
- data/README.md +13 -4
- data/lib/ocr-file/document.rb +134 -50
- data/lib/ocr-file/image_engines/image_magick.rb +7 -0
- data/lib/ocr-file/image_engines/pdf_engine.rb +28 -1
- data/lib/ocr-file/text_engines/result_processor.rb +82 -0
- data/lib/ocr-file/version.rb +1 -1
- data/lib/ocr-file.rb +2 -0
- data/ocr-file.gemspec +1 -0
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9660e3d19c210789a7aeab56e63b002b507c12917edd1418202900d05647773b
|
4
|
+
data.tar.gz: 28861544f58374db141e5e3cc3ee8569201c4ffd611458e77774c98835e2f882
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 55f4266d6877a7f2c8a4175f5601930c9940aa1a8a06fc1d6d84faca455232173f2ae4be887dc9d20246cb224498a213b91871b66ee3fb3b02699399be33453a
|
7
|
+
data.tar.gz: e78263baffd1ae1ff1246f705250d788ae6b5c24b3d59d78e169b3ddf93e139ac9b9688cf9d954053a913a1c3bd6be54c40466d372f915c4430d55d93282f1bb
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
ocr-file (0.0.2)
|
4
|
+
ocr-file (0.0.6)
|
5
5
|
active_attr (~> 0.15.4)
|
6
6
|
console-style (~> 0.0.1)
|
7
7
|
hexapdf (~> 0.23.0)
|
8
8
|
mini_magick (~> 4.11.0)
|
9
9
|
rtesseract (~> 3.1.2)
|
10
|
+
ruby-spellchecker (~> 0.1.5)
|
10
11
|
|
11
12
|
GEM
|
12
13
|
remote: https://rubygems.org/
|
@@ -60,7 +61,7 @@ GEM
|
|
60
61
|
coderay (~> 1.1)
|
61
62
|
method_source (~> 1.0)
|
62
63
|
racc (1.6.0)
|
63
|
-
rack (2.2.
|
64
|
+
rack (2.2.4)
|
64
65
|
rack-test (1.1.0)
|
65
66
|
rack (>= 1.0, < 3)
|
66
67
|
rails-dom-testing (2.0.3)
|
@@ -69,6 +70,7 @@ GEM
|
|
69
70
|
rails-html-sanitizer (1.4.3)
|
70
71
|
loofah (~> 2.3)
|
71
72
|
rtesseract (3.1.2)
|
73
|
+
ruby-spellchecker (0.1.5)
|
72
74
|
tzinfo (2.0.4)
|
73
75
|
concurrent-ruby (~> 1.0)
|
74
76
|
|
data/README.md
CHANGED
@@ -43,13 +43,16 @@ You will need to install `tesseract` with your desired language on your system,
|
|
43
43
|
ocr_engine: 'tesseract', # 'cloud-vision'
|
44
44
|
# Image Pre-Processing
|
45
45
|
image_preprocess: true,
|
46
|
-
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'], # Applies effects as listed. 'norm' is also available
|
46
|
+
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
|
47
|
+
automatic_reprocess: true, # May more than double the number of operations, but can automatically produce better results
|
47
48
|
# PDF to Image Processing
|
48
49
|
optimise_pdf: true,
|
49
50
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
50
51
|
temp_filename_prefix: 'image',
|
52
|
+
spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
|
51
53
|
# Console Output
|
52
54
|
verbose: true,
|
55
|
+
timing: true,
|
53
56
|
}
|
54
57
|
|
55
58
|
doc = OcrFile::Document.new(
|
@@ -74,6 +77,7 @@ You will need to install `tesseract` with your desired language on your system,
|
|
74
77
|
doc.to_pdf
|
75
78
|
|
76
79
|
# How to merge files into a single PDF:
|
80
|
+
# The files can be images or other PDFs
|
77
81
|
file_paths = []
|
78
82
|
documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
|
79
83
|
merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
|
@@ -83,7 +87,9 @@ You will need to install `tesseract` with your desired language on your system,
|
|
83
87
|
### Notes / Tips
|
84
88
|
Set `extract_pdf_images` to `false` for higher quality OCR. However this will consume more temporary space per PDF page and also be considerably slower.
|
85
89
|
|
86
|
-
Image pre-processing only thresholds (bw), normalises the colour space, removes speckles and tries to straighten the image. Will make the end result Black and White but have far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary.
|
90
|
+
Image pre-processing only thresholds (bw), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. Will make the end result Black and White but have far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
|
91
|
+
|
92
|
+
`automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
|
87
93
|
|
88
94
|
### Simple CLI
|
89
95
|
Once installed you can use `ocr-file` as a CLI. It's currently a reduced set of options. These are subject to change in future versions.
|
@@ -108,7 +114,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
108
114
|
### TODOs
|
109
115
|
- input validation
|
110
116
|
- Better CLI
|
111
|
-
- image processing
|
112
117
|
- password
|
113
118
|
- Base64 encoding
|
114
119
|
- requirements checking (installed dependencies etc ...)
|
@@ -117,7 +122,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
117
122
|
- Improve console output
|
118
123
|
- Fix spaces in file names
|
119
124
|
- Better verbosity
|
120
|
-
-
|
125
|
+
- Docker
|
126
|
+
- pdftk / pdf merge for text and bookmarks etc ...
|
127
|
+
- https://github.com/tesseract-ocr/tesseract/issues/660
|
128
|
+
- tesseract -c naked_pdf=true
|
129
|
+
-
|
121
130
|
|
122
131
|
### Tests
|
123
132
|
To run tests execute:
|
data/lib/ocr-file/document.rb
CHANGED
@@ -5,6 +5,7 @@ module OcrFile
|
|
5
5
|
|
6
6
|
ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
|
7
7
|
PAGE_BREAK = "\n\r\n" # TODO: Make configurable
|
8
|
+
EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
|
8
9
|
DEFAULT_CONFIG = {
|
9
10
|
# Images from PDF
|
10
11
|
filetype: 'png',
|
@@ -22,13 +23,16 @@ module OcrFile
|
|
22
23
|
ocr_engine: 'tesseract', # 'cloud-vision'
|
23
24
|
# Image Pre-Processing
|
24
25
|
image_preprocess: true,
|
25
|
-
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'],
|
26
|
+
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
|
27
|
+
automatic_reprocess: true,
|
26
28
|
# PDF to Image Processing
|
27
29
|
optimise_pdf: true,
|
28
30
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
29
31
|
temp_filename_prefix: 'image',
|
32
|
+
spelling_correction: true,
|
30
33
|
# Console Output
|
31
34
|
verbose: true,
|
35
|
+
timing: true,
|
32
36
|
}
|
33
37
|
|
34
38
|
attr_reader :original_file_path,
|
@@ -36,7 +40,9 @@ module OcrFile
|
|
36
40
|
:save_file_path,
|
37
41
|
:final_save_file,
|
38
42
|
:config,
|
39
|
-
:ocr_engine
|
43
|
+
:ocr_engine,
|
44
|
+
:start_time,
|
45
|
+
:end_time
|
40
46
|
|
41
47
|
# save_file_path will also generate a tmp path for tmp files. Expected folder path
|
42
48
|
# TODO: Add in more input validation
|
@@ -69,72 +75,46 @@ module OcrFile
|
|
69
75
|
|
70
76
|
# Trigger OCR pipeline
|
71
77
|
def to_pdf
|
72
|
-
|
73
|
-
|
74
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
75
|
-
|
76
|
-
pdfs_to_merge = []
|
77
|
-
|
78
|
-
image_paths.each do |image_path|
|
79
|
-
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
80
|
-
end
|
81
|
-
|
82
|
-
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
|
78
|
+
@start_time = Time.now
|
79
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
|
83
80
|
|
84
|
-
|
85
|
-
|
81
|
+
if pdf?
|
82
|
+
ocr_pdf_to_searchable_pdf
|
86
83
|
elsif text?
|
87
|
-
|
88
|
-
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
89
|
-
|
90
|
-
OcrFile::ImageEngines::PdfEngine
|
91
|
-
.save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
84
|
+
text_to_pdf
|
92
85
|
else # is an image
|
93
86
|
ocr_image_to_pdf
|
94
87
|
end
|
95
88
|
|
96
89
|
close
|
90
|
+
|
91
|
+
@end_time = Time.now
|
92
|
+
print_time
|
97
93
|
end
|
98
94
|
|
99
95
|
def to_text
|
100
|
-
|
101
|
-
|
102
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
103
|
-
|
104
|
-
image_paths.each do |image_path|
|
105
|
-
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
|
106
|
-
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
|
107
|
-
end
|
108
|
-
elsif text?
|
109
|
-
::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
110
|
-
else # is an image
|
111
|
-
ocr_image_to_text(save: true)
|
112
|
-
end
|
96
|
+
@start_time = Time.now
|
97
|
+
return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
|
113
98
|
|
99
|
+
find_best_image_processing(save: true)
|
114
100
|
close
|
101
|
+
|
102
|
+
@end_time = Time.now
|
103
|
+
print_time
|
115
104
|
end
|
116
105
|
|
117
106
|
def to_s
|
118
|
-
|
119
|
-
|
120
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
107
|
+
@start_time = Time.now
|
108
|
+
return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
|
121
109
|
|
122
|
-
|
110
|
+
text = find_best_image_processing(save: false)
|
123
111
|
|
124
|
-
|
125
|
-
text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
|
126
|
-
end
|
112
|
+
close
|
127
113
|
|
128
|
-
|
129
|
-
|
130
|
-
elsif text?
|
131
|
-
::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
132
|
-
else # is an image
|
133
|
-
text = ocr_image_to_text(save: false)
|
114
|
+
@end_time = Time.now
|
115
|
+
print_time
|
134
116
|
|
135
|
-
|
136
|
-
text
|
137
|
-
end
|
117
|
+
text
|
138
118
|
end
|
139
119
|
|
140
120
|
def close
|
@@ -185,14 +165,67 @@ module OcrFile
|
|
185
165
|
image_processor.convert!
|
186
166
|
end
|
187
167
|
|
168
|
+
def ocr_pdf_to_searchable_pdf
|
169
|
+
create_temp_folder
|
170
|
+
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
171
|
+
|
172
|
+
pdfs_to_merge = []
|
173
|
+
|
174
|
+
image_paths.each do |image_path|
|
175
|
+
puts image_path
|
176
|
+
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
177
|
+
end
|
178
|
+
|
179
|
+
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
|
180
|
+
|
181
|
+
OcrFile::ImageEngines::PdfEngine
|
182
|
+
.save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
183
|
+
end
|
184
|
+
|
185
|
+
def text_to_pdf
|
186
|
+
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
187
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
188
|
+
|
189
|
+
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
190
|
+
|
191
|
+
OcrFile::ImageEngines::PdfEngine
|
192
|
+
.save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
193
|
+
end
|
194
|
+
|
188
195
|
def ocr_image_to_pdf
|
196
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess]
|
197
|
+
|
189
198
|
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
|
190
199
|
OcrFile::ImageEngines::PdfEngine
|
191
200
|
.save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
192
201
|
end
|
193
202
|
|
194
|
-
def ocr_image_to_text(save:)
|
203
|
+
def ocr_pdf_to_text(save:)
|
204
|
+
create_temp_folder
|
205
|
+
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
206
|
+
|
207
|
+
text = ''
|
208
|
+
|
209
|
+
image_paths.each do |image_path|
|
210
|
+
puts image_path
|
211
|
+
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
|
212
|
+
|
213
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
214
|
+
text = "#{text}#{PAGE_BREAK}#{text}"
|
215
|
+
end
|
216
|
+
|
217
|
+
if save
|
218
|
+
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
|
219
|
+
else
|
220
|
+
text
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
def ocr_image_to_text(save:)
|
225
|
+
create_temp_folder
|
226
|
+
|
195
227
|
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
|
228
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
196
229
|
|
197
230
|
if save
|
198
231
|
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
|
@@ -201,6 +234,57 @@ module OcrFile
|
|
201
234
|
end
|
202
235
|
end
|
203
236
|
|
237
|
+
def ocr_file_to_text(save:)
|
238
|
+
if pdf?
|
239
|
+
ocr_pdf_to_text(save: save)
|
240
|
+
else # is an image
|
241
|
+
ocr_image_to_text(save: save)
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
def find_best_image_processing(save:)
|
246
|
+
ocr_file_to_text(save: save) unless config[:automatic_reprocess]
|
247
|
+
|
248
|
+
text = ''
|
249
|
+
best_text_count = 0
|
250
|
+
best_effects = config[:effects]
|
251
|
+
|
252
|
+
effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
|
253
|
+
effects_to_test.each do |effect|
|
254
|
+
text = test_ocr_settings(effect)
|
255
|
+
processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
|
256
|
+
|
257
|
+
if processed_result.count_of_issues < best_text_count
|
258
|
+
best_text_count = processed_result.count_of_issues
|
259
|
+
best_effects = config[:effects]
|
260
|
+
end
|
261
|
+
|
262
|
+
break if processed_result.valid_words?
|
263
|
+
end
|
264
|
+
|
265
|
+
# Fallback
|
266
|
+
if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
|
267
|
+
config[:effects] = best_effects
|
268
|
+
text = ocr_file_to_text(save: false)
|
269
|
+
end
|
270
|
+
|
271
|
+
# Adds in extra operations which is unfortunately inefficient
|
272
|
+
if save
|
273
|
+
ocr_file_to_text(save: save)
|
274
|
+
else
|
275
|
+
text
|
276
|
+
end
|
277
|
+
end
|
278
|
+
|
279
|
+
def test_ocr_settings(effect)
|
280
|
+
config[:effects] = config[:effects] - [effect]
|
281
|
+
ocr_file_to_text(save: false)
|
282
|
+
end
|
283
|
+
|
284
|
+
def print_time
|
285
|
+
puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
|
286
|
+
end
|
287
|
+
|
204
288
|
def find_ocr_engine(engine_id)
|
205
289
|
ocr_engine_constants
|
206
290
|
.map { |c| ocr_module(c) }
|
data/lib/ocr-file/image_engines/image_magick.rb
CHANGED
@@ -61,6 +61,13 @@ module OcrFile
|
|
61
61
|
@image.sharpen('0x4') # radiusXsigma
|
62
62
|
end
|
63
63
|
|
64
|
+
# https://github.com/ImageMagick/ImageMagick/discussions/4145
|
65
|
+
def remove_shadow
|
66
|
+
@image.negate
|
67
|
+
@image.lat("20x20+10\%")
|
68
|
+
@image.negate
|
69
|
+
end
|
70
|
+
|
64
71
|
def deskew
|
65
72
|
@image.deskew('40%') # threshold recommended in the docs
|
66
73
|
end
|
data/lib/ocr-file/image_engines/pdf_engine.rb
CHANGED
@@ -61,11 +61,38 @@ module OcrFile
|
|
61
61
|
image_paths
|
62
62
|
end
|
63
63
|
|
64
|
+
def insert_image(document, image_path)
|
65
|
+
canvas = document.pages.add.canvas
|
66
|
+
canvas.image(image_path, at: [0, 0], height: 700)
|
67
|
+
end
|
68
|
+
|
69
|
+
def combine(text, pdf_of_images)
|
70
|
+
return unless pdf_of_images.is_a?(::HexaPDF::Document)
|
71
|
+
|
72
|
+
if text.is_a?(::HexaPDF::Document)
|
73
|
+
pages_of_text = text.pages
|
74
|
+
else # Assume raw text with PAGE_BREAK
|
75
|
+
pages_of_text = text.split(PAGE_BREAK)
|
76
|
+
end
|
77
|
+
|
78
|
+
return unless pages_of_text.size == pdf_of_images.pages.size
|
79
|
+
|
80
|
+
if text.is_a?(::HexaPDF::Document) # Keep the page structure
|
81
|
+
|
82
|
+
else # Just text to embed
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
64
87
|
def merge(documents)
|
65
88
|
target = ::HexaPDF::Document.new
|
66
89
|
|
67
90
|
documents.each do |document|
|
68
|
-
document.
|
91
|
+
if document.is_a?(::HexaPDF::Document)
|
92
|
+
document.pages.each { |page| target.pages << target.import(page) }
|
93
|
+
else # Assume an image
|
94
|
+
insert_image(target, document)
|
95
|
+
end
|
69
96
|
end
|
70
97
|
|
71
98
|
target
|
data/lib/ocr-file/text_engines/result_processor.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
module OcrFile
|
2
|
+
module TextEngines
|
3
|
+
class ResultProcessor
|
4
|
+
MINIMUM_WORD_LENGTH = 4
|
5
|
+
ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
|
6
|
+
ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
|
7
|
+
|
8
|
+
# REGEX
|
9
|
+
ASCII_ONLY = /[^\u{0000}-\u{007f}]/
|
10
|
+
NOISE_CHARACTERS = /[^\w\s\/-;:]/
|
11
|
+
DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
|
12
|
+
EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
|
13
|
+
|
14
|
+
attr_reader :text, :clear_text
|
15
|
+
|
16
|
+
def initialize(text)
|
17
|
+
@text = text
|
18
|
+
@clear_text = generate_clear_text || text || ''
|
19
|
+
end
|
20
|
+
|
21
|
+
def correct
|
22
|
+
Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
|
23
|
+
end
|
24
|
+
|
25
|
+
# This is a very naive way of determining if we should re-do OCR with
|
26
|
+
# shifted options
|
27
|
+
def valid_words?
|
28
|
+
word_size_average >= MINIMUM_WORD_LENGTH &&
|
29
|
+
spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
|
30
|
+
unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
|
31
|
+
end
|
32
|
+
|
33
|
+
def invalid_words?
|
34
|
+
!valid_words?
|
35
|
+
end
|
36
|
+
|
37
|
+
def word_count
|
38
|
+
return 0 if empty_text?
|
39
|
+
@_word_count ||= clear_words.size
|
40
|
+
end
|
41
|
+
|
42
|
+
def word_size_average
|
43
|
+
return 0 if empty_text?
|
44
|
+
@_word_size_average ||= clear_words.map(&:size).sum / word_count
|
45
|
+
end
|
46
|
+
|
47
|
+
# Assume English
|
48
|
+
def unidentified_word_count
|
49
|
+
clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
|
50
|
+
end
|
51
|
+
|
52
|
+
def spelling_error_count
|
53
|
+
Spellchecker.check(clear_text).count
|
54
|
+
end
|
55
|
+
|
56
|
+
def count_of_issues
|
57
|
+
spelling_error_count + unidentified_word_count
|
58
|
+
end
|
59
|
+
|
60
|
+
private
|
61
|
+
|
62
|
+
def empty_text?
|
63
|
+
clear_text.nil? || clear_text == ''
|
64
|
+
end
|
65
|
+
|
66
|
+
def clear_words
|
67
|
+
@clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
|
68
|
+
end
|
69
|
+
|
70
|
+
def generate_clear_text
|
71
|
+
remove_lines
|
72
|
+
&.gsub(ASCII_ONLY, '')
|
73
|
+
&.gsub(NOISE_CHARACTERS, '')
|
74
|
+
&.gsub(DUPLICATE_WORDS, '')
|
75
|
+
end
|
76
|
+
|
77
|
+
def remove_lines
|
78
|
+
text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
data/lib/ocr-file/version.rb
CHANGED
data/lib/ocr-file.rb
CHANGED
@@ -2,6 +2,7 @@ require 'hexapdf'
|
|
2
2
|
require 'hexapdf/cli/images'
|
3
3
|
require 'rtesseract'
|
4
4
|
require 'mini_magick'
|
5
|
+
require 'ruby-spellchecker'
|
5
6
|
|
6
7
|
require 'ocr-file/version'
|
7
8
|
|
@@ -10,6 +11,7 @@ require 'ocr-file/image_engines/image_magick'
|
|
10
11
|
require 'ocr-file/image_engines/pdftoppm'
|
11
12
|
require 'ocr-file/ocr_engines/tesseract'
|
12
13
|
require 'ocr-file/ocr_engines/cloud_vision'
|
14
|
+
require 'ocr-file/text_engines/result_processor'
|
13
15
|
require 'ocr-file/file_helpers'
|
14
16
|
require 'ocr-file/document'
|
15
17
|
require 'ocr-file/cli'
|
data/ocr-file.gemspec
CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_dependency "hexapdf", "~> 0.23.0"
|
33
33
|
spec.add_dependency "rtesseract", "~> 3.1.2"
|
34
34
|
spec.add_dependency "mini_magick", "~> 4.11.0"
|
35
|
+
spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
|
35
36
|
|
36
37
|
# Development Dependencies
|
37
38
|
spec.add_development_dependency "pry", "~> 0.14.1"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: console-style
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 4.11.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ruby-spellchecker
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.5
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.1.5
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: pry
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +136,7 @@ files:
|
|
122
136
|
- lib/ocr-file/image_engines/pdftoppm.rb
|
123
137
|
- lib/ocr-file/ocr_engines/cloud_vision.rb
|
124
138
|
- lib/ocr-file/ocr_engines/tesseract.rb
|
139
|
+
- lib/ocr-file/text_engines/result_processor.rb
|
125
140
|
- lib/ocr-file/version.rb
|
126
141
|
- ocr-file.gemspec
|
127
142
|
homepage: https://github.com/TRex22/ocr-file
|