ocr-file 0.0.3 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -2
- data/README.md +12 -2
- data/lib/ocr-file/document.rb +139 -49
- data/lib/ocr-file/image_engines/pdf_engine.rb +28 -1
- data/lib/ocr-file/text_engines/result_processor.rb +82 -0
- data/lib/ocr-file/version.rb +1 -1
- data/lib/ocr-file.rb +2 -0
- data/ocr-file.gemspec +1 -0
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8b87806d21622a72c6166c35fe4367f5b07135e5e7fab4e8be8b8941f75439dc
|
4
|
+
data.tar.gz: d342a91e9b23f8677784553327ba1cc1c00e1599415512b28226f8e9f6bc55b4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ecadeeb21a358274bce4ed3d7fce66e53d31ff3abe940ff1b9d77893f12b73bfd41e9ac35324e3a98f004638f9d1906760ef962a3637fbaf48973faeec9a17cb
|
7
|
+
data.tar.gz: 5d4a149dd6d0da1feb723b08c327edab414b75f0b633cea53aaee00d43313d26b84659956957acec7550a822998b76a760b3888770a606d8b4a1f9bb14f807c2
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
ocr-file (0.0.
|
4
|
+
ocr-file (0.0.6)
|
5
5
|
active_attr (~> 0.15.4)
|
6
6
|
console-style (~> 0.0.1)
|
7
7
|
hexapdf (~> 0.23.0)
|
8
8
|
mini_magick (~> 4.11.0)
|
9
9
|
rtesseract (~> 3.1.2)
|
10
|
+
ruby-spellchecker (~> 0.1.5)
|
10
11
|
|
11
12
|
GEM
|
12
13
|
remote: https://rubygems.org/
|
@@ -60,7 +61,7 @@ GEM
|
|
60
61
|
coderay (~> 1.1)
|
61
62
|
method_source (~> 1.0)
|
62
63
|
racc (1.6.0)
|
63
|
-
rack (2.2.
|
64
|
+
rack (2.2.4)
|
64
65
|
rack-test (1.1.0)
|
65
66
|
rack (>= 1.0, < 3)
|
66
67
|
rails-dom-testing (2.0.3)
|
@@ -69,6 +70,7 @@ GEM
|
|
69
70
|
rails-html-sanitizer (1.4.3)
|
70
71
|
loofah (~> 2.3)
|
71
72
|
rtesseract (3.1.2)
|
73
|
+
ruby-spellchecker (0.1.5)
|
72
74
|
tzinfo (2.0.4)
|
73
75
|
concurrent-ruby (~> 1.0)
|
74
76
|
|
data/README.md
CHANGED
@@ -44,12 +44,16 @@ You will need to install `tesseract` with your desired language on your system,
|
|
44
44
|
# Image Pre-Processing
|
45
45
|
image_preprocess: true,
|
46
46
|
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
|
47
|
+
automatic_reprocess: true, # May more than double the number of operations, but can automatically produce better results
|
47
48
|
# PDF to Image Processing
|
48
49
|
optimise_pdf: true,
|
49
50
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
50
51
|
temp_filename_prefix: 'image',
|
52
|
+
spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
|
53
|
+
keep_files: false,
|
51
54
|
# Console Output
|
52
55
|
verbose: true,
|
56
|
+
timing: true
|
53
57
|
}
|
54
58
|
|
55
59
|
doc = OcrFile::Document.new(
|
@@ -74,6 +78,7 @@ You will need to install `tesseract` with your desired language on your system,
|
|
74
78
|
doc.to_pdf
|
75
79
|
|
76
80
|
# How to merge files into a single PDF:
|
81
|
+
# The files can be images or other PDFs
|
77
82
|
file_paths = []
|
78
83
|
documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
|
79
84
|
merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
|
@@ -85,6 +90,8 @@ Set `extract_pdf_images` to `false` for higher quality OCR. However this will co
|
|
85
90
|
|
86
91
|
Image pre-processing is limited to thresholding (`bw`), normalising the colour space, removing speckles, removing shadows and trying to straighten the image. It makes the end result black and white, but gives far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
|
87
92
|
|
93
|
+
`automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
|
94
|
+
|
88
95
|
### Simple CLI
|
89
96
|
Once installed you can use `ocr-file` as a CLI. It's currently a reduced set of options. These are subject to change in future versions.
|
90
97
|
|
@@ -108,7 +115,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
108
115
|
### TODOs
|
109
116
|
- input validation
|
110
117
|
- Better CLI
|
111
|
-
- image processing
|
112
118
|
- password
|
113
119
|
- Base64 encoding
|
114
120
|
- requirements checking (installed dependencies etc ...)
|
@@ -117,7 +123,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
117
123
|
- Improve console output
|
118
124
|
- Fix spaces in file names
|
119
125
|
- Better verbosity
|
120
|
-
-
|
126
|
+
- Docker
|
127
|
+
- pdftk / pdf merge for text and bookmarks etc ...
|
128
|
+
- https://github.com/tesseract-ocr/tesseract/issues/660
|
129
|
+
- tesseract -c naked_pdf=true
|
130
|
+
-
|
121
131
|
|
122
132
|
### Tests
|
123
133
|
To run tests execute:
|
data/lib/ocr-file/document.rb
CHANGED
@@ -5,6 +5,7 @@ module OcrFile
|
|
5
5
|
|
6
6
|
ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
|
7
7
|
PAGE_BREAK = "\n\r\n" # TODO: Make configurable
|
8
|
+
EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
|
8
9
|
DEFAULT_CONFIG = {
|
9
10
|
# Images from PDF
|
10
11
|
filetype: 'png',
|
@@ -23,12 +24,16 @@ module OcrFile
|
|
23
24
|
# Image Pre-Processing
|
24
25
|
image_preprocess: true,
|
25
26
|
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
|
27
|
+
automatic_reprocess: true,
|
26
28
|
# PDF to Image Processing
|
27
29
|
optimise_pdf: true,
|
28
30
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
29
31
|
temp_filename_prefix: 'image',
|
32
|
+
spelling_correction: true,
|
33
|
+
keep_files: false,
|
30
34
|
# Console Output
|
31
35
|
verbose: true,
|
36
|
+
timing: true
|
32
37
|
}
|
33
38
|
|
34
39
|
attr_reader :original_file_path,
|
@@ -36,7 +41,9 @@ module OcrFile
|
|
36
41
|
:save_file_path,
|
37
42
|
:final_save_file,
|
38
43
|
:config,
|
39
|
-
:ocr_engine
|
44
|
+
:ocr_engine,
|
45
|
+
:start_time,
|
46
|
+
:end_time
|
40
47
|
|
41
48
|
# save_file_path will also generate a tmp path for tmp files. Expected folder path
|
42
49
|
# TODO: Add in more input validation
|
@@ -69,75 +76,50 @@ module OcrFile
|
|
69
76
|
|
70
77
|
# Trigger OCR pipeline
|
71
78
|
def to_pdf
|
72
|
-
|
73
|
-
|
74
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
75
|
-
|
76
|
-
pdfs_to_merge = []
|
77
|
-
|
78
|
-
image_paths.each do |image_path|
|
79
|
-
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
80
|
-
end
|
81
|
-
|
82
|
-
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
|
79
|
+
@start_time = Time.now
|
80
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
|
83
81
|
|
84
|
-
|
85
|
-
|
82
|
+
if pdf?
|
83
|
+
ocr_pdf_to_searchable_pdf
|
86
84
|
elsif text?
|
87
|
-
|
88
|
-
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
89
|
-
|
90
|
-
OcrFile::ImageEngines::PdfEngine
|
91
|
-
.save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
85
|
+
text_to_pdf
|
92
86
|
else # is an image
|
93
87
|
ocr_image_to_pdf
|
94
88
|
end
|
95
89
|
|
96
90
|
close
|
91
|
+
|
92
|
+
@end_time = Time.now
|
93
|
+
print_time
|
97
94
|
end
|
98
95
|
|
99
96
|
def to_text
|
100
|
-
|
101
|
-
|
102
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
103
|
-
|
104
|
-
image_paths.each do |image_path|
|
105
|
-
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
|
106
|
-
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
|
107
|
-
end
|
108
|
-
elsif text?
|
109
|
-
::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
110
|
-
else # is an image
|
111
|
-
ocr_image_to_text(save: true)
|
112
|
-
end
|
97
|
+
@start_time = Time.now
|
98
|
+
return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
|
113
99
|
|
100
|
+
find_best_image_processing(save: true)
|
114
101
|
close
|
102
|
+
|
103
|
+
@end_time = Time.now
|
104
|
+
print_time
|
115
105
|
end
|
116
106
|
|
117
107
|
def to_s
|
118
|
-
|
119
|
-
|
120
|
-
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
108
|
+
@start_time = Time.now
|
109
|
+
return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
|
121
110
|
|
122
|
-
|
111
|
+
text = find_best_image_processing(save: false)
|
123
112
|
|
124
|
-
|
125
|
-
text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
|
126
|
-
end
|
113
|
+
close
|
127
114
|
|
128
|
-
|
129
|
-
|
130
|
-
elsif text?
|
131
|
-
::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
132
|
-
else # is an image
|
133
|
-
text = ocr_image_to_text(save: false)
|
115
|
+
@end_time = Time.now
|
116
|
+
print_time
|
134
117
|
|
135
|
-
|
136
|
-
text
|
137
|
-
end
|
118
|
+
text
|
138
119
|
end
|
139
120
|
|
140
121
|
def close
|
122
|
+
return if keep_files?
|
141
123
|
::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
|
142
124
|
end
|
143
125
|
|
@@ -162,6 +144,10 @@ module OcrFile
|
|
162
144
|
end
|
163
145
|
end
|
164
146
|
|
147
|
+
def keep_files?
|
148
|
+
config['keep_files']
|
149
|
+
end
|
150
|
+
|
165
151
|
def create_temp_folder
|
166
152
|
date = Time.now.to_s.split(' ').first
|
167
153
|
|
@@ -185,14 +171,67 @@ module OcrFile
|
|
185
171
|
image_processor.convert!
|
186
172
|
end
|
187
173
|
|
174
|
+
def ocr_pdf_to_searchable_pdf
|
175
|
+
create_temp_folder
|
176
|
+
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
177
|
+
|
178
|
+
pdfs_to_merge = []
|
179
|
+
|
180
|
+
image_paths.each do |image_path|
|
181
|
+
puts image_path
|
182
|
+
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
183
|
+
end
|
184
|
+
|
185
|
+
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
|
186
|
+
|
187
|
+
OcrFile::ImageEngines::PdfEngine
|
188
|
+
.save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
189
|
+
end
|
190
|
+
|
191
|
+
def text_to_pdf
|
192
|
+
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
193
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
194
|
+
|
195
|
+
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
196
|
+
|
197
|
+
OcrFile::ImageEngines::PdfEngine
|
198
|
+
.save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
199
|
+
end
|
200
|
+
|
188
201
|
def ocr_image_to_pdf
|
202
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess]
|
203
|
+
|
189
204
|
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
|
190
205
|
OcrFile::ImageEngines::PdfEngine
|
191
206
|
.save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
|
192
207
|
end
|
193
208
|
|
194
|
-
def
|
209
|
+
def ocr_pdf_to_text(save:)
|
210
|
+
create_temp_folder
|
211
|
+
image_paths = extract_image_paths_from_pdf(@original_file_path)
|
212
|
+
|
213
|
+
text = ''
|
214
|
+
|
215
|
+
image_paths.each do |image_path|
|
216
|
+
puts image_path
|
217
|
+
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
|
218
|
+
|
219
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
220
|
+
text = "#{text}#{PAGE_BREAK}#{text}"
|
221
|
+
end
|
222
|
+
|
223
|
+
if save
|
224
|
+
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
|
225
|
+
else
|
226
|
+
text
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
def ocr_image_to_text(save:)
|
231
|
+
create_temp_folder
|
232
|
+
|
195
233
|
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
|
234
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
196
235
|
|
197
236
|
if save
|
198
237
|
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
|
@@ -201,6 +240,57 @@ module OcrFile
|
|
201
240
|
end
|
202
241
|
end
|
203
242
|
|
243
|
+
def ocr_file_to_text(save:)
|
244
|
+
if pdf?
|
245
|
+
ocr_pdf_to_text(save: save)
|
246
|
+
else # is an image
|
247
|
+
ocr_image_to_text(save: save)
|
248
|
+
end
|
249
|
+
end
|
250
|
+
|
251
|
+
def find_best_image_processing(save:)
|
252
|
+
ocr_file_to_text(save: save) unless config[:automatic_reprocess]
|
253
|
+
|
254
|
+
text = ''
|
255
|
+
best_text_count = 0
|
256
|
+
best_effects = config[:effects]
|
257
|
+
|
258
|
+
effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
|
259
|
+
effects_to_test.each do |effect|
|
260
|
+
text = test_ocr_settings(effect)
|
261
|
+
processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
|
262
|
+
|
263
|
+
if processed_result.count_of_issues < best_text_count
|
264
|
+
best_text_count = processed_result.count_of_issues
|
265
|
+
best_effects = config[:effects]
|
266
|
+
end
|
267
|
+
|
268
|
+
break if processed_result.valid_words?
|
269
|
+
end
|
270
|
+
|
271
|
+
# Fallback
|
272
|
+
if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
|
273
|
+
config[:effects] = best_effects
|
274
|
+
text = ocr_file_to_text(save: false)
|
275
|
+
end
|
276
|
+
|
277
|
+
# Adds in extra operations which is unfortunately inefficient
|
278
|
+
if save
|
279
|
+
ocr_file_to_text(save: save)
|
280
|
+
else
|
281
|
+
text
|
282
|
+
end
|
283
|
+
end
|
284
|
+
|
285
|
+
def test_ocr_settings(effect)
|
286
|
+
config[:effects] = config[:effects] - [effect]
|
287
|
+
ocr_file_to_text(save: false)
|
288
|
+
end
|
289
|
+
|
290
|
+
def print_time
|
291
|
+
puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
|
292
|
+
end
|
293
|
+
|
204
294
|
def find_ocr_engine(engine_id)
|
205
295
|
ocr_engine_constants
|
206
296
|
.map { |c| ocr_module(c) }
|
@@ -61,11 +61,38 @@ module OcrFile
|
|
61
61
|
image_paths
|
62
62
|
end
|
63
63
|
|
64
|
+
def insert_image(document, image_path)
|
65
|
+
canvas = document.pages.add.canvas
|
66
|
+
canvas.image(image_path, at: [0, 0], height: 700)
|
67
|
+
end
|
68
|
+
|
69
|
+
def combine(text, pdf_of_images)
|
70
|
+
return unless pdf_of_images.is_a?(::HexaPDF::Document)
|
71
|
+
|
72
|
+
if text.is_a?(::HexaPDF::Document)
|
73
|
+
pages_of_text = text.pages
|
74
|
+
else # Assume raw text with PAGE_BREAK
|
75
|
+
pages_of_text = text.split(PAGE_BREAK)
|
76
|
+
end
|
77
|
+
|
78
|
+
return unless pages_of_text.size == pdf_of_images.pages.size
|
79
|
+
|
80
|
+
if text.is_a?(::HexaPDF::Document) # Keep the page structure
|
81
|
+
|
82
|
+
else # Just text to embed
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
64
87
|
def merge(documents)
|
65
88
|
target = ::HexaPDF::Document.new
|
66
89
|
|
67
90
|
documents.each do |document|
|
68
|
-
document.
|
91
|
+
if document.is_a?(::HexaPDF::Document)
|
92
|
+
document.pages.each { |page| target.pages << target.import(page) }
|
93
|
+
else # Assume an image
|
94
|
+
insert_image(target, document)
|
95
|
+
end
|
69
96
|
end
|
70
97
|
|
71
98
|
target
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module OcrFile
|
2
|
+
module TextEngines
|
3
|
+
class ResultProcessor
|
4
|
+
MINIMUM_WORD_LENGTH = 4
|
5
|
+
ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
|
6
|
+
ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
|
7
|
+
|
8
|
+
# REGEX
|
9
|
+
ASCII_ONLY = /[^\u{0000}-\u{007f}]/
|
10
|
+
NOISE_CHARACTERS = /[^\w\s\/-;:]/
|
11
|
+
DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
|
12
|
+
EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
|
13
|
+
|
14
|
+
attr_reader :text, :clear_text
|
15
|
+
|
16
|
+
def initialize(text)
|
17
|
+
@text = text
|
18
|
+
@clear_text = generate_clear_text || text || ''
|
19
|
+
end
|
20
|
+
|
21
|
+
def correct
|
22
|
+
Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
|
23
|
+
end
|
24
|
+
|
25
|
+
# This is a very naive way of determining if we should re-do OCR with
|
26
|
+
# shifted options
|
27
|
+
def valid_words?
|
28
|
+
word_size_average >= MINIMUM_WORD_LENGTH &&
|
29
|
+
spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
|
30
|
+
unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
|
31
|
+
end
|
32
|
+
|
33
|
+
def invalid_words?
|
34
|
+
!valid_words?
|
35
|
+
end
|
36
|
+
|
37
|
+
def word_count
|
38
|
+
return 0 if empty_text?
|
39
|
+
@_word_count ||= clear_words.size
|
40
|
+
end
|
41
|
+
|
42
|
+
def word_size_average
|
43
|
+
return 0 if empty_text?
|
44
|
+
@_word_size_average ||= clear_words.map(&:size).sum / word_count
|
45
|
+
end
|
46
|
+
|
47
|
+
# Assume English
|
48
|
+
def unidentified_word_count
|
49
|
+
clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
|
50
|
+
end
|
51
|
+
|
52
|
+
def spelling_error_count
|
53
|
+
Spellchecker.check(clear_text).count
|
54
|
+
end
|
55
|
+
|
56
|
+
def count_of_issues
|
57
|
+
spelling_error_count + unidentified_word_count
|
58
|
+
end
|
59
|
+
|
60
|
+
private
|
61
|
+
|
62
|
+
def empty_text?
|
63
|
+
clear_text.nil? || clear_text == ''
|
64
|
+
end
|
65
|
+
|
66
|
+
def clear_words
|
67
|
+
@clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
|
68
|
+
end
|
69
|
+
|
70
|
+
def generate_clear_text
|
71
|
+
remove_lines
|
72
|
+
&.gsub(ASCII_ONLY, '')
|
73
|
+
&.gsub(NOISE_CHARACTERS, '')
|
74
|
+
&.gsub(DUPLICATE_WORDS, '')
|
75
|
+
end
|
76
|
+
|
77
|
+
def remove_lines
|
78
|
+
text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
data/lib/ocr-file/version.rb
CHANGED
data/lib/ocr-file.rb
CHANGED
@@ -2,6 +2,7 @@ require 'hexapdf'
|
|
2
2
|
require 'hexapdf/cli/images'
|
3
3
|
require 'rtesseract'
|
4
4
|
require 'mini_magick'
|
5
|
+
require 'ruby-spellchecker'
|
5
6
|
|
6
7
|
require 'ocr-file/version'
|
7
8
|
|
@@ -10,6 +11,7 @@ require 'ocr-file/image_engines/image_magick'
|
|
10
11
|
require 'ocr-file/image_engines/pdftoppm'
|
11
12
|
require 'ocr-file/ocr_engines/tesseract'
|
12
13
|
require 'ocr-file/ocr_engines/cloud_vision'
|
14
|
+
require 'ocr-file/text_engines/result_processor'
|
13
15
|
require 'ocr-file/file_helpers'
|
14
16
|
require 'ocr-file/document'
|
15
17
|
require 'ocr-file/cli'
|
data/ocr-file.gemspec
CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_dependency "hexapdf", "~> 0.23.0"
|
33
33
|
spec.add_dependency "rtesseract", "~> 3.1.2"
|
34
34
|
spec.add_dependency "mini_magick", "~> 4.11.0"
|
35
|
+
spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
|
35
36
|
|
36
37
|
# Development Dependencies
|
37
38
|
spec.add_development_dependency "pry", "~> 0.14.1"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: console-style
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 4.11.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ruby-spellchecker
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.5
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.1.5
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: pry
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +136,7 @@ files:
|
|
122
136
|
- lib/ocr-file/image_engines/pdftoppm.rb
|
123
137
|
- lib/ocr-file/ocr_engines/cloud_vision.rb
|
124
138
|
- lib/ocr-file/ocr_engines/tesseract.rb
|
139
|
+
- lib/ocr-file/text_engines/result_processor.rb
|
125
140
|
- lib/ocr-file/version.rb
|
126
141
|
- ocr-file.gemspec
|
127
142
|
homepage: https://github.com/TRex22/ocr-file
|