ocr-file 0.0.4 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -2
- data/README.md +7 -0
- data/lib/ocr-file/document.rb +37 -8
- data/lib/ocr-file/image_engines/pdf_engine.rb +28 -1
- data/lib/ocr-file/text_engines/result_processor.rb +54 -6
- data/lib/ocr-file/version.rb +1 -1
- data/lib/ocr-file.rb +1 -0
- data/ocr-file.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9660e3d19c210789a7aeab56e63b002b507c12917edd1418202900d05647773b
|
4
|
+
data.tar.gz: 28861544f58374db141e5e3cc3ee8569201c4ffd611458e77774c98835e2f882
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 55f4266d6877a7f2c8a4175f5601930c9940aa1a8a06fc1d6d84faca455232173f2ae4be887dc9d20246cb224498a213b91871b66ee3fb3b02699399be33453a
|
7
|
+
data.tar.gz: e78263baffd1ae1ff1246f705250d788ae6b5c24b3d59d78e169b3ddf93e139ac9b9688cf9d954053a913a1c3bd6be54c40466d372f915c4430d55d93282f1bb
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
ocr-file (0.0.4)
|
4
|
+
ocr-file (0.0.6)
|
5
5
|
active_attr (~> 0.15.4)
|
6
6
|
console-style (~> 0.0.1)
|
7
7
|
hexapdf (~> 0.23.0)
|
8
8
|
mini_magick (~> 4.11.0)
|
9
9
|
rtesseract (~> 3.1.2)
|
10
|
+
ruby-spellchecker (~> 0.1.5)
|
10
11
|
|
11
12
|
GEM
|
12
13
|
remote: https://rubygems.org/
|
@@ -60,7 +61,7 @@ GEM
|
|
60
61
|
coderay (~> 1.1)
|
61
62
|
method_source (~> 1.0)
|
62
63
|
racc (1.6.0)
|
63
|
-
rack (2.2.
|
64
|
+
rack (2.2.4)
|
64
65
|
rack-test (1.1.0)
|
65
66
|
rack (>= 1.0, < 3)
|
66
67
|
rails-dom-testing (2.0.3)
|
@@ -69,6 +70,7 @@ GEM
|
|
69
70
|
rails-html-sanitizer (1.4.3)
|
70
71
|
loofah (~> 2.3)
|
71
72
|
rtesseract (3.1.2)
|
73
|
+
ruby-spellchecker (0.1.5)
|
72
74
|
tzinfo (2.0.4)
|
73
75
|
concurrent-ruby (~> 1.0)
|
74
76
|
|
data/README.md
CHANGED
@@ -49,6 +49,7 @@ You will need to install `tesseract` with your desired language on your system,
|
|
49
49
|
optimise_pdf: true,
|
50
50
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
51
51
|
temp_filename_prefix: 'image',
|
52
|
+
spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
|
52
53
|
# Console Output
|
53
54
|
verbose: true,
|
54
55
|
timing: true,
|
@@ -76,6 +77,7 @@ You will need to install `tesseract` with your desired language on your system,
|
|
76
77
|
doc.to_pdf
|
77
78
|
|
78
79
|
# How to merge files into a single PDF:
|
80
|
+
# The files can be images or other PDFs
|
79
81
|
filepaths = []
|
80
82
|
documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
|
81
83
|
merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
|
@@ -120,6 +122,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
120
122
|
- Improve console output
|
121
123
|
- Fix spaces in file names
|
122
124
|
- Better verbosity
|
125
|
+
- Docker
|
126
|
+
- pdftk / pdf merge for text and bookmarks etc ...
|
127
|
+
- https://github.com/tesseract-ocr/tesseract/issues/660
|
128
|
+
- tesseract -c naked_pdf=true
|
129
|
+
-
|
123
130
|
|
124
131
|
### Tests
|
125
132
|
To run tests execute:
|
data/lib/ocr-file/document.rb
CHANGED
@@ -29,6 +29,7 @@ module OcrFile
|
|
29
29
|
optimise_pdf: true,
|
30
30
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
31
31
|
temp_filename_prefix: 'image',
|
32
|
+
spelling_correction: true,
|
32
33
|
# Console Output
|
33
34
|
verbose: true,
|
34
35
|
timing: true,
|
@@ -75,7 +76,7 @@ module OcrFile
|
|
75
76
|
# Trigger OCR pipeline
|
76
77
|
def to_pdf
|
77
78
|
@start_time = Time.now
|
78
|
-
find_best_image_processing if config[:automatic_reprocess] && !text?
|
79
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
|
79
80
|
|
80
81
|
if pdf?
|
81
82
|
ocr_pdf_to_searchable_pdf
|
@@ -117,7 +118,7 @@ module OcrFile
|
|
117
118
|
end
|
118
119
|
|
119
120
|
def close
|
120
|
-
|
121
|
+
::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
|
121
122
|
end
|
122
123
|
|
123
124
|
private
|
@@ -171,6 +172,7 @@ module OcrFile
|
|
171
172
|
pdfs_to_merge = []
|
172
173
|
|
173
174
|
image_paths.each do |image_path|
|
175
|
+
puts image_path
|
174
176
|
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
175
177
|
end
|
176
178
|
|
@@ -182,6 +184,8 @@ module OcrFile
|
|
182
184
|
|
183
185
|
def text_to_pdf
|
184
186
|
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
187
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
188
|
+
|
185
189
|
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
186
190
|
|
187
191
|
OcrFile::ImageEngines::PdfEngine
|
@@ -189,7 +193,7 @@ module OcrFile
|
|
189
193
|
end
|
190
194
|
|
191
195
|
def ocr_image_to_pdf
|
192
|
-
find_best_image_processing if config[:automatic_reprocess]
|
196
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess]
|
193
197
|
|
194
198
|
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
|
195
199
|
OcrFile::ImageEngines::PdfEngine
|
@@ -203,7 +207,11 @@ module OcrFile
|
|
203
207
|
text = ''
|
204
208
|
|
205
209
|
image_paths.each do |image_path|
|
206
|
-
|
210
|
+
puts image_path
|
211
|
+
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
|
212
|
+
|
213
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
214
|
+
text = "#{text}#{PAGE_BREAK}#{text}"
|
207
215
|
end
|
208
216
|
|
209
217
|
if save
|
@@ -215,7 +223,9 @@ module OcrFile
|
|
215
223
|
|
216
224
|
def ocr_image_to_text(save:)
|
217
225
|
create_temp_folder
|
226
|
+
|
218
227
|
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
|
228
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
219
229
|
|
220
230
|
if save
|
221
231
|
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
|
@@ -225,7 +235,7 @@ module OcrFile
|
|
225
235
|
end
|
226
236
|
|
227
237
|
def ocr_file_to_text(save:)
|
228
|
-
if pdf?
|
238
|
+
if pdf?
|
229
239
|
ocr_pdf_to_text(save: save)
|
230
240
|
else # is an image
|
231
241
|
ocr_image_to_text(save: save)
|
@@ -233,15 +243,29 @@ module OcrFile
|
|
233
243
|
end
|
234
244
|
|
235
245
|
def find_best_image_processing(save:)
|
236
|
-
ocr_file_to_text(save: save)
|
246
|
+
ocr_file_to_text(save: save) unless config[:automatic_reprocess]
|
237
247
|
|
238
248
|
text = ''
|
249
|
+
best_text_count = 0
|
250
|
+
best_effects = config[:effects]
|
251
|
+
|
239
252
|
effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
|
240
253
|
effects_to_test.each do |effect|
|
241
|
-
|
254
|
+
text = test_ocr_settings(effect)
|
255
|
+
processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
|
242
256
|
|
257
|
+
if processed_result.count_of_issues < best_text_count
|
258
|
+
best_text_count = processed_result.count_of_issues
|
259
|
+
best_effects = config[:effects]
|
260
|
+
end
|
261
|
+
|
262
|
+
break if processed_result.valid_words?
|
263
|
+
end
|
264
|
+
|
265
|
+
# Fallback
|
266
|
+
if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
|
267
|
+
config[:effects] = best_effects
|
243
268
|
text = ocr_file_to_text(save: false)
|
244
|
-
break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
|
245
269
|
end
|
246
270
|
|
247
271
|
# Adds in extra operations which is unfortunately inefficient
|
@@ -252,6 +276,11 @@ module OcrFile
|
|
252
276
|
end
|
253
277
|
end
|
254
278
|
|
279
|
+
def test_ocr_settings(effect)
|
280
|
+
config[:effects] = config[:effects] - [effect]
|
281
|
+
ocr_file_to_text(save: false)
|
282
|
+
end
|
283
|
+
|
255
284
|
def print_time
|
256
285
|
puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
|
257
286
|
end
|
data/lib/ocr-file/image_engines/pdf_engine.rb
CHANGED
@@ -61,11 +61,38 @@ module OcrFile
|
|
61
61
|
image_paths
|
62
62
|
end
|
63
63
|
|
64
|
+
def insert_image(document, image_path)
|
65
|
+
canvas = document.pages.add.canvas
|
66
|
+
canvas.image(image_path, at: [0, 0], height: 700)
|
67
|
+
end
|
68
|
+
|
69
|
+
def combine(text, pdf_of_images)
|
70
|
+
return unless pdf_of_images.is_a?(::HexaPDF::Document)
|
71
|
+
|
72
|
+
if text.is_a?(::HexaPDF::Document)
|
73
|
+
pages_of_text = text.pages
|
74
|
+
else # Assume raw text with PAGE_BREAK
|
75
|
+
pages_of_text = text.split(PAGE_BREAK)
|
76
|
+
end
|
77
|
+
|
78
|
+
return unless pages_of_text.size == pdf_of_images.pages.size
|
79
|
+
|
80
|
+
if text.is_a?(::HexaPDF::Document) # Keep the page structure
|
81
|
+
|
82
|
+
else # Just text to embed
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
64
87
|
def merge(documents)
|
65
88
|
target = ::HexaPDF::Document.new
|
66
89
|
|
67
90
|
documents.each do |document|
|
68
|
-
document.
|
91
|
+
if document.is_a?(::HexaPDF::Document)
|
92
|
+
document.pages.each { |page| target.pages << target.import(page) }
|
93
|
+
else # Assume an image
|
94
|
+
insert_image(target, document)
|
95
|
+
end
|
69
96
|
end
|
70
97
|
|
71
98
|
target
|
data/lib/ocr-file/text_engines/result_processor.rb
CHANGED
@@ -1,33 +1,81 @@
|
|
1
1
|
module OcrFile
|
2
2
|
module TextEngines
|
3
3
|
class ResultProcessor
|
4
|
-
MINIMUM_WORD_LENGTH =
|
4
|
+
MINIMUM_WORD_LENGTH = 4
|
5
|
+
ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
|
6
|
+
ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
|
7
|
+
|
8
|
+
# REGEX
|
9
|
+
ASCII_ONLY = /[^\u{0000}-\u{007f}]/
|
10
|
+
NOISE_CHARACTERS = /[^\w\s\/-;:]/
|
11
|
+
DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
|
12
|
+
EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
|
5
13
|
|
6
14
|
attr_reader :text, :clear_text
|
7
15
|
|
8
16
|
def initialize(text)
|
9
17
|
@text = text
|
10
|
-
@clear_text =
|
18
|
+
@clear_text = generate_clear_text || text || ''
|
19
|
+
end
|
20
|
+
|
21
|
+
def correct
|
22
|
+
Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
|
11
23
|
end
|
12
24
|
|
13
25
|
# This is a very naive way of determining if we should re-do OCR with
|
14
26
|
# shifted options
|
15
27
|
def valid_words?
|
16
|
-
word_size_average >= MINIMUM_WORD_LENGTH
|
28
|
+
word_size_average >= MINIMUM_WORD_LENGTH &&
|
29
|
+
spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
|
30
|
+
unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
|
31
|
+
end
|
32
|
+
|
33
|
+
def invalid_words?
|
34
|
+
!valid_words?
|
17
35
|
end
|
18
36
|
|
19
37
|
def word_count
|
20
|
-
|
38
|
+
return 0 if empty_text?
|
39
|
+
@_word_count ||= clear_words.size
|
21
40
|
end
|
22
41
|
|
23
42
|
def word_size_average
|
24
|
-
|
43
|
+
return 0 if empty_text?
|
44
|
+
@_word_size_average ||= clear_words.map(&:size).sum / word_count
|
45
|
+
end
|
46
|
+
|
47
|
+
# Assume English
|
48
|
+
def unidentified_word_count
|
49
|
+
clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
|
50
|
+
end
|
51
|
+
|
52
|
+
def spelling_error_count
|
53
|
+
Spellchecker.check(clear_text).count
|
54
|
+
end
|
55
|
+
|
56
|
+
def count_of_issues
|
57
|
+
spelling_error_count + unidentified_word_count
|
25
58
|
end
|
26
59
|
|
27
60
|
private
|
28
61
|
|
62
|
+
def empty_text?
|
63
|
+
clear_text.nil? || clear_text == ''
|
64
|
+
end
|
65
|
+
|
66
|
+
def clear_words
|
67
|
+
@clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
|
68
|
+
end
|
69
|
+
|
70
|
+
def generate_clear_text
|
71
|
+
remove_lines
|
72
|
+
&.gsub(ASCII_ONLY, '')
|
73
|
+
&.gsub(NOISE_CHARACTERS, '')
|
74
|
+
&.gsub(DUPLICATE_WORDS, '')
|
75
|
+
end
|
76
|
+
|
29
77
|
def remove_lines
|
30
|
-
text
|
78
|
+
text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
|
31
79
|
end
|
32
80
|
end
|
33
81
|
end
|
data/lib/ocr-file/version.rb
CHANGED
data/lib/ocr-file.rb
CHANGED
data/ocr-file.gemspec
CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_dependency "hexapdf", "~> 0.23.0"
|
33
33
|
spec.add_dependency "rtesseract", "~> 3.1.2"
|
34
34
|
spec.add_dependency "mini_magick", "~> 4.11.0"
|
35
|
+
spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
|
35
36
|
|
36
37
|
# Development Dependencies
|
37
38
|
spec.add_development_dependency "pry", "~> 0.14.1"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: console-style
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 4.11.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ruby-spellchecker
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.5
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.1.5
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: pry
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|