ocr-file 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -2
- data/README.md +7 -0
- data/lib/ocr-file/document.rb +37 -8
- data/lib/ocr-file/image_engines/pdf_engine.rb +28 -1
- data/lib/ocr-file/text_engines/result_processor.rb +54 -6
- data/lib/ocr-file/version.rb +1 -1
- data/lib/ocr-file.rb +1 -0
- data/ocr-file.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9660e3d19c210789a7aeab56e63b002b507c12917edd1418202900d05647773b
|
4
|
+
data.tar.gz: 28861544f58374db141e5e3cc3ee8569201c4ffd611458e77774c98835e2f882
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 55f4266d6877a7f2c8a4175f5601930c9940aa1a8a06fc1d6d84faca455232173f2ae4be887dc9d20246cb224498a213b91871b66ee3fb3b02699399be33453a
|
7
|
+
data.tar.gz: e78263baffd1ae1ff1246f705250d788ae6b5c24b3d59d78e169b3ddf93e139ac9b9688cf9d954053a913a1c3bd6be54c40466d372f915c4430d55d93282f1bb
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
ocr-file (0.0.4)
|
4
|
+
ocr-file (0.0.6)
|
5
5
|
active_attr (~> 0.15.4)
|
6
6
|
console-style (~> 0.0.1)
|
7
7
|
hexapdf (~> 0.23.0)
|
8
8
|
mini_magick (~> 4.11.0)
|
9
9
|
rtesseract (~> 3.1.2)
|
10
|
+
ruby-spellchecker (~> 0.1.5)
|
10
11
|
|
11
12
|
GEM
|
12
13
|
remote: https://rubygems.org/
|
@@ -60,7 +61,7 @@ GEM
|
|
60
61
|
coderay (~> 1.1)
|
61
62
|
method_source (~> 1.0)
|
62
63
|
racc (1.6.0)
|
63
|
-
rack (2.2.
|
64
|
+
rack (2.2.4)
|
64
65
|
rack-test (1.1.0)
|
65
66
|
rack (>= 1.0, < 3)
|
66
67
|
rails-dom-testing (2.0.3)
|
@@ -69,6 +70,7 @@ GEM
|
|
69
70
|
rails-html-sanitizer (1.4.3)
|
70
71
|
loofah (~> 2.3)
|
71
72
|
rtesseract (3.1.2)
|
73
|
+
ruby-spellchecker (0.1.5)
|
72
74
|
tzinfo (2.0.4)
|
73
75
|
concurrent-ruby (~> 1.0)
|
74
76
|
|
data/README.md
CHANGED
@@ -49,6 +49,7 @@ You will need to install `tesseract` with your desired language on your system,
|
|
49
49
|
optimise_pdf: true,
|
50
50
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
51
51
|
temp_filename_prefix: 'image',
|
52
|
+
spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
|
52
53
|
# Console Output
|
53
54
|
verbose: true,
|
54
55
|
timing: true,
|
@@ -76,6 +77,7 @@ You will need to install `tesseract` with your desired language on your system,
|
|
76
77
|
doc.to_pdf
|
77
78
|
|
78
79
|
# How to merge files into a single PDF:
|
80
|
+
# The files can be images or other PDFs
|
79
81
|
filepaths = []
|
80
82
|
documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
|
81
83
|
merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
|
@@ -120,6 +122,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
120
122
|
- Improve console output
|
121
123
|
- Fix spaces in file names
|
122
124
|
- Better verbosity
|
125
|
+
- Docker
|
126
|
+
- pdftk / pdf merge for text and bookmarks etc ...
|
127
|
+
- https://github.com/tesseract-ocr/tesseract/issues/660
|
128
|
+
- tesseract -c naked_pdf=true
|
129
|
+
-
|
123
130
|
|
124
131
|
### Tests
|
125
132
|
To run tests execute:
|
data/lib/ocr-file/document.rb
CHANGED
@@ -29,6 +29,7 @@ module OcrFile
|
|
29
29
|
optimise_pdf: true,
|
30
30
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
31
31
|
temp_filename_prefix: 'image',
|
32
|
+
spelling_correction: true,
|
32
33
|
# Console Output
|
33
34
|
verbose: true,
|
34
35
|
timing: true,
|
@@ -75,7 +76,7 @@ module OcrFile
|
|
75
76
|
# Trigger OCR pipeline
|
76
77
|
def to_pdf
|
77
78
|
@start_time = Time.now
|
78
|
-
find_best_image_processing if config[:automatic_reprocess] && !text?
|
79
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
|
79
80
|
|
80
81
|
if pdf?
|
81
82
|
ocr_pdf_to_searchable_pdf
|
@@ -117,7 +118,7 @@ module OcrFile
|
|
117
118
|
end
|
118
119
|
|
119
120
|
def close
|
120
|
-
|
121
|
+
::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
|
121
122
|
end
|
122
123
|
|
123
124
|
private
|
@@ -171,6 +172,7 @@ module OcrFile
|
|
171
172
|
pdfs_to_merge = []
|
172
173
|
|
173
174
|
image_paths.each do |image_path|
|
175
|
+
puts image_path
|
174
176
|
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
175
177
|
end
|
176
178
|
|
@@ -182,6 +184,8 @@ module OcrFile
|
|
182
184
|
|
183
185
|
def text_to_pdf
|
184
186
|
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
187
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
188
|
+
|
185
189
|
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
186
190
|
|
187
191
|
OcrFile::ImageEngines::PdfEngine
|
@@ -189,7 +193,7 @@ module OcrFile
|
|
189
193
|
end
|
190
194
|
|
191
195
|
def ocr_image_to_pdf
|
192
|
-
find_best_image_processing if config[:automatic_reprocess]
|
196
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess]
|
193
197
|
|
194
198
|
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
|
195
199
|
OcrFile::ImageEngines::PdfEngine
|
@@ -203,7 +207,11 @@ module OcrFile
|
|
203
207
|
text = ''
|
204
208
|
|
205
209
|
image_paths.each do |image_path|
|
206
|
-
|
210
|
+
puts image_path
|
211
|
+
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
|
212
|
+
|
213
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
214
|
+
text = "#{text}#{PAGE_BREAK}#{text}"
|
207
215
|
end
|
208
216
|
|
209
217
|
if save
|
@@ -215,7 +223,9 @@ module OcrFile
|
|
215
223
|
|
216
224
|
def ocr_image_to_text(save:)
|
217
225
|
create_temp_folder
|
226
|
+
|
218
227
|
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
|
228
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
219
229
|
|
220
230
|
if save
|
221
231
|
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
|
@@ -225,7 +235,7 @@ module OcrFile
|
|
225
235
|
end
|
226
236
|
|
227
237
|
def ocr_file_to_text(save:)
|
228
|
-
if pdf?
|
238
|
+
if pdf?
|
229
239
|
ocr_pdf_to_text(save: save)
|
230
240
|
else # is an image
|
231
241
|
ocr_image_to_text(save: save)
|
@@ -233,15 +243,29 @@ module OcrFile
|
|
233
243
|
end
|
234
244
|
|
235
245
|
def find_best_image_processing(save:)
|
236
|
-
ocr_file_to_text(save: save)
|
246
|
+
ocr_file_to_text(save: save) unless config[:automatic_reprocess]
|
237
247
|
|
238
248
|
text = ''
|
249
|
+
best_text_count = 0
|
250
|
+
best_effects = config[:effects]
|
251
|
+
|
239
252
|
effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
|
240
253
|
effects_to_test.each do |effect|
|
241
|
-
|
254
|
+
text = test_ocr_settings(effect)
|
255
|
+
processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
|
242
256
|
|
257
|
+
if processed_result.count_of_issues < best_text_count
|
258
|
+
best_text_count = processed_result.count_of_issues
|
259
|
+
best_effects = config[:effects]
|
260
|
+
end
|
261
|
+
|
262
|
+
break if processed_result.valid_words?
|
263
|
+
end
|
264
|
+
|
265
|
+
# Fallback
|
266
|
+
if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
|
267
|
+
config[:effects] = best_effects
|
243
268
|
text = ocr_file_to_text(save: false)
|
244
|
-
break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
|
245
269
|
end
|
246
270
|
|
247
271
|
# Adds in extra operations which is unfortunately inefficient
|
@@ -252,6 +276,11 @@ module OcrFile
|
|
252
276
|
end
|
253
277
|
end
|
254
278
|
|
279
|
+
def test_ocr_settings(effect)
|
280
|
+
config[:effects] = config[:effects] - [effect]
|
281
|
+
ocr_file_to_text(save: false)
|
282
|
+
end
|
283
|
+
|
255
284
|
def print_time
|
256
285
|
puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
|
257
286
|
end
|
@@ -61,11 +61,38 @@ module OcrFile
|
|
61
61
|
image_paths
|
62
62
|
end
|
63
63
|
|
64
|
+
def insert_image(document, image_path)
|
65
|
+
canvas = document.pages.add.canvas
|
66
|
+
canvas.image(image_path, at: [0, 0], height: 700)
|
67
|
+
end
|
68
|
+
|
69
|
+
def combine(text, pdf_of_images)
|
70
|
+
return unless pdf_of_images.is_a?(::HexaPDF::Document)
|
71
|
+
|
72
|
+
if text.is_a?(::HexaPDF::Document)
|
73
|
+
pages_of_text = text.pages
|
74
|
+
else # Assume raw text with PAGE_BREAK
|
75
|
+
pages_of_text = text.split(PAGE_BREAK)
|
76
|
+
end
|
77
|
+
|
78
|
+
return unless pages_of_text.size == pdf_of_images.pages.size
|
79
|
+
|
80
|
+
if text.is_a?(::HexaPDF::Document) # Keep the page structure
|
81
|
+
|
82
|
+
else # Just text to embed
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
64
87
|
def merge(documents)
|
65
88
|
target = ::HexaPDF::Document.new
|
66
89
|
|
67
90
|
documents.each do |document|
|
68
|
-
document.
|
91
|
+
if document.is_a?(::HexaPDF::Document)
|
92
|
+
document.pages.each { |page| target.pages << target.import(page) }
|
93
|
+
else # Assume an image
|
94
|
+
insert_image(target, document)
|
95
|
+
end
|
69
96
|
end
|
70
97
|
|
71
98
|
target
|
@@ -1,33 +1,81 @@
|
|
1
1
|
module OcrFile
|
2
2
|
module TextEngines
|
3
3
|
class ResultProcessor
|
4
|
-
MINIMUM_WORD_LENGTH =
|
4
|
+
MINIMUM_WORD_LENGTH = 4
|
5
|
+
ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
|
6
|
+
ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
|
7
|
+
|
8
|
+
# REGEX
|
9
|
+
ASCII_ONLY = /[^\u{0000}-\u{007f}]/
|
10
|
+
NOISE_CHARACTERS = /[^\w\s\/-;:]/
|
11
|
+
DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
|
12
|
+
EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
|
5
13
|
|
6
14
|
attr_reader :text, :clear_text
|
7
15
|
|
8
16
|
def initialize(text)
|
9
17
|
@text = text
|
10
|
-
@clear_text =
|
18
|
+
@clear_text = generate_clear_text || text || ''
|
19
|
+
end
|
20
|
+
|
21
|
+
def correct
|
22
|
+
Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
|
11
23
|
end
|
12
24
|
|
13
25
|
# This is a very naive way of determining if we should re-do OCR with
|
14
26
|
# shifted options
|
15
27
|
def valid_words?
|
16
|
-
word_size_average >= MINIMUM_WORD_LENGTH
|
28
|
+
word_size_average >= MINIMUM_WORD_LENGTH &&
|
29
|
+
spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
|
30
|
+
unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
|
31
|
+
end
|
32
|
+
|
33
|
+
def invalid_words?
|
34
|
+
!valid_words?
|
17
35
|
end
|
18
36
|
|
19
37
|
def word_count
|
20
|
-
|
38
|
+
return 0 if empty_text?
|
39
|
+
@_word_count ||= clear_words.size
|
21
40
|
end
|
22
41
|
|
23
42
|
def word_size_average
|
24
|
-
|
43
|
+
return 0 if empty_text?
|
44
|
+
@_word_size_average ||= clear_words.map(&:size).sum / word_count
|
45
|
+
end
|
46
|
+
|
47
|
+
# Assume English
|
48
|
+
def unidentified_word_count
|
49
|
+
clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
|
50
|
+
end
|
51
|
+
|
52
|
+
def spelling_error_count
|
53
|
+
Spellchecker.check(clear_text).count
|
54
|
+
end
|
55
|
+
|
56
|
+
def count_of_issues
|
57
|
+
spelling_error_count + unidentified_word_count
|
25
58
|
end
|
26
59
|
|
27
60
|
private
|
28
61
|
|
62
|
+
def empty_text?
|
63
|
+
clear_text.nil? || clear_text == ''
|
64
|
+
end
|
65
|
+
|
66
|
+
def clear_words
|
67
|
+
@clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
|
68
|
+
end
|
69
|
+
|
70
|
+
def generate_clear_text
|
71
|
+
remove_lines
|
72
|
+
&.gsub(ASCII_ONLY, '')
|
73
|
+
&.gsub(NOISE_CHARACTERS, '')
|
74
|
+
&.gsub(DUPLICATE_WORDS, '')
|
75
|
+
end
|
76
|
+
|
29
77
|
def remove_lines
|
30
|
-
text
|
78
|
+
text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
|
31
79
|
end
|
32
80
|
end
|
33
81
|
end
|
data/lib/ocr-file/version.rb
CHANGED
data/lib/ocr-file.rb
CHANGED
data/ocr-file.gemspec
CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_dependency "hexapdf", "~> 0.23.0"
|
33
33
|
spec.add_dependency "rtesseract", "~> 3.1.2"
|
34
34
|
spec.add_dependency "mini_magick", "~> 4.11.0"
|
35
|
+
spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
|
35
36
|
|
36
37
|
# Development Dependencies
|
37
38
|
spec.add_development_dependency "pry", "~> 0.14.1"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: console-style
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 4.11.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ruby-spellchecker
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.5
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.1.5
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: pry
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|