ocr-file 0.0.4 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -2
- data/README.md +12 -4
- data/lib/ocr-file/document.rb +45 -10
- data/lib/ocr-file/image_engines/image_magick.rb +8 -1
- data/lib/ocr-file/image_engines/pdf_engine.rb +43 -1
- data/lib/ocr-file/text_engines/result_processor.rb +54 -6
- data/lib/ocr-file/version.rb +1 -1
- data/lib/ocr-file.rb +1 -0
- data/ocr-file.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ae0f4940b34df3280221cf8b26d86ba3498f8344ef5f0e27ea335ca651a8906
|
4
|
+
data.tar.gz: 5e790899721d25bb0f4dc0e8e276b39b62bbb2803549fdbc8ba148804885bec0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6cd016ca7bba37866579cad59f01f41d190c0a191cd1ce27fa7037646da7bf4962923664c7b6295655936aed8714fac01b08301be65fdfef68403c8dd12c075b
|
7
|
+
data.tar.gz: f1581713a76e19f1b24d43f030cccbfb32b206bea8d1a5f07fed26fe4e0cfaa3f991c0c35b98bf1f222ca36b143e83700638ecf3b0520b9663d2fe4336cc5da2
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
ocr-file (0.0.4)
|
4
|
+
ocr-file (0.0.8)
|
5
5
|
active_attr (~> 0.15.4)
|
6
6
|
console-style (~> 0.0.1)
|
7
7
|
hexapdf (~> 0.23.0)
|
8
8
|
mini_magick (~> 4.11.0)
|
9
9
|
rtesseract (~> 3.1.2)
|
10
|
+
ruby-spellchecker (~> 0.1.5)
|
10
11
|
|
11
12
|
GEM
|
12
13
|
remote: https://rubygems.org/
|
@@ -60,7 +61,7 @@ GEM
|
|
60
61
|
coderay (~> 1.1)
|
61
62
|
method_source (~> 1.0)
|
62
63
|
racc (1.6.0)
|
63
|
-
rack (2.2.
|
64
|
+
rack (2.2.4)
|
64
65
|
rack-test (1.1.0)
|
65
66
|
rack (>= 1.0, < 3)
|
66
67
|
rails-dom-testing (2.0.3)
|
@@ -69,6 +70,7 @@ GEM
|
|
69
70
|
rails-html-sanitizer (1.4.3)
|
70
71
|
loofah (~> 2.3)
|
71
72
|
rtesseract (3.1.2)
|
73
|
+
ruby-spellchecker (0.1.5)
|
72
74
|
tzinfo (2.0.4)
|
73
75
|
concurrent-ruby (~> 1.0)
|
74
76
|
|
data/README.md
CHANGED
@@ -49,9 +49,11 @@ You will need to install `tesseract` with your desired language on your system,
|
|
49
49
|
optimise_pdf: true,
|
50
50
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
51
51
|
temp_filename_prefix: 'image',
|
52
|
+
spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
|
53
|
+
keep_files: false,
|
52
54
|
# Console Output
|
53
55
|
verbose: true,
|
54
|
-
timing: true
|
56
|
+
timing: true
|
55
57
|
}
|
56
58
|
|
57
59
|
doc = OcrFile::Document.new(
|
@@ -76,9 +78,10 @@ You will need to install `tesseract` with your desired language on your system,
|
|
76
78
|
doc.to_pdf
|
77
79
|
|
78
80
|
# How to merge files into a single PDF:
|
79
|
-
|
80
|
-
|
81
|
-
merged_document =
|
81
|
+
# The files can be images or other PDFs
|
82
|
+
file_paths = []
|
83
|
+
merged_document = ::HexaPDF::Document.new
|
84
|
+
documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.insert_image(merged_document, path) }
|
82
85
|
OcrFile::ImageEngines::PdfEngine.save_pdf(merged_document, save_file_path, optimise: true)
|
83
86
|
```
|
84
87
|
|
@@ -120,6 +123,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
120
123
|
- Improve console output
|
121
124
|
- Fix spaces in file names
|
122
125
|
- Better verbosity
|
126
|
+
- Docker
|
127
|
+
- pdftk / pdf merge for text and bookmarks etc ...
|
128
|
+
- https://github.com/tesseract-ocr/tesseract/issues/660
|
129
|
+
- tesseract -c naked_pdf=true
|
130
|
+
-
|
123
131
|
|
124
132
|
### Tests
|
125
133
|
To run tests execute:
|
data/lib/ocr-file/document.rb
CHANGED
@@ -29,9 +29,11 @@ module OcrFile
|
|
29
29
|
optimise_pdf: true,
|
30
30
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
31
31
|
temp_filename_prefix: 'image',
|
32
|
+
spelling_correction: true,
|
33
|
+
keep_files: false,
|
32
34
|
# Console Output
|
33
35
|
verbose: true,
|
34
|
-
timing: true
|
36
|
+
timing: true
|
35
37
|
}
|
36
38
|
|
37
39
|
attr_reader :original_file_path,
|
@@ -64,7 +66,7 @@ module OcrFile
|
|
64
66
|
|
65
67
|
def image?
|
66
68
|
return false if pdf?
|
67
|
-
ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
|
69
|
+
ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}") }
|
68
70
|
end
|
69
71
|
|
70
72
|
# Treat anything which isnt a PDF or image as text
|
@@ -75,7 +77,7 @@ module OcrFile
|
|
75
77
|
# Trigger OCR pipeline
|
76
78
|
def to_pdf
|
77
79
|
@start_time = Time.now
|
78
|
-
find_best_image_processing if config[:automatic_reprocess] && !text?
|
80
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
|
79
81
|
|
80
82
|
if pdf?
|
81
83
|
ocr_pdf_to_searchable_pdf
|
@@ -117,7 +119,8 @@ module OcrFile
|
|
117
119
|
end
|
118
120
|
|
119
121
|
def close
|
120
|
-
|
122
|
+
return if keep_files?
|
123
|
+
::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
|
121
124
|
end
|
122
125
|
|
123
126
|
private
|
@@ -141,6 +144,10 @@ module OcrFile
|
|
141
144
|
end
|
142
145
|
end
|
143
146
|
|
147
|
+
def keep_files?
|
148
|
+
config['keep_files']
|
149
|
+
end
|
150
|
+
|
144
151
|
def create_temp_folder
|
145
152
|
date = Time.now.to_s.split(' ').first
|
146
153
|
|
@@ -171,6 +178,7 @@ module OcrFile
|
|
171
178
|
pdfs_to_merge = []
|
172
179
|
|
173
180
|
image_paths.each do |image_path|
|
181
|
+
puts image_path
|
174
182
|
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
175
183
|
end
|
176
184
|
|
@@ -182,6 +190,8 @@ module OcrFile
|
|
182
190
|
|
183
191
|
def text_to_pdf
|
184
192
|
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
193
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
194
|
+
|
185
195
|
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
186
196
|
|
187
197
|
OcrFile::ImageEngines::PdfEngine
|
@@ -189,7 +199,7 @@ module OcrFile
|
|
189
199
|
end
|
190
200
|
|
191
201
|
def ocr_image_to_pdf
|
192
|
-
find_best_image_processing if config[:automatic_reprocess]
|
202
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess]
|
193
203
|
|
194
204
|
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
|
195
205
|
OcrFile::ImageEngines::PdfEngine
|
@@ -203,7 +213,11 @@ module OcrFile
|
|
203
213
|
text = ''
|
204
214
|
|
205
215
|
image_paths.each do |image_path|
|
206
|
-
|
216
|
+
puts image_path
|
217
|
+
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
|
218
|
+
|
219
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
220
|
+
text = "#{text}#{PAGE_BREAK}#{text}"
|
207
221
|
end
|
208
222
|
|
209
223
|
if save
|
@@ -215,7 +229,9 @@ module OcrFile
|
|
215
229
|
|
216
230
|
def ocr_image_to_text(save:)
|
217
231
|
create_temp_folder
|
232
|
+
|
218
233
|
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
|
234
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
219
235
|
|
220
236
|
if save
|
221
237
|
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
|
@@ -225,7 +241,7 @@ module OcrFile
|
|
225
241
|
end
|
226
242
|
|
227
243
|
def ocr_file_to_text(save:)
|
228
|
-
if pdf?
|
244
|
+
if pdf?
|
229
245
|
ocr_pdf_to_text(save: save)
|
230
246
|
else # is an image
|
231
247
|
ocr_image_to_text(save: save)
|
@@ -233,15 +249,29 @@ module OcrFile
|
|
233
249
|
end
|
234
250
|
|
235
251
|
def find_best_image_processing(save:)
|
236
|
-
ocr_file_to_text(save: save)
|
252
|
+
ocr_file_to_text(save: save) unless config[:automatic_reprocess]
|
237
253
|
|
238
254
|
text = ''
|
255
|
+
best_text_count = 0
|
256
|
+
best_effects = config[:effects]
|
257
|
+
|
239
258
|
effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
|
240
259
|
effects_to_test.each do |effect|
|
241
|
-
|
260
|
+
text = test_ocr_settings(effect)
|
261
|
+
processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
|
262
|
+
|
263
|
+
if processed_result.count_of_issues < best_text_count
|
264
|
+
best_text_count = processed_result.count_of_issues
|
265
|
+
best_effects = config[:effects]
|
266
|
+
end
|
267
|
+
|
268
|
+
break if processed_result.valid_words?
|
269
|
+
end
|
242
270
|
|
271
|
+
# Fallback
|
272
|
+
if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
|
273
|
+
config[:effects] = best_effects
|
243
274
|
text = ocr_file_to_text(save: false)
|
244
|
-
break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
|
245
275
|
end
|
246
276
|
|
247
277
|
# Adds in extra operations which is unfortunately inefficient
|
@@ -252,6 +282,11 @@ module OcrFile
|
|
252
282
|
end
|
253
283
|
end
|
254
284
|
|
285
|
+
def test_ocr_settings(effect)
|
286
|
+
config[:effects] = config[:effects] - [effect]
|
287
|
+
ocr_file_to_text(save: false)
|
288
|
+
end
|
289
|
+
|
255
290
|
def print_time
|
256
291
|
puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
|
257
292
|
end
|
@@ -5,7 +5,7 @@ module OcrFile
|
|
5
5
|
# Conversion of image types
|
6
6
|
# Rotation and detection of skew
|
7
7
|
|
8
|
-
attr_reader :image_path, :image, :temp_path, :save_file_path, :config
|
8
|
+
attr_reader :image_path, :image, :temp_path, :save_file_path, :config, :width, :height
|
9
9
|
|
10
10
|
def initialize(image_path:, temp_path:, save_file_path:, config:)
|
11
11
|
@image_path = image_path
|
@@ -22,6 +22,9 @@ module OcrFile
|
|
22
22
|
# end
|
23
23
|
|
24
24
|
@image = MiniMagick::Image.open(image_path)
|
25
|
+
|
26
|
+
@width = @image[:width]
|
27
|
+
@height = @image[:height]
|
25
28
|
end
|
26
29
|
|
27
30
|
def convert!
|
@@ -39,6 +42,10 @@ module OcrFile
|
|
39
42
|
@save_file_path
|
40
43
|
end
|
41
44
|
|
45
|
+
def resize(width, height)
|
46
|
+
@image.resize("#{width}x#{height}")
|
47
|
+
end
|
48
|
+
|
42
49
|
# Effects
|
43
50
|
# http://www.imagemagick.org/script/command-line-options.php
|
44
51
|
def bw
|
@@ -61,11 +61,53 @@ module OcrFile
|
|
61
61
|
image_paths
|
62
62
|
end
|
63
63
|
|
64
|
+
def insert_image(document, image_path, dimensions: nil)
|
65
|
+
image_processor = OcrFile::ImageEngines::ImageMagick.new(
|
66
|
+
image_path: image_path,
|
67
|
+
temp_path: @temp_folder_path,
|
68
|
+
save_file_path: '',
|
69
|
+
config: @config
|
70
|
+
)
|
71
|
+
|
72
|
+
if dimensions
|
73
|
+
width = dimensions[0]
|
74
|
+
height = dimensions[1]
|
75
|
+
else
|
76
|
+
width = image_processor.width
|
77
|
+
height = image_processor.height
|
78
|
+
end
|
79
|
+
|
80
|
+
page = document.pages.add([0, 0, width, height])
|
81
|
+
page.canvas.image(@image || image_path, at: [0, 0], width: width, height: height)
|
82
|
+
end
|
83
|
+
|
84
|
+
def combine(text, pdf_of_images)
|
85
|
+
return unless pdf_of_images.is_a?(::HexaPDF::Document)
|
86
|
+
|
87
|
+
if text.is_a?(::HexaPDF::Document)
|
88
|
+
pages_of_text = text.pages
|
89
|
+
else # Assume raw text with PAGE_BREAK
|
90
|
+
pages_of_text = text.split(PAGE_BREAK)
|
91
|
+
end
|
92
|
+
|
93
|
+
return unless pages_of_text.size == pdf_of_images.pages.size
|
94
|
+
|
95
|
+
if text.is_a?(::HexaPDF::Document) # Keep the page structure
|
96
|
+
|
97
|
+
else # Just text to embed
|
98
|
+
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
64
102
|
def merge(documents)
|
65
103
|
target = ::HexaPDF::Document.new
|
66
104
|
|
67
105
|
documents.each do |document|
|
68
|
-
document.
|
106
|
+
if document.is_a?(::HexaPDF::Document)
|
107
|
+
document.pages.each { |page| target.pages << target.import(page) }
|
108
|
+
else # Assume an image
|
109
|
+
insert_image(target, document)
|
110
|
+
end
|
69
111
|
end
|
70
112
|
|
71
113
|
target
|
@@ -1,33 +1,81 @@
|
|
1
1
|
module OcrFile
|
2
2
|
module TextEngines
|
3
3
|
class ResultProcessor
|
4
|
-
MINIMUM_WORD_LENGTH =
|
4
|
+
MINIMUM_WORD_LENGTH = 4
|
5
|
+
ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
|
6
|
+
ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
|
7
|
+
|
8
|
+
# REGEX
|
9
|
+
ASCII_ONLY = /[^\u{0000}-\u{007f}]/
|
10
|
+
NOISE_CHARACTERS = /[^\w\s\/-;:]/
|
11
|
+
DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
|
12
|
+
EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
|
5
13
|
|
6
14
|
attr_reader :text, :clear_text
|
7
15
|
|
8
16
|
def initialize(text)
|
9
17
|
@text = text
|
10
|
-
@clear_text =
|
18
|
+
@clear_text = generate_clear_text || text || ''
|
19
|
+
end
|
20
|
+
|
21
|
+
def correct
|
22
|
+
Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
|
11
23
|
end
|
12
24
|
|
13
25
|
# This is a very naive way of determining if we should re-do OCR with
|
14
26
|
# shifted options
|
15
27
|
def valid_words?
|
16
|
-
word_size_average >= MINIMUM_WORD_LENGTH
|
28
|
+
word_size_average >= MINIMUM_WORD_LENGTH &&
|
29
|
+
spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
|
30
|
+
unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
|
31
|
+
end
|
32
|
+
|
33
|
+
def invalid_words?
|
34
|
+
!valid_words?
|
17
35
|
end
|
18
36
|
|
19
37
|
def word_count
|
20
|
-
|
38
|
+
return 0 if empty_text?
|
39
|
+
@_word_count ||= clear_words.size
|
21
40
|
end
|
22
41
|
|
23
42
|
def word_size_average
|
24
|
-
|
43
|
+
return 0 if empty_text?
|
44
|
+
@_word_size_average ||= clear_words.map(&:size).sum / word_count
|
45
|
+
end
|
46
|
+
|
47
|
+
# Assume English
|
48
|
+
def unidentified_word_count
|
49
|
+
clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
|
50
|
+
end
|
51
|
+
|
52
|
+
def spelling_error_count
|
53
|
+
Spellchecker.check(clear_text).count
|
54
|
+
end
|
55
|
+
|
56
|
+
def count_of_issues
|
57
|
+
spelling_error_count + unidentified_word_count
|
25
58
|
end
|
26
59
|
|
27
60
|
private
|
28
61
|
|
62
|
+
def empty_text?
|
63
|
+
clear_text.nil? || clear_text == ''
|
64
|
+
end
|
65
|
+
|
66
|
+
def clear_words
|
67
|
+
@clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
|
68
|
+
end
|
69
|
+
|
70
|
+
def generate_clear_text
|
71
|
+
remove_lines
|
72
|
+
&.gsub(ASCII_ONLY, '')
|
73
|
+
&.gsub(NOISE_CHARACTERS, '')
|
74
|
+
&.gsub(DUPLICATE_WORDS, '')
|
75
|
+
end
|
76
|
+
|
29
77
|
def remove_lines
|
30
|
-
text
|
78
|
+
text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
|
31
79
|
end
|
32
80
|
end
|
33
81
|
end
|
data/lib/ocr-file/version.rb
CHANGED
data/lib/ocr-file.rb
CHANGED
data/ocr-file.gemspec
CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_dependency "hexapdf", "~> 0.23.0"
|
33
33
|
spec.add_dependency "rtesseract", "~> 3.1.2"
|
34
34
|
spec.add_dependency "mini_magick", "~> 4.11.0"
|
35
|
+
spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
|
35
36
|
|
36
37
|
# Development Dependencies
|
37
38
|
spec.add_development_dependency "pry", "~> 0.14.1"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: console-style
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 4.11.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ruby-spellchecker
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.5
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.1.5
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: pry
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|