ocr-file 0.0.4 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -2
- data/README.md +12 -4
- data/lib/ocr-file/document.rb +45 -10
- data/lib/ocr-file/image_engines/image_magick.rb +8 -1
- data/lib/ocr-file/image_engines/pdf_engine.rb +43 -1
- data/lib/ocr-file/text_engines/result_processor.rb +54 -6
- data/lib/ocr-file/version.rb +1 -1
- data/lib/ocr-file.rb +1 -0
- data/ocr-file.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9ae0f4940b34df3280221cf8b26d86ba3498f8344ef5f0e27ea335ca651a8906
|
4
|
+
data.tar.gz: 5e790899721d25bb0f4dc0e8e276b39b62bbb2803549fdbc8ba148804885bec0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6cd016ca7bba37866579cad59f01f41d190c0a191cd1ce27fa7037646da7bf4962923664c7b6295655936aed8714fac01b08301be65fdfef68403c8dd12c075b
|
7
|
+
data.tar.gz: f1581713a76e19f1b24d43f030cccbfb32b206bea8d1a5f07fed26fe4e0cfaa3f991c0c35b98bf1f222ca36b143e83700638ecf3b0520b9663d2fe4336cc5da2
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
ocr-file (0.0.4)
|
4
|
+
ocr-file (0.0.8)
|
5
5
|
active_attr (~> 0.15.4)
|
6
6
|
console-style (~> 0.0.1)
|
7
7
|
hexapdf (~> 0.23.0)
|
8
8
|
mini_magick (~> 4.11.0)
|
9
9
|
rtesseract (~> 3.1.2)
|
10
|
+
ruby-spellchecker (~> 0.1.5)
|
10
11
|
|
11
12
|
GEM
|
12
13
|
remote: https://rubygems.org/
|
@@ -60,7 +61,7 @@ GEM
|
|
60
61
|
coderay (~> 1.1)
|
61
62
|
method_source (~> 1.0)
|
62
63
|
racc (1.6.0)
|
63
|
-
rack (2.2.
|
64
|
+
rack (2.2.4)
|
64
65
|
rack-test (1.1.0)
|
65
66
|
rack (>= 1.0, < 3)
|
66
67
|
rails-dom-testing (2.0.3)
|
@@ -69,6 +70,7 @@ GEM
|
|
69
70
|
rails-html-sanitizer (1.4.3)
|
70
71
|
loofah (~> 2.3)
|
71
72
|
rtesseract (3.1.2)
|
73
|
+
ruby-spellchecker (0.1.5)
|
72
74
|
tzinfo (2.0.4)
|
73
75
|
concurrent-ruby (~> 1.0)
|
74
76
|
|
data/README.md
CHANGED
@@ -49,9 +49,11 @@ You will need to install `tesseract` with your desired language on your system,
|
|
49
49
|
optimise_pdf: true,
|
50
50
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
51
51
|
temp_filename_prefix: 'image',
|
52
|
+
spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
|
53
|
+
keep_files: false,
|
52
54
|
# Console Output
|
53
55
|
verbose: true,
|
54
|
-
timing: true
|
56
|
+
timing: true
|
55
57
|
}
|
56
58
|
|
57
59
|
doc = OcrFile::Document.new(
|
@@ -76,9 +78,10 @@ You will need to install `tesseract` with your desired language on your system,
|
|
76
78
|
doc.to_pdf
|
77
79
|
|
78
80
|
# How to merge files into a single PDF:
|
79
|
-
|
80
|
-
|
81
|
-
merged_document =
|
81
|
+
# The files can be images or other PDFs
|
82
|
+
file_paths = []
|
83
|
+
merged_document = ::HexaPDF::Document.new
|
84
|
+
documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.insert_image(merged_document, path) }
|
82
85
|
OcrFile::ImageEngines::PdfEngine.save_pdf(merged_document, save_file_path, optimise: true)
|
83
86
|
```
|
84
87
|
|
@@ -120,6 +123,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
120
123
|
- Improve console output
|
121
124
|
- Fix spaces in file names
|
122
125
|
- Better verbosity
|
126
|
+
- Docker
|
127
|
+
- pdftk / pdf merge for text and bookmarks etc ...
|
128
|
+
- https://github.com/tesseract-ocr/tesseract/issues/660
|
129
|
+
- tesseract -c naked_pdf=true
|
130
|
+
-
|
123
131
|
|
124
132
|
### Tests
|
125
133
|
To run tests execute:
|
data/lib/ocr-file/document.rb
CHANGED
@@ -29,9 +29,11 @@ module OcrFile
|
|
29
29
|
optimise_pdf: true,
|
30
30
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
31
31
|
temp_filename_prefix: 'image',
|
32
|
+
spelling_correction: true,
|
33
|
+
keep_files: false,
|
32
34
|
# Console Output
|
33
35
|
verbose: true,
|
34
|
-
timing: true
|
36
|
+
timing: true
|
35
37
|
}
|
36
38
|
|
37
39
|
attr_reader :original_file_path,
|
@@ -64,7 +66,7 @@ module OcrFile
|
|
64
66
|
|
65
67
|
def image?
|
66
68
|
return false if pdf?
|
67
|
-
ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
|
69
|
+
ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}") }
|
68
70
|
end
|
69
71
|
|
70
72
|
# Treat anything which isnt a PDF or image as text
|
@@ -75,7 +77,7 @@ module OcrFile
|
|
75
77
|
# Trigger OCR pipeline
|
76
78
|
def to_pdf
|
77
79
|
@start_time = Time.now
|
78
|
-
find_best_image_processing if config[:automatic_reprocess] && !text?
|
80
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
|
79
81
|
|
80
82
|
if pdf?
|
81
83
|
ocr_pdf_to_searchable_pdf
|
@@ -117,7 +119,8 @@ module OcrFile
|
|
117
119
|
end
|
118
120
|
|
119
121
|
def close
|
120
|
-
|
122
|
+
return if keep_files?
|
123
|
+
::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
|
121
124
|
end
|
122
125
|
|
123
126
|
private
|
@@ -141,6 +144,10 @@ module OcrFile
|
|
141
144
|
end
|
142
145
|
end
|
143
146
|
|
147
|
+
def keep_files?
|
148
|
+
config['keep_files']
|
149
|
+
end
|
150
|
+
|
144
151
|
def create_temp_folder
|
145
152
|
date = Time.now.to_s.split(' ').first
|
146
153
|
|
@@ -171,6 +178,7 @@ module OcrFile
|
|
171
178
|
pdfs_to_merge = []
|
172
179
|
|
173
180
|
image_paths.each do |image_path|
|
181
|
+
puts image_path
|
174
182
|
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
|
175
183
|
end
|
176
184
|
|
@@ -182,6 +190,8 @@ module OcrFile
|
|
182
190
|
|
183
191
|
def text_to_pdf
|
184
192
|
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
|
193
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
194
|
+
|
185
195
|
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
|
186
196
|
|
187
197
|
OcrFile::ImageEngines::PdfEngine
|
@@ -189,7 +199,7 @@ module OcrFile
|
|
189
199
|
end
|
190
200
|
|
191
201
|
def ocr_image_to_pdf
|
192
|
-
find_best_image_processing if config[:automatic_reprocess]
|
202
|
+
find_best_image_processing(save: false) if config[:automatic_reprocess]
|
193
203
|
|
194
204
|
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
|
195
205
|
OcrFile::ImageEngines::PdfEngine
|
@@ -203,7 +213,11 @@ module OcrFile
|
|
203
213
|
text = ''
|
204
214
|
|
205
215
|
image_paths.each do |image_path|
|
206
|
-
|
216
|
+
puts image_path
|
217
|
+
text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
|
218
|
+
|
219
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
220
|
+
text = "#{text}#{PAGE_BREAK}#{text}"
|
207
221
|
end
|
208
222
|
|
209
223
|
if save
|
@@ -215,7 +229,9 @@ module OcrFile
|
|
215
229
|
|
216
230
|
def ocr_image_to_text(save:)
|
217
231
|
create_temp_folder
|
232
|
+
|
218
233
|
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
|
234
|
+
text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
|
219
235
|
|
220
236
|
if save
|
221
237
|
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
|
@@ -225,7 +241,7 @@ module OcrFile
|
|
225
241
|
end
|
226
242
|
|
227
243
|
def ocr_file_to_text(save:)
|
228
|
-
if pdf?
|
244
|
+
if pdf?
|
229
245
|
ocr_pdf_to_text(save: save)
|
230
246
|
else # is an image
|
231
247
|
ocr_image_to_text(save: save)
|
@@ -233,15 +249,29 @@ module OcrFile
|
|
233
249
|
end
|
234
250
|
|
235
251
|
def find_best_image_processing(save:)
|
236
|
-
ocr_file_to_text(save: save)
|
252
|
+
ocr_file_to_text(save: save) unless config[:automatic_reprocess]
|
237
253
|
|
238
254
|
text = ''
|
255
|
+
best_text_count = 0
|
256
|
+
best_effects = config[:effects]
|
257
|
+
|
239
258
|
effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
|
240
259
|
effects_to_test.each do |effect|
|
241
|
-
|
260
|
+
text = test_ocr_settings(effect)
|
261
|
+
processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
|
262
|
+
|
263
|
+
if processed_result.count_of_issues < best_text_count
|
264
|
+
best_text_count = processed_result.count_of_issues
|
265
|
+
best_effects = config[:effects]
|
266
|
+
end
|
267
|
+
|
268
|
+
break if processed_result.valid_words?
|
269
|
+
end
|
242
270
|
|
271
|
+
# Fallback
|
272
|
+
if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
|
273
|
+
config[:effects] = best_effects
|
243
274
|
text = ocr_file_to_text(save: false)
|
244
|
-
break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
|
245
275
|
end
|
246
276
|
|
247
277
|
# Adds in extra operations which is unfortunately inefficient
|
@@ -252,6 +282,11 @@ module OcrFile
|
|
252
282
|
end
|
253
283
|
end
|
254
284
|
|
285
|
+
def test_ocr_settings(effect)
|
286
|
+
config[:effects] = config[:effects] - [effect]
|
287
|
+
ocr_file_to_text(save: false)
|
288
|
+
end
|
289
|
+
|
255
290
|
def print_time
|
256
291
|
puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
|
257
292
|
end
|
data/lib/ocr-file/image_engines/image_magick.rb
CHANGED
@@ -5,7 +5,7 @@ module OcrFile
|
|
5
5
|
# Conversion of image types
|
6
6
|
# Rotation and detection of skew
|
7
7
|
|
8
|
-
attr_reader :image_path, :image, :temp_path, :save_file_path, :config
|
8
|
+
attr_reader :image_path, :image, :temp_path, :save_file_path, :config, :width, :height
|
9
9
|
|
10
10
|
def initialize(image_path:, temp_path:, save_file_path:, config:)
|
11
11
|
@image_path = image_path
|
@@ -22,6 +22,9 @@ module OcrFile
|
|
22
22
|
# end
|
23
23
|
|
24
24
|
@image = MiniMagick::Image.open(image_path)
|
25
|
+
|
26
|
+
@width = @image[:width]
|
27
|
+
@height = @image[:height]
|
25
28
|
end
|
26
29
|
|
27
30
|
def convert!
|
@@ -39,6 +42,10 @@ module OcrFile
|
|
39
42
|
@save_file_path
|
40
43
|
end
|
41
44
|
|
45
|
+
def resize(width, height)
|
46
|
+
@image.resize("#{width}x#{height}")
|
47
|
+
end
|
48
|
+
|
42
49
|
# Effects
|
43
50
|
# http://www.imagemagick.org/script/command-line-options.php
|
44
51
|
def bw
|
data/lib/ocr-file/image_engines/pdf_engine.rb
CHANGED
@@ -61,11 +61,53 @@ module OcrFile
|
|
61
61
|
image_paths
|
62
62
|
end
|
63
63
|
|
64
|
+
def insert_image(document, image_path, dimensions: nil)
|
65
|
+
image_processor = OcrFile::ImageEngines::ImageMagick.new(
|
66
|
+
image_path: image_path,
|
67
|
+
temp_path: @temp_folder_path,
|
68
|
+
save_file_path: '',
|
69
|
+
config: @config
|
70
|
+
)
|
71
|
+
|
72
|
+
if dimensions
|
73
|
+
width = dimensions[0]
|
74
|
+
height = dimensions[1]
|
75
|
+
else
|
76
|
+
width = image_processor.width
|
77
|
+
height = image_processor.height
|
78
|
+
end
|
79
|
+
|
80
|
+
page = document.pages.add([0, 0, width, height])
|
81
|
+
page.canvas.image(@image || image_path, at: [0, 0], width: width, height: height)
|
82
|
+
end
|
83
|
+
|
84
|
+
def combine(text, pdf_of_images)
|
85
|
+
return unless pdf_of_images.is_a?(::HexaPDF::Document)
|
86
|
+
|
87
|
+
if text.is_a?(::HexaPDF::Document)
|
88
|
+
pages_of_text = text.pages
|
89
|
+
else # Assume raw text with PAGE_BREAK
|
90
|
+
pages_of_text = text.split(PAGE_BREAK)
|
91
|
+
end
|
92
|
+
|
93
|
+
return unless pages_of_text.size == pdf_of_images.pages.size
|
94
|
+
|
95
|
+
if text.is_a?(::HexaPDF::Document) # Keep the page structure
|
96
|
+
|
97
|
+
else # Just text to embed
|
98
|
+
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
64
102
|
def merge(documents)
|
65
103
|
target = ::HexaPDF::Document.new
|
66
104
|
|
67
105
|
documents.each do |document|
|
68
|
-
document.
|
106
|
+
if document.is_a?(::HexaPDF::Document)
|
107
|
+
document.pages.each { |page| target.pages << target.import(page) }
|
108
|
+
else # Assume an image
|
109
|
+
insert_image(target, document)
|
110
|
+
end
|
69
111
|
end
|
70
112
|
|
71
113
|
target
|
data/lib/ocr-file/text_engines/result_processor.rb
CHANGED
@@ -1,33 +1,81 @@
|
|
1
1
|
module OcrFile
|
2
2
|
module TextEngines
|
3
3
|
class ResultProcessor
|
4
|
-
MINIMUM_WORD_LENGTH =
|
4
|
+
MINIMUM_WORD_LENGTH = 4
|
5
|
+
ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
|
6
|
+
ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
|
7
|
+
|
8
|
+
# REGEX
|
9
|
+
ASCII_ONLY = /[^\u{0000}-\u{007f}]/
|
10
|
+
NOISE_CHARACTERS = /[^\w\s\/-;:]/
|
11
|
+
DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
|
12
|
+
EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
|
5
13
|
|
6
14
|
attr_reader :text, :clear_text
|
7
15
|
|
8
16
|
def initialize(text)
|
9
17
|
@text = text
|
10
|
-
@clear_text =
|
18
|
+
@clear_text = generate_clear_text || text || ''
|
19
|
+
end
|
20
|
+
|
21
|
+
def correct
|
22
|
+
Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
|
11
23
|
end
|
12
24
|
|
13
25
|
# This is a very naive way of determining if we should re-do OCR with
|
14
26
|
# shifted options
|
15
27
|
def valid_words?
|
16
|
-
word_size_average >= MINIMUM_WORD_LENGTH
|
28
|
+
word_size_average >= MINIMUM_WORD_LENGTH &&
|
29
|
+
spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
|
30
|
+
unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
|
31
|
+
end
|
32
|
+
|
33
|
+
def invalid_words?
|
34
|
+
!valid_words?
|
17
35
|
end
|
18
36
|
|
19
37
|
def word_count
|
20
|
-
|
38
|
+
return 0 if empty_text?
|
39
|
+
@_word_count ||= clear_words.size
|
21
40
|
end
|
22
41
|
|
23
42
|
def word_size_average
|
24
|
-
|
43
|
+
return 0 if empty_text?
|
44
|
+
@_word_size_average ||= clear_words.map(&:size).sum / word_count
|
45
|
+
end
|
46
|
+
|
47
|
+
# Assume English
|
48
|
+
def unidentified_word_count
|
49
|
+
clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
|
50
|
+
end
|
51
|
+
|
52
|
+
def spelling_error_count
|
53
|
+
Spellchecker.check(clear_text).count
|
54
|
+
end
|
55
|
+
|
56
|
+
def count_of_issues
|
57
|
+
spelling_error_count + unidentified_word_count
|
25
58
|
end
|
26
59
|
|
27
60
|
private
|
28
61
|
|
62
|
+
def empty_text?
|
63
|
+
clear_text.nil? || clear_text == ''
|
64
|
+
end
|
65
|
+
|
66
|
+
def clear_words
|
67
|
+
@clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
|
68
|
+
end
|
69
|
+
|
70
|
+
def generate_clear_text
|
71
|
+
remove_lines
|
72
|
+
&.gsub(ASCII_ONLY, '')
|
73
|
+
&.gsub(NOISE_CHARACTERS, '')
|
74
|
+
&.gsub(DUPLICATE_WORDS, '')
|
75
|
+
end
|
76
|
+
|
29
77
|
def remove_lines
|
30
|
-
text
|
78
|
+
text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
|
31
79
|
end
|
32
80
|
end
|
33
81
|
end
|
data/lib/ocr-file/version.rb
CHANGED
data/lib/ocr-file.rb
CHANGED
data/ocr-file.gemspec
CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_dependency "hexapdf", "~> 0.23.0"
|
33
33
|
spec.add_dependency "rtesseract", "~> 3.1.2"
|
34
34
|
spec.add_dependency "mini_magick", "~> 4.11.0"
|
35
|
+
spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
|
35
36
|
|
36
37
|
# Development Dependencies
|
37
38
|
spec.add_development_dependency "pry", "~> 0.14.1"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: console-style
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 4.11.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ruby-spellchecker
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.5
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.1.5
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: pry
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|