ocr-file 0.0.3 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0e67553a31e82eba190368040d3475b812e113aedfb9994484043dda34a55053
4
- data.tar.gz: 6fe5e142fef4387fc98fce57d3fdb2b7a0c37199d1712bd1d85dced9a0e61274
3
+ metadata.gz: 8b87806d21622a72c6166c35fe4367f5b07135e5e7fab4e8be8b8941f75439dc
4
+ data.tar.gz: d342a91e9b23f8677784553327ba1cc1c00e1599415512b28226f8e9f6bc55b4
5
5
  SHA512:
6
- metadata.gz: e5d06cf54a8bc96c90522ab67530310730230067ee226f6eb1143adde2ccb407dde25aef7b595836478ee944e4e9b3ff306b4df5a08ec14ab6623ab08daefa8b
7
- data.tar.gz: 45a7c3d06908c878f281db9baf4ec82310ecde20e12cad5ff4cc03d2f271167d46fa52145fe598f594a3360a525c926d955bb08d17e740ba78f97ec72f0f4b47
6
+ metadata.gz: ecadeeb21a358274bce4ed3d7fce66e53d31ff3abe940ff1b9d77893f12b73bfd41e9ac35324e3a98f004638f9d1906760ef962a3637fbaf48973faeec9a17cb
7
+ data.tar.gz: 5d4a149dd6d0da1feb723b08c327edab414b75f0b633cea53aaee00d43313d26b84659956957acec7550a822998b76a760b3888770a606d8b4a1f9bb14f807c2
data/Gemfile.lock CHANGED
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ocr-file (0.0.2)
4
+ ocr-file (0.0.6)
5
5
  active_attr (~> 0.15.4)
6
6
  console-style (~> 0.0.1)
7
7
  hexapdf (~> 0.23.0)
8
8
  mini_magick (~> 4.11.0)
9
9
  rtesseract (~> 3.1.2)
10
+ ruby-spellchecker (~> 0.1.5)
10
11
 
11
12
  GEM
12
13
  remote: https://rubygems.org/
@@ -60,7 +61,7 @@ GEM
60
61
  coderay (~> 1.1)
61
62
  method_source (~> 1.0)
62
63
  racc (1.6.0)
63
- rack (2.2.3.1)
64
+ rack (2.2.4)
64
65
  rack-test (1.1.0)
65
66
  rack (>= 1.0, < 3)
66
67
  rails-dom-testing (2.0.3)
@@ -69,6 +70,7 @@ GEM
69
70
  rails-html-sanitizer (1.4.3)
70
71
  loofah (~> 2.3)
71
72
  rtesseract (3.1.2)
73
+ ruby-spellchecker (0.1.5)
72
74
  tzinfo (2.0.4)
73
75
  concurrent-ruby (~> 1.0)
74
76
 
data/README.md CHANGED
@@ -44,12 +44,16 @@ You will need to install `tesseract` with your desired language on your system,
44
44
  # Image Pre-Processing
45
45
  image_preprocess: true,
46
46
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
47
+ automatic_reprocess: true, # May more than double the number of operations, but can automatically produce better results
47
48
  # PDF to Image Processing
48
49
  optimise_pdf: true,
49
50
  extract_pdf_images: true, # if false will screenshot each PDF page
50
51
  temp_filename_prefix: 'image',
52
+ spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
53
+ keep_files: false,
51
54
  # Console Output
52
55
  verbose: true,
56
+ timing: true
53
57
  }
54
58
 
55
59
  doc = OcrFile::Document.new(
@@ -74,6 +78,7 @@ You will need to install `tesseract` with your desired language on your system,
74
78
  doc.to_pdf
75
79
 
76
80
  # How to merge files into a single PDF:
81
+ # The files can be images or other PDFs
77
82
  filepaths = []
78
83
  documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
79
84
  merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
@@ -85,6 +90,8 @@ Set `extract_pdf_images` to `false` for higher quality OCR. However this will co
85
90
 
86
91
  Image pre-processing only thresholds (bw), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. This makes the end result black and white but gives far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` is also possible but isn't recommended.
87
92
 
93
+ `automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
94
+
88
95
  ### Simple CLI
89
96
  Once installed, you can use `ocr-file` as a CLI. It currently supports a reduced set of options, which are subject to change in future versions.
90
97
 
@@ -108,7 +115,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
108
115
  ### TODOs
109
116
  - input validation
110
117
  - Better CLI
111
- - image processing
112
118
  - password
113
119
  - Base64 encoding
114
120
  - requirements checking (installed dependencies etc ...)
@@ -117,7 +123,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
117
123
  - Improve console output
118
124
  - Fix spaces in file names
119
125
  - Better verbosity
120
- - Timing
126
+ - Docker
127
+ - pdftk / pdf merge for text and bookmarks etc ...
128
+ - https://github.com/tesseract-ocr/tesseract/issues/660
129
+ - tesseract -c naked_pdf=true
130
+ -
121
131
 
122
132
  ### Tests
123
133
  To run tests execute:
@@ -5,6 +5,7 @@ module OcrFile
5
5
 
6
6
  ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
7
7
  PAGE_BREAK = "\n\r\n" # TODO: Make configurable
8
+ EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
8
9
  DEFAULT_CONFIG = {
9
10
  # Images from PDF
10
11
  filetype: 'png',
@@ -23,12 +24,16 @@ module OcrFile
23
24
  # Image Pre-Processing
24
25
  image_preprocess: true,
25
26
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
27
+ automatic_reprocess: true,
26
28
  # PDF to Image Processing
27
29
  optimise_pdf: true,
28
30
  extract_pdf_images: true, # if false will screenshot each PDF page
29
31
  temp_filename_prefix: 'image',
32
+ spelling_correction: true,
33
+ keep_files: false,
30
34
  # Console Output
31
35
  verbose: true,
36
+ timing: true
32
37
  }
33
38
 
34
39
  attr_reader :original_file_path,
@@ -36,7 +41,9 @@ module OcrFile
36
41
  :save_file_path,
37
42
  :final_save_file,
38
43
  :config,
39
- :ocr_engine
44
+ :ocr_engine,
45
+ :start_time,
46
+ :end_time
40
47
 
41
48
  # save_file_path will also generate a tmp path for tmp files. Expected folder path
42
49
  # TODO: Add in more input validation
@@ -69,75 +76,50 @@ module OcrFile
69
76
 
70
77
  # Trigger OCR pipeline
71
78
  def to_pdf
72
- if pdf?
73
- create_temp_folder
74
- image_paths = extract_image_paths_from_pdf(@original_file_path)
75
-
76
- pdfs_to_merge = []
77
-
78
- image_paths.each do |image_path|
79
- pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
80
- end
81
-
82
- merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
79
+ @start_time = Time.now
80
+ find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
83
81
 
84
- OcrFile::ImageEngines::PdfEngine
85
- .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
82
+ if pdf?
83
+ ocr_pdf_to_searchable_pdf
86
84
  elsif text?
87
- text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
88
- pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
89
-
90
- OcrFile::ImageEngines::PdfEngine
91
- .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
85
+ text_to_pdf
92
86
  else # is an image
93
87
  ocr_image_to_pdf
94
88
  end
95
89
 
96
90
  close
91
+
92
+ @end_time = Time.now
93
+ print_time
97
94
  end
98
95
 
99
96
  def to_text
100
- if pdf?
101
- create_temp_folder
102
- image_paths = extract_image_paths_from_pdf(@original_file_path)
103
-
104
- image_paths.each do |image_path|
105
- text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
106
- ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
107
- end
108
- elsif text?
109
- ::OcrFile::FileHelpers.open_text_file(@original_file_path)
110
- else # is an image
111
- ocr_image_to_text(save: true)
112
- end
97
+ @start_time = Time.now
98
+ return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
113
99
 
100
+ find_best_image_processing(save: true)
114
101
  close
102
+
103
+ @end_time = Time.now
104
+ print_time
115
105
  end
116
106
 
117
107
  def to_s
118
- if pdf?
119
- create_temp_folder
120
- image_paths = extract_image_paths_from_pdf(@original_file_path)
108
+ @start_time = Time.now
109
+ return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
121
110
 
122
- text = ''
111
+ text = find_best_image_processing(save: false)
123
112
 
124
- image_paths.each do |image_path|
125
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
126
- end
113
+ close
127
114
 
128
- close
129
- text
130
- elsif text?
131
- ::OcrFile::FileHelpers.open_text_file(@original_file_path)
132
- else # is an image
133
- text = ocr_image_to_text(save: false)
115
+ @end_time = Time.now
116
+ print_time
134
117
 
135
- close
136
- text
137
- end
118
+ text
138
119
  end
139
120
 
140
121
  def close
122
+ return if keep_files?
141
123
  ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
142
124
  end
143
125
 
@@ -162,6 +144,10 @@ module OcrFile
162
144
  end
163
145
  end
164
146
 
147
+ def keep_files?
148
+ config['keep_files']
149
+ end
150
+
165
151
  def create_temp_folder
166
152
  date = Time.now.to_s.split(' ').first
167
153
 
@@ -185,14 +171,67 @@ module OcrFile
185
171
  image_processor.convert!
186
172
  end
187
173
 
174
+ def ocr_pdf_to_searchable_pdf
175
+ create_temp_folder
176
+ image_paths = extract_image_paths_from_pdf(@original_file_path)
177
+
178
+ pdfs_to_merge = []
179
+
180
+ image_paths.each do |image_path|
181
+ puts image_path
182
+ pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
183
+ end
184
+
185
+ merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
186
+
187
+ OcrFile::ImageEngines::PdfEngine
188
+ .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
189
+ end
190
+
191
+ def text_to_pdf
192
+ text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
193
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
194
+
195
+ pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
196
+
197
+ OcrFile::ImageEngines::PdfEngine
198
+ .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
199
+ end
200
+
188
201
  def ocr_image_to_pdf
202
+ find_best_image_processing(save: false) if config[:automatic_reprocess]
203
+
189
204
  pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
190
205
  OcrFile::ImageEngines::PdfEngine
191
206
  .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
192
207
  end
193
208
 
194
- def ocr_image_to_text(save: true)
209
+ def ocr_pdf_to_text(save:)
210
+ create_temp_folder
211
+ image_paths = extract_image_paths_from_pdf(@original_file_path)
212
+
213
+ text = ''
214
+
215
+ image_paths.each do |image_path|
216
+ puts image_path
217
+ text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
218
+
219
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
220
+ text = "#{text}#{PAGE_BREAK}#{text}"
221
+ end
222
+
223
+ if save
224
+ ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
225
+ else
226
+ text
227
+ end
228
+ end
229
+
230
+ def ocr_image_to_text(save:)
231
+ create_temp_folder
232
+
195
233
  text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
234
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
196
235
 
197
236
  if save
198
237
  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -201,6 +240,57 @@ module OcrFile
201
240
  end
202
241
  end
203
242
 
243
+ def ocr_file_to_text(save:)
244
+ if pdf?
245
+ ocr_pdf_to_text(save: save)
246
+ else # is an image
247
+ ocr_image_to_text(save: save)
248
+ end
249
+ end
250
+
251
+ def find_best_image_processing(save:)
252
+ ocr_file_to_text(save: save) unless config[:automatic_reprocess]
253
+
254
+ text = ''
255
+ best_text_count = 0
256
+ best_effects = config[:effects]
257
+
258
+ effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
259
+ effects_to_test.each do |effect|
260
+ text = test_ocr_settings(effect)
261
+ processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
262
+
263
+ if processed_result.count_of_issues < best_text_count
264
+ best_text_count = processed_result.count_of_issues
265
+ best_effects = config[:effects]
266
+ end
267
+
268
+ break if processed_result.valid_words?
269
+ end
270
+
271
+ # Fallback
272
+ if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
273
+ config[:effects] = best_effects
274
+ text = ocr_file_to_text(save: false)
275
+ end
276
+
277
+ # Adds in extra operations which is unfortunately inefficient
278
+ if save
279
+ ocr_file_to_text(save: save)
280
+ else
281
+ text
282
+ end
283
+ end
284
+
285
+ def test_ocr_settings(effect)
286
+ config[:effects] = config[:effects] - [effect]
287
+ ocr_file_to_text(save: false)
288
+ end
289
+
290
+ def print_time
291
+ puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
292
+ end
293
+
204
294
  def find_ocr_engine(engine_id)
205
295
  ocr_engine_constants
206
296
  .map { |c| ocr_module(c) }
@@ -61,11 +61,38 @@ module OcrFile
61
61
  image_paths
62
62
  end
63
63
 
64
+ def insert_image(document, image_path)
65
+ canvas = document.pages.add.canvas
66
+ canvas.image(image_path, at: [0, 0], height: 700)
67
+ end
68
+
69
+ def combine(text, pdf_of_images)
70
+ return unless pdf_of_images.is_a?(::HexaPDF::Document)
71
+
72
+ if text.is_a?(::HexaPDF::Document)
73
+ pages_of_text = text.pages
74
+ else # Assume raw text with PAGE_BREAK
75
+ pages_of_text = text.split(PAGE_BREAK)
76
+ end
77
+
78
+ return unless pages_of_text.size == pdf_of_images.pages.size
79
+
80
+ if text.is_a?(::HexaPDF::Document) # Keep the page structure
81
+
82
+ else # Just text to embed
83
+
84
+ end
85
+ end
86
+
64
87
  def merge(documents)
65
88
  target = ::HexaPDF::Document.new
66
89
 
67
90
  documents.each do |document|
68
- document.pages.each { |page| target.pages << target.import(page) }
91
+ if document.is_a?(::HexaPDF::Document)
92
+ document.pages.each { |page| target.pages << target.import(page) }
93
+ else # Assume an image
94
+ insert_image(target, document)
95
+ end
69
96
  end
70
97
 
71
98
  target
@@ -0,0 +1,82 @@
1
+ module OcrFile
2
+ module TextEngines
3
+ class ResultProcessor
4
+ MINIMUM_WORD_LENGTH = 4
5
+ ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
6
+ ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
7
+
8
+ # REGEX
9
+ ASCII_ONLY = /[^\u{0000}-\u{007f}]/
10
+ NOISE_CHARACTERS = /[^\w\s\/-;:]/
11
+ DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
12
+ EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
13
+
14
+ attr_reader :text, :clear_text
15
+
16
+ def initialize(text)
17
+ @text = text
18
+ @clear_text = generate_clear_text || text || ''
19
+ end
20
+
21
+ def correct
22
+ Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
23
+ end
24
+
25
+ # This is a very naive way of determining if we should re-do OCR with
26
+ # shifted options
27
+ def valid_words?
28
+ word_size_average >= MINIMUM_WORD_LENGTH &&
29
+ spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
30
+ unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
31
+ end
32
+
33
+ def invalid_words?
34
+ !valid_words?
35
+ end
36
+
37
+ def word_count
38
+ return 0 if empty_text?
39
+ @_word_count ||= clear_words.size
40
+ end
41
+
42
+ def word_size_average
43
+ return 0 if empty_text?
44
+ @_word_size_average ||= clear_words.map(&:size).sum / word_count
45
+ end
46
+
47
+ # Assume English
48
+ def unidentified_word_count
49
+ clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
50
+ end
51
+
52
+ def spelling_error_count
53
+ Spellchecker.check(clear_text).count
54
+ end
55
+
56
+ def count_of_issues
57
+ spelling_error_count + unidentified_word_count
58
+ end
59
+
60
+ private
61
+
62
+ def empty_text?
63
+ clear_text.nil? || clear_text == ''
64
+ end
65
+
66
+ def clear_words
67
+ @clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
68
+ end
69
+
70
+ def generate_clear_text
71
+ remove_lines
72
+ &.gsub(ASCII_ONLY, '')
73
+ &.gsub(NOISE_CHARACTERS, '')
74
+ &.gsub(DUPLICATE_WORDS, '')
75
+ end
76
+
77
+ def remove_lines
78
+ text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
79
+ end
80
+ end
81
+ end
82
+ end
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.7"
3
3
  end
data/lib/ocr-file.rb CHANGED
@@ -2,6 +2,7 @@ require 'hexapdf'
2
2
  require 'hexapdf/cli/images'
3
3
  require 'rtesseract'
4
4
  require 'mini_magick'
5
+ require 'ruby-spellchecker'
5
6
 
6
7
  require 'ocr-file/version'
7
8
 
@@ -10,6 +11,7 @@ require 'ocr-file/image_engines/image_magick'
10
11
  require 'ocr-file/image_engines/pdftoppm'
11
12
  require 'ocr-file/ocr_engines/tesseract'
12
13
  require 'ocr-file/ocr_engines/cloud_vision'
14
+ require 'ocr-file/text_engines/result_processor'
13
15
  require 'ocr-file/file_helpers'
14
16
  require 'ocr-file/document'
15
17
  require 'ocr-file/cli'
data/ocr-file.gemspec CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
32
32
  spec.add_dependency "hexapdf", "~> 0.23.0"
33
33
  spec.add_dependency "rtesseract", "~> 3.1.2"
34
34
  spec.add_dependency "mini_magick", "~> 4.11.0"
35
+ spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
35
36
 
36
37
  # Development Dependencies
37
38
  spec.add_development_dependency "pry", "~> 0.14.1"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-20 00:00:00.000000000 Z
11
+ date: 2022-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: console-style
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: 4.11.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: ruby-spellchecker
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.5
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.1.5
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: pry
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -122,6 +136,7 @@ files:
122
136
  - lib/ocr-file/image_engines/pdftoppm.rb
123
137
  - lib/ocr-file/ocr_engines/cloud_vision.rb
124
138
  - lib/ocr-file/ocr_engines/tesseract.rb
139
+ - lib/ocr-file/text_engines/result_processor.rb
125
140
  - lib/ocr-file/version.rb
126
141
  - ocr-file.gemspec
127
142
  homepage: https://github.com/TRex22/ocr-file