ocr-file 0.0.4 → 0.0.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
4
- data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
3
+ metadata.gz: 9660e3d19c210789a7aeab56e63b002b507c12917edd1418202900d05647773b
4
+ data.tar.gz: 28861544f58374db141e5e3cc3ee8569201c4ffd611458e77774c98835e2f882
5
5
  SHA512:
6
- metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
7
- data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
6
+ metadata.gz: 55f4266d6877a7f2c8a4175f5601930c9940aa1a8a06fc1d6d84faca455232173f2ae4be887dc9d20246cb224498a213b91871b66ee3fb3b02699399be33453a
7
+ data.tar.gz: e78263baffd1ae1ff1246f705250d788ae6b5c24b3d59d78e169b3ddf93e139ac9b9688cf9d954053a913a1c3bd6be54c40466d372f915c4430d55d93282f1bb
data/Gemfile.lock CHANGED
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ocr-file (0.0.4)
4
+ ocr-file (0.0.6)
5
5
  active_attr (~> 0.15.4)
6
6
  console-style (~> 0.0.1)
7
7
  hexapdf (~> 0.23.0)
8
8
  mini_magick (~> 4.11.0)
9
9
  rtesseract (~> 3.1.2)
10
+ ruby-spellchecker (~> 0.1.5)
10
11
 
11
12
  GEM
12
13
  remote: https://rubygems.org/
@@ -60,7 +61,7 @@ GEM
60
61
  coderay (~> 1.1)
61
62
  method_source (~> 1.0)
62
63
  racc (1.6.0)
63
- rack (2.2.3.1)
64
+ rack (2.2.4)
64
65
  rack-test (1.1.0)
65
66
  rack (>= 1.0, < 3)
66
67
  rails-dom-testing (2.0.3)
@@ -69,6 +70,7 @@ GEM
69
70
  rails-html-sanitizer (1.4.3)
70
71
  loofah (~> 2.3)
71
72
  rtesseract (3.1.2)
73
+ ruby-spellchecker (0.1.5)
72
74
  tzinfo (2.0.4)
73
75
  concurrent-ruby (~> 1.0)
74
76
 
data/README.md CHANGED
@@ -49,6 +49,7 @@ You will need to install `tesseract` with your desired language on your system,
49
49
  optimise_pdf: true,
50
50
  extract_pdf_images: true, # if false will screenshot each PDF page
51
51
  temp_filename_prefix: 'image',
52
+ spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
52
53
  # Console Output
53
54
  verbose: true,
54
55
  timing: true,
@@ -76,6 +77,7 @@ You will need to install `tesseract` with your desired language on your system,
76
77
  doc.to_pdf
77
78
 
78
79
  # How to merge files into a single PDF:
80
+ # The files can be images or other PDFs
79
81
  filepaths = []
80
82
  documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
81
83
  merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
@@ -120,6 +122,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
120
122
  - Improve console output
121
123
  - Fix spaces in file names
122
124
  - Better verbosity
125
+ - Docker
126
+ - pdftk / pdf merge for text and bookmarks etc ...
127
+ - https://github.com/tesseract-ocr/tesseract/issues/660
128
+ - tesseract -c naked_pdf=true
129
+ -
123
130
 
124
131
  ### Tests
125
132
  To run tests execute:
@@ -29,6 +29,7 @@ module OcrFile
29
29
  optimise_pdf: true,
30
30
  extract_pdf_images: true, # if false will screenshot each PDF page
31
31
  temp_filename_prefix: 'image',
32
+ spelling_correction: true,
32
33
  # Console Output
33
34
  verbose: true,
34
35
  timing: true,
@@ -75,7 +76,7 @@ module OcrFile
75
76
  # Trigger OCR pipeline
76
77
  def to_pdf
77
78
  @start_time = Time.now
78
- find_best_image_processing if config[:automatic_reprocess] && !text?
79
+ find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
79
80
 
80
81
  if pdf?
81
82
  ocr_pdf_to_searchable_pdf
@@ -117,7 +118,7 @@ module OcrFile
117
118
  end
118
119
 
119
120
  def close
120
- # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
121
+ ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
121
122
  end
122
123
 
123
124
  private
@@ -171,6 +172,7 @@ module OcrFile
171
172
  pdfs_to_merge = []
172
173
 
173
174
  image_paths.each do |image_path|
175
+ puts image_path
174
176
  pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
175
177
  end
176
178
 
@@ -182,6 +184,8 @@ module OcrFile
182
184
 
183
185
  def text_to_pdf
184
186
  text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
187
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
188
+
185
189
  pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
186
190
 
187
191
  OcrFile::ImageEngines::PdfEngine
@@ -189,7 +193,7 @@ module OcrFile
189
193
  end
190
194
 
191
195
  def ocr_image_to_pdf
192
- find_best_image_processing if config[:automatic_reprocess]
196
+ find_best_image_processing(save: false) if config[:automatic_reprocess]
193
197
 
194
198
  pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
195
199
  OcrFile::ImageEngines::PdfEngine
@@ -203,7 +207,11 @@ module OcrFile
203
207
  text = ''
204
208
 
205
209
  image_paths.each do |image_path|
206
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
210
+ puts image_path
211
+ text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
212
+
213
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
214
+ text = "#{text}#{PAGE_BREAK}#{text}"
207
215
  end
208
216
 
209
217
  if save
@@ -215,7 +223,9 @@ module OcrFile
215
223
 
216
224
  def ocr_image_to_text(save:)
217
225
  create_temp_folder
226
+
218
227
  text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
228
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
219
229
 
220
230
  if save
221
231
  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -225,7 +235,7 @@ module OcrFile
225
235
  end
226
236
 
227
237
  def ocr_file_to_text(save:)
228
- if pdf? &&
238
+ if pdf?
229
239
  ocr_pdf_to_text(save: save)
230
240
  else # is an image
231
241
  ocr_image_to_text(save: save)
@@ -233,15 +243,29 @@ module OcrFile
233
243
  end
234
244
 
235
245
  def find_best_image_processing(save:)
236
- ocr_file_to_text(save: save) if !config[:automatic_reprocess]
246
+ ocr_file_to_text(save: save) unless config[:automatic_reprocess]
237
247
 
238
248
  text = ''
249
+ best_text_count = 0
250
+ best_effects = config[:effects]
251
+
239
252
  effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
240
253
  effects_to_test.each do |effect|
241
- config[:effects] = config[:effects] - [effect]
254
+ text = test_ocr_settings(effect)
255
+ processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
242
256
 
257
+ if processed_result.count_of_issues < best_text_count
258
+ best_text_count = processed_result.count_of_issues
259
+ best_effects = config[:effects]
260
+ end
261
+
262
+ break if processed_result.valid_words?
263
+ end
264
+
265
+ # Fallback
266
+ if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
267
+ config[:effects] = best_effects
243
268
  text = ocr_file_to_text(save: false)
244
- break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
245
269
  end
246
270
 
247
271
  # Adds in extra operations which is unfortunately inefficient
@@ -252,6 +276,11 @@ module OcrFile
252
276
  end
253
277
  end
254
278
 
279
+ def test_ocr_settings(effect)
280
+ config[:effects] = config[:effects] - [effect]
281
+ ocr_file_to_text(save: false)
282
+ end
283
+
255
284
  def print_time
256
285
  puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
257
286
  end
@@ -61,11 +61,38 @@ module OcrFile
61
61
  image_paths
62
62
  end
63
63
 
64
+ def insert_image(document, image_path)
65
+ canvas = document.pages.add.canvas
66
+ canvas.image(image_path, at: [0, 0], height: 700)
67
+ end
68
+
69
+ def combine(text, pdf_of_images)
70
+ return unless pdf_of_images.is_a?(::HexaPDF::Document)
71
+
72
+ if text.is_a?(::HexaPDF::Document)
73
+ pages_of_text = text.pages
74
+ else # Assume raw text with PAGE_BREAK
75
+ pages_of_text = text.split(PAGE_BREAK)
76
+ end
77
+
78
+ return unless pages_of_text.size == pdf_of_images.pages.size
79
+
80
+ if text.is_a?(::HexaPDF::Document) # Keep the page structure
81
+
82
+ else # Just text to embed
83
+
84
+ end
85
+ end
86
+
64
87
  def merge(documents)
65
88
  target = ::HexaPDF::Document.new
66
89
 
67
90
  documents.each do |document|
68
- document.pages.each { |page| target.pages << target.import(page) }
91
+ if document.is_a?(::HexaPDF::Document)
92
+ document.pages.each { |page| target.pages << target.import(page) }
93
+ else # Assume an image
94
+ insert_image(target, document)
95
+ end
69
96
  end
70
97
 
71
98
  target
@@ -1,33 +1,81 @@
1
1
  module OcrFile
2
2
  module TextEngines
3
3
  class ResultProcessor
4
- MINIMUM_WORD_LENGTH = 3
4
+ MINIMUM_WORD_LENGTH = 4
5
+ ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
6
+ ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
7
+
8
+ # REGEX
9
+ ASCII_ONLY = /[^\u{0000}-\u{007f}]/
10
+ NOISE_CHARACTERS = /[^\w\s\/-;:]/
11
+ DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
12
+ EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
5
13
 
6
14
  attr_reader :text, :clear_text
7
15
 
8
16
  def initialize(text)
9
17
  @text = text
10
- @clear_text = remove_lines
18
+ @clear_text = generate_clear_text || text || ''
19
+ end
20
+
21
+ def correct
22
+ Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
11
23
  end
12
24
 
13
25
  # This is a very naive way of determining if we should re-do OCR with
14
26
  # shifted options
15
27
  def valid_words?
16
- word_size_average >= MINIMUM_WORD_LENGTH
28
+ word_size_average >= MINIMUM_WORD_LENGTH &&
29
+ spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
30
+ unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
31
+ end
32
+
33
+ def invalid_words?
34
+ !valid_words?
17
35
  end
18
36
 
19
37
  def word_count
20
- @_word_count ||= clear_text.split(' ').size
38
+ return 0 if empty_text?
39
+ @_word_count ||= clear_words.size
21
40
  end
22
41
 
23
42
  def word_size_average
24
- @_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
43
+ return 0 if empty_text?
44
+ @_word_size_average ||= clear_words.map(&:size).sum / word_count
45
+ end
46
+
47
+ # Assume English
48
+ def unidentified_word_count
49
+ clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
50
+ end
51
+
52
+ def spelling_error_count
53
+ Spellchecker.check(clear_text).count
54
+ end
55
+
56
+ def count_of_issues
57
+ spelling_error_count + unidentified_word_count
25
58
  end
26
59
 
27
60
  private
28
61
 
62
+ def empty_text?
63
+ clear_text.nil? || clear_text == ''
64
+ end
65
+
66
+ def clear_words
67
+ @clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
68
+ end
69
+
70
+ def generate_clear_text
71
+ remove_lines
72
+ &.gsub(ASCII_ONLY, '')
73
+ &.gsub(NOISE_CHARACTERS, '')
74
+ &.gsub(DUPLICATE_WORDS, '')
75
+ end
76
+
29
77
  def remove_lines
30
- text.gsub("\n", ' ').gsub("\r", ' ').gsub(' ', '')
78
+ text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
31
79
  end
32
80
  end
33
81
  end
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.6"
3
3
  end
data/lib/ocr-file.rb CHANGED
@@ -2,6 +2,7 @@ require 'hexapdf'
2
2
  require 'hexapdf/cli/images'
3
3
  require 'rtesseract'
4
4
  require 'mini_magick'
5
+ require 'ruby-spellchecker'
5
6
 
6
7
  require 'ocr-file/version'
7
8
 
data/ocr-file.gemspec CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
32
32
  spec.add_dependency "hexapdf", "~> 0.23.0"
33
33
  spec.add_dependency "rtesseract", "~> 3.1.2"
34
34
  spec.add_dependency "mini_magick", "~> 4.11.0"
35
+ spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
35
36
 
36
37
  # Development Dependencies
37
38
  spec.add_development_dependency "pry", "~> 0.14.1"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-20 00:00:00.000000000 Z
11
+ date: 2022-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: console-style
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: 4.11.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: ruby-spellchecker
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.5
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.1.5
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: pry
85
99
  requirement: !ruby/object:Gem::Requirement