ocr-file 0.0.4 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
4
- data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
3
+ metadata.gz: 9660e3d19c210789a7aeab56e63b002b507c12917edd1418202900d05647773b
4
+ data.tar.gz: 28861544f58374db141e5e3cc3ee8569201c4ffd611458e77774c98835e2f882
5
5
  SHA512:
6
- metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
7
- data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
6
+ metadata.gz: 55f4266d6877a7f2c8a4175f5601930c9940aa1a8a06fc1d6d84faca455232173f2ae4be887dc9d20246cb224498a213b91871b66ee3fb3b02699399be33453a
7
+ data.tar.gz: e78263baffd1ae1ff1246f705250d788ae6b5c24b3d59d78e169b3ddf93e139ac9b9688cf9d954053a913a1c3bd6be54c40466d372f915c4430d55d93282f1bb
data/Gemfile.lock CHANGED
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ocr-file (0.0.4)
4
+ ocr-file (0.0.6)
5
5
  active_attr (~> 0.15.4)
6
6
  console-style (~> 0.0.1)
7
7
  hexapdf (~> 0.23.0)
8
8
  mini_magick (~> 4.11.0)
9
9
  rtesseract (~> 3.1.2)
10
+ ruby-spellchecker (~> 0.1.5)
10
11
 
11
12
  GEM
12
13
  remote: https://rubygems.org/
@@ -60,7 +61,7 @@ GEM
60
61
  coderay (~> 1.1)
61
62
  method_source (~> 1.0)
62
63
  racc (1.6.0)
63
- rack (2.2.3.1)
64
+ rack (2.2.4)
64
65
  rack-test (1.1.0)
65
66
  rack (>= 1.0, < 3)
66
67
  rails-dom-testing (2.0.3)
@@ -69,6 +70,7 @@ GEM
69
70
  rails-html-sanitizer (1.4.3)
70
71
  loofah (~> 2.3)
71
72
  rtesseract (3.1.2)
73
+ ruby-spellchecker (0.1.5)
72
74
  tzinfo (2.0.4)
73
75
  concurrent-ruby (~> 1.0)
74
76
 
data/README.md CHANGED
@@ -49,6 +49,7 @@ You will need to install `tesseract` with your desired language on your system,
49
49
  optimise_pdf: true,
50
50
  extract_pdf_images: true, # if false will screenshot each PDF page
51
51
  temp_filename_prefix: 'image',
52
+ spelling_correction: true, # Attempts to correct spelling in the extracted text as a final step (not applied to searchable PDF output)
52
53
  # Console Output
53
54
  verbose: true,
54
55
  timing: true,
@@ -76,6 +77,7 @@ You will need to install `tesseract` with your desired language on your system,
76
77
  doc.to_pdf
77
78
 
78
79
  # How to merge files into a single PDF:
80
+ # The files can be images or other PDFs
79
81
  file_paths = []
80
82
  documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
81
83
  merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
@@ -120,6 +122,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
120
122
  - Improve console output
121
123
  - Fix spaces in file names
122
124
  - Better verbosity
125
+ - Docker
126
+ - pdftk / pdf merge for text and bookmarks etc ...
127
+ - https://github.com/tesseract-ocr/tesseract/issues/660
128
+ - tesseract -c naked_pdf=true
129
+ -
123
130
 
124
131
  ### Tests
125
132
  To run tests execute:
@@ -29,6 +29,7 @@ module OcrFile
29
29
  optimise_pdf: true,
30
30
  extract_pdf_images: true, # if false will screenshot each PDF page
31
31
  temp_filename_prefix: 'image',
32
+ spelling_correction: true,
32
33
  # Console Output
33
34
  verbose: true,
34
35
  timing: true,
@@ -75,7 +76,7 @@ module OcrFile
75
76
  # Trigger OCR pipeline
76
77
  def to_pdf
77
78
  @start_time = Time.now
78
- find_best_image_processing if config[:automatic_reprocess] && !text?
79
+ find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
79
80
 
80
81
  if pdf?
81
82
  ocr_pdf_to_searchable_pdf
@@ -117,7 +118,7 @@ module OcrFile
117
118
  end
118
119
 
119
120
  def close
120
- # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
121
+ ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
121
122
  end
122
123
 
123
124
  private
@@ -171,6 +172,7 @@ module OcrFile
171
172
  pdfs_to_merge = []
172
173
 
173
174
  image_paths.each do |image_path|
175
+ puts image_path
174
176
  pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
175
177
  end
176
178
 
@@ -182,6 +184,8 @@ module OcrFile
182
184
 
183
185
  def text_to_pdf
184
186
  text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
187
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
188
+
185
189
  pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
186
190
 
187
191
  OcrFile::ImageEngines::PdfEngine
@@ -189,7 +193,7 @@ module OcrFile
189
193
  end
190
194
 
191
195
  def ocr_image_to_pdf
192
- find_best_image_processing if config[:automatic_reprocess]
196
+ find_best_image_processing(save: false) if config[:automatic_reprocess]
193
197
 
194
198
  pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
195
199
  OcrFile::ImageEngines::PdfEngine
@@ -203,7 +207,11 @@ module OcrFile
203
207
  text = ''
204
208
 
205
209
  image_paths.each do |image_path|
206
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
210
+ puts image_path
211
+ text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
212
+
213
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
214
+ text = "#{text}#{PAGE_BREAK}#{text}"
207
215
  end
208
216
 
209
217
  if save
@@ -215,7 +223,9 @@ module OcrFile
215
223
 
216
224
  def ocr_image_to_text(save:)
217
225
  create_temp_folder
226
+
218
227
  text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
228
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
219
229
 
220
230
  if save
221
231
  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -225,7 +235,7 @@ module OcrFile
225
235
  end
226
236
 
227
237
  def ocr_file_to_text(save:)
228
- if pdf? &&
238
+ if pdf?
229
239
  ocr_pdf_to_text(save: save)
230
240
  else # is an image
231
241
  ocr_image_to_text(save: save)
@@ -233,15 +243,29 @@ module OcrFile
233
243
  end
234
244
 
235
245
  def find_best_image_processing(save:)
236
- ocr_file_to_text(save: save) if !config[:automatic_reprocess]
246
+ ocr_file_to_text(save: save) unless config[:automatic_reprocess]
237
247
 
238
248
  text = ''
249
+ best_text_count = 0
250
+ best_effects = config[:effects]
251
+
239
252
  effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
240
253
  effects_to_test.each do |effect|
241
- config[:effects] = config[:effects] - [effect]
254
+ text = test_ocr_settings(effect)
255
+ processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
242
256
 
257
+ if processed_result.count_of_issues < best_text_count
258
+ best_text_count = processed_result.count_of_issues
259
+ best_effects = config[:effects]
260
+ end
261
+
262
+ break if processed_result.valid_words?
263
+ end
264
+
265
+ # Fallback
266
+ if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
267
+ config[:effects] = best_effects
243
268
  text = ocr_file_to_text(save: false)
244
- break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
245
269
  end
246
270
 
247
271
  # Adds in extra operations which is unfortunately inefficient
@@ -252,6 +276,11 @@ module OcrFile
252
276
  end
253
277
  end
254
278
 
279
+ def test_ocr_settings(effect)
280
+ config[:effects] = config[:effects] - [effect]
281
+ ocr_file_to_text(save: false)
282
+ end
283
+
255
284
  def print_time
256
285
  puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
257
286
  end
@@ -61,11 +61,38 @@ module OcrFile
61
61
  image_paths
62
62
  end
63
63
 
64
+ def insert_image(document, image_path)
65
+ canvas = document.pages.add.canvas
66
+ canvas.image(image_path, at: [0, 0], height: 700)
67
+ end
68
+
69
+ def combine(text, pdf_of_images)
70
+ return unless pdf_of_images.is_a?(::HexaPDF::Document)
71
+
72
+ if text.is_a?(::HexaPDF::Document)
73
+ pages_of_text = text.pages
74
+ else # Assume raw text with PAGE_BREAK
75
+ pages_of_text = text.split(PAGE_BREAK)
76
+ end
77
+
78
+ return unless pages_of_text.size == pdf_of_images.pages.size
79
+
80
+ if text.is_a?(::HexaPDF::Document) # Keep the page structure
81
+
82
+ else # Just text to embed
83
+
84
+ end
85
+ end
86
+
64
87
  def merge(documents)
65
88
  target = ::HexaPDF::Document.new
66
89
 
67
90
  documents.each do |document|
68
- document.pages.each { |page| target.pages << target.import(page) }
91
+ if document.is_a?(::HexaPDF::Document)
92
+ document.pages.each { |page| target.pages << target.import(page) }
93
+ else # Assume an image
94
+ insert_image(target, document)
95
+ end
69
96
  end
70
97
 
71
98
  target
@@ -1,33 +1,81 @@
1
1
  module OcrFile
2
2
  module TextEngines
3
3
  class ResultProcessor
4
- MINIMUM_WORD_LENGTH = 3
4
+ MINIMUM_WORD_LENGTH = 4
5
+ ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
6
+ ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
7
+
8
+ # REGEX
9
+ ASCII_ONLY = /[^\u{0000}-\u{007f}]/
10
+ NOISE_CHARACTERS = /[^\w\s\/-;:]/
11
+ DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
12
+ EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
5
13
 
6
14
  attr_reader :text, :clear_text
7
15
 
8
16
  def initialize(text)
9
17
  @text = text
10
- @clear_text = remove_lines
18
+ @clear_text = generate_clear_text || text || ''
19
+ end
20
+
21
+ def correct
22
+ Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
11
23
  end
12
24
 
13
25
  # This is a very naive way of determining if we should re-do OCR with
14
26
  # shifted options
15
27
  def valid_words?
16
- word_size_average >= MINIMUM_WORD_LENGTH
28
+ word_size_average >= MINIMUM_WORD_LENGTH &&
29
+ spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
30
+ unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
31
+ end
32
+
33
+ def invalid_words?
34
+ !valid_words?
17
35
  end
18
36
 
19
37
  def word_count
20
- @_word_count ||= clear_text.split(' ').size
38
+ return 0 if empty_text?
39
+ @_word_count ||= clear_words.size
21
40
  end
22
41
 
23
42
  def word_size_average
24
- @_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
43
+ return 0 if empty_text?
44
+ @_word_size_average ||= clear_words.map(&:size).sum / word_count
45
+ end
46
+
47
+ # Assume English
48
+ def unidentified_word_count
49
+ clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
50
+ end
51
+
52
+ def spelling_error_count
53
+ Spellchecker.check(clear_text).count
54
+ end
55
+
56
+ def count_of_issues
57
+ spelling_error_count + unidentified_word_count
25
58
  end
26
59
 
27
60
  private
28
61
 
62
+ def empty_text?
63
+ clear_text.nil? || clear_text == ''
64
+ end
65
+
66
+ def clear_words
67
+ @clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
68
+ end
69
+
70
+ def generate_clear_text
71
+ remove_lines
72
+ &.gsub(ASCII_ONLY, '')
73
+ &.gsub(NOISE_CHARACTERS, '')
74
+ &.gsub(DUPLICATE_WORDS, '')
75
+ end
76
+
29
77
  def remove_lines
30
- text.gsub("\n", ' ').gsub("\r", ' ').gsub(' ', '')
78
+ text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
31
79
  end
32
80
  end
33
81
  end
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.6"
3
3
  end
data/lib/ocr-file.rb CHANGED
@@ -2,6 +2,7 @@ require 'hexapdf'
2
2
  require 'hexapdf/cli/images'
3
3
  require 'rtesseract'
4
4
  require 'mini_magick'
5
+ require 'ruby-spellchecker'
5
6
 
6
7
  require 'ocr-file/version'
7
8
 
data/ocr-file.gemspec CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
32
32
  spec.add_dependency "hexapdf", "~> 0.23.0"
33
33
  spec.add_dependency "rtesseract", "~> 3.1.2"
34
34
  spec.add_dependency "mini_magick", "~> 4.11.0"
35
+ spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
35
36
 
36
37
  # Development Dependencies
37
38
  spec.add_development_dependency "pry", "~> 0.14.1"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-20 00:00:00.000000000 Z
11
+ date: 2022-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: console-style
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: 4.11.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: ruby-spellchecker
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.5
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.1.5
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: pry
85
99
  requirement: !ruby/object:Gem::Requirement