ocr-file 0.0.4 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
4
- data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
3
+ metadata.gz: 9ae0f4940b34df3280221cf8b26d86ba3498f8344ef5f0e27ea335ca651a8906
4
+ data.tar.gz: 5e790899721d25bb0f4dc0e8e276b39b62bbb2803549fdbc8ba148804885bec0
5
5
  SHA512:
6
- metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
7
- data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
6
+ metadata.gz: 6cd016ca7bba37866579cad59f01f41d190c0a191cd1ce27fa7037646da7bf4962923664c7b6295655936aed8714fac01b08301be65fdfef68403c8dd12c075b
7
+ data.tar.gz: f1581713a76e19f1b24d43f030cccbfb32b206bea8d1a5f07fed26fe4e0cfaa3f991c0c35b98bf1f222ca36b143e83700638ecf3b0520b9663d2fe4336cc5da2
data/Gemfile.lock CHANGED
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ocr-file (0.0.4)
4
+ ocr-file (0.0.8)
5
5
  active_attr (~> 0.15.4)
6
6
  console-style (~> 0.0.1)
7
7
  hexapdf (~> 0.23.0)
8
8
  mini_magick (~> 4.11.0)
9
9
  rtesseract (~> 3.1.2)
10
+ ruby-spellchecker (~> 0.1.5)
10
11
 
11
12
  GEM
12
13
  remote: https://rubygems.org/
@@ -60,7 +61,7 @@ GEM
60
61
  coderay (~> 1.1)
61
62
  method_source (~> 1.0)
62
63
  racc (1.6.0)
63
- rack (2.2.3.1)
64
+ rack (2.2.4)
64
65
  rack-test (1.1.0)
65
66
  rack (>= 1.0, < 3)
66
67
  rails-dom-testing (2.0.3)
@@ -69,6 +70,7 @@ GEM
69
70
  rails-html-sanitizer (1.4.3)
70
71
  loofah (~> 2.3)
71
72
  rtesseract (3.1.2)
73
+ ruby-spellchecker (0.1.5)
72
74
  tzinfo (2.0.4)
73
75
  concurrent-ruby (~> 1.0)
74
76
 
data/README.md CHANGED
@@ -49,9 +49,11 @@ You will need to install `tesseract` with your desired language on your system,
49
49
  optimise_pdf: true,
50
50
  extract_pdf_images: true, # if false will screenshot each PDF page
51
51
  temp_filename_prefix: 'image',
52
+ spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
53
+ keep_files: false,
52
54
  # Console Output
53
55
  verbose: true,
54
- timing: true,
56
+ timing: true
55
57
  }
56
58
 
57
59
  doc = OcrFile::Document.new(
@@ -76,9 +78,10 @@ You will need to install `tesseract` with your desired language on your system,
76
78
  doc.to_pdf
77
79
 
78
80
  # How to merge files into a single PDF:
79
- filepaths = []
80
- documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
81
- merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
81
+ # The files can be images or other PDFs
82
+ file_paths = []
83
+ merged_document = ::HexaPDF::Document.new
84
+ documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.insert_image(merged_document, path) }
82
85
  OcrFile::ImageEngines::PdfEngine.save_pdf(merged_document, save_file_path, optimise: true)
83
86
  ```
84
87
 
@@ -120,6 +123,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
120
123
  - Improve console output
121
124
  - Fix spaces in file names
122
125
  - Better verbosity
126
+ - Docker
127
+ - pdftk / pdf merge for text and bookmarks etc ...
128
+ - https://github.com/tesseract-ocr/tesseract/issues/660
129
+ - tesseract -c naked_pdf=true
130
+ -
123
131
 
124
132
  ### Tests
125
133
  To run tests execute:
@@ -29,9 +29,11 @@ module OcrFile
29
29
  optimise_pdf: true,
30
30
  extract_pdf_images: true, # if false will screenshot each PDF page
31
31
  temp_filename_prefix: 'image',
32
+ spelling_correction: true,
33
+ keep_files: false,
32
34
  # Console Output
33
35
  verbose: true,
34
- timing: true,
36
+ timing: true
35
37
  }
36
38
 
37
39
  attr_reader :original_file_path,
@@ -64,7 +66,7 @@ module OcrFile
64
66
 
65
67
  def image?
66
68
  return false if pdf?
67
- ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
69
+ ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}") }
68
70
  end
69
71
 
70
72
  # Treat anything which isnt a PDF or image as text
@@ -75,7 +77,7 @@ module OcrFile
75
77
  # Trigger OCR pipeline
76
78
  def to_pdf
77
79
  @start_time = Time.now
78
- find_best_image_processing if config[:automatic_reprocess] && !text?
80
+ find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
79
81
 
80
82
  if pdf?
81
83
  ocr_pdf_to_searchable_pdf
@@ -117,7 +119,8 @@ module OcrFile
117
119
  end
118
120
 
119
121
  def close
120
- # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
122
+ return if keep_files?
123
+ ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
121
124
  end
122
125
 
123
126
  private
@@ -141,6 +144,10 @@ module OcrFile
141
144
  end
142
145
  end
143
146
 
147
+ def keep_files?
148
+ config['keep_files']
149
+ end
150
+
144
151
  def create_temp_folder
145
152
  date = Time.now.to_s.split(' ').first
146
153
 
@@ -171,6 +178,7 @@ module OcrFile
171
178
  pdfs_to_merge = []
172
179
 
173
180
  image_paths.each do |image_path|
181
+ puts image_path
174
182
  pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
175
183
  end
176
184
 
@@ -182,6 +190,8 @@ module OcrFile
182
190
 
183
191
  def text_to_pdf
184
192
  text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
193
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
194
+
185
195
  pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
186
196
 
187
197
  OcrFile::ImageEngines::PdfEngine
@@ -189,7 +199,7 @@ module OcrFile
189
199
  end
190
200
 
191
201
  def ocr_image_to_pdf
192
- find_best_image_processing if config[:automatic_reprocess]
202
+ find_best_image_processing(save: false) if config[:automatic_reprocess]
193
203
 
194
204
  pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
195
205
  OcrFile::ImageEngines::PdfEngine
@@ -203,7 +213,11 @@ module OcrFile
203
213
  text = ''
204
214
 
205
215
  image_paths.each do |image_path|
206
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
216
+ puts image_path
217
+ text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
218
+
219
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
220
+ text = "#{text}#{PAGE_BREAK}#{text}"
207
221
  end
208
222
 
209
223
  if save
@@ -215,7 +229,9 @@ module OcrFile
215
229
 
216
230
  def ocr_image_to_text(save:)
217
231
  create_temp_folder
232
+
218
233
  text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
234
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
219
235
 
220
236
  if save
221
237
  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -225,7 +241,7 @@ module OcrFile
225
241
  end
226
242
 
227
243
  def ocr_file_to_text(save:)
228
- if pdf? &&
244
+ if pdf?
229
245
  ocr_pdf_to_text(save: save)
230
246
  else # is an image
231
247
  ocr_image_to_text(save: save)
@@ -233,15 +249,29 @@ module OcrFile
233
249
  end
234
250
 
235
251
  def find_best_image_processing(save:)
236
- ocr_file_to_text(save: save) if !config[:automatic_reprocess]
252
+ ocr_file_to_text(save: save) unless config[:automatic_reprocess]
237
253
 
238
254
  text = ''
255
+ best_text_count = 0
256
+ best_effects = config[:effects]
257
+
239
258
  effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
240
259
  effects_to_test.each do |effect|
241
- config[:effects] = config[:effects] - [effect]
260
+ text = test_ocr_settings(effect)
261
+ processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
262
+
263
+ if processed_result.count_of_issues < best_text_count
264
+ best_text_count = processed_result.count_of_issues
265
+ best_effects = config[:effects]
266
+ end
267
+
268
+ break if processed_result.valid_words?
269
+ end
242
270
 
271
+ # Fallback
272
+ if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
273
+ config[:effects] = best_effects
243
274
  text = ocr_file_to_text(save: false)
244
- break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
245
275
  end
246
276
 
247
277
  # Adds in extra operations which is unfortunately inefficient
@@ -252,6 +282,11 @@ module OcrFile
252
282
  end
253
283
  end
254
284
 
285
+ def test_ocr_settings(effect)
286
+ config[:effects] = config[:effects] - [effect]
287
+ ocr_file_to_text(save: false)
288
+ end
289
+
255
290
  def print_time
256
291
  puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
257
292
  end
@@ -5,7 +5,7 @@ module OcrFile
5
5
  # Conversion of image types
6
6
  # Rotation and detection of skew
7
7
 
8
- attr_reader :image_path, :image, :temp_path, :save_file_path, :config
8
+ attr_reader :image_path, :image, :temp_path, :save_file_path, :config, :width, :height
9
9
 
10
10
  def initialize(image_path:, temp_path:, save_file_path:, config:)
11
11
  @image_path = image_path
@@ -22,6 +22,9 @@ module OcrFile
22
22
  # end
23
23
 
24
24
  @image = MiniMagick::Image.open(image_path)
25
+
26
+ @width = @image[:width]
27
+ @height = @image[:height]
25
28
  end
26
29
 
27
30
  def convert!
@@ -39,6 +42,10 @@ module OcrFile
39
42
  @save_file_path
40
43
  end
41
44
 
45
+ def resize(width, height)
46
+ @image.resize("#{width}x#{height}")
47
+ end
48
+
42
49
  # Effects
43
50
  # http://www.imagemagick.org/script/command-line-options.php
44
51
  def bw
@@ -61,11 +61,53 @@ module OcrFile
61
61
  image_paths
62
62
  end
63
63
 
64
+ def insert_image(document, image_path, dimensions: nil)
65
+ image_processor = OcrFile::ImageEngines::ImageMagick.new(
66
+ image_path: image_path,
67
+ temp_path: @temp_folder_path,
68
+ save_file_path: '',
69
+ config: @config
70
+ )
71
+
72
+ if dimensions
73
+ width = dimensions[0]
74
+ height = dimensions[1]
75
+ else
76
+ width = image_processor.width
77
+ height = image_processor.height
78
+ end
79
+
80
+ page = document.pages.add([0, 0, width, height])
81
+ page.canvas.image(@image || image_path, at: [0, 0], width: width, height: height)
82
+ end
83
+
84
+ def combine(text, pdf_of_images)
85
+ return unless pdf_of_images.is_a?(::HexaPDF::Document)
86
+
87
+ if text.is_a?(::HexaPDF::Document)
88
+ pages_of_text = text.pages
89
+ else # Assume raw text with PAGE_BREAK
90
+ pages_of_text = text.split(PAGE_BREAK)
91
+ end
92
+
93
+ return unless pages_of_text.size == pdf_of_images.pages.size
94
+
95
+ if text.is_a?(::HexaPDF::Document) # Keep the page structure
96
+
97
+ else # Just text to embed
98
+
99
+ end
100
+ end
101
+
64
102
  def merge(documents)
65
103
  target = ::HexaPDF::Document.new
66
104
 
67
105
  documents.each do |document|
68
- document.pages.each { |page| target.pages << target.import(page) }
106
+ if document.is_a?(::HexaPDF::Document)
107
+ document.pages.each { |page| target.pages << target.import(page) }
108
+ else # Assume an image
109
+ insert_image(target, document)
110
+ end
69
111
  end
70
112
 
71
113
  target
@@ -1,33 +1,81 @@
1
1
  module OcrFile
2
2
  module TextEngines
3
3
  class ResultProcessor
4
- MINIMUM_WORD_LENGTH = 3
4
+ MINIMUM_WORD_LENGTH = 4
5
+ ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
6
+ ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
7
+
8
+ # REGEX
9
+ ASCII_ONLY = /[^\u{0000}-\u{007f}]/
10
+ NOISE_CHARACTERS = /[^\w\s\/-;:]/
11
+ DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
12
+ EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
5
13
 
6
14
  attr_reader :text, :clear_text
7
15
 
8
16
  def initialize(text)
9
17
  @text = text
10
- @clear_text = remove_lines
18
+ @clear_text = generate_clear_text || text || ''
19
+ end
20
+
21
+ def correct
22
+ Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
11
23
  end
12
24
 
13
25
  # This is a very naive way of determining if we should re-do OCR with
14
26
  # shifted options
15
27
  def valid_words?
16
- word_size_average >= MINIMUM_WORD_LENGTH
28
+ word_size_average >= MINIMUM_WORD_LENGTH &&
29
+ spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
30
+ unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
31
+ end
32
+
33
+ def invalid_words?
34
+ !valid_words?
17
35
  end
18
36
 
19
37
  def word_count
20
- @_word_count ||= clear_text.split(' ').size
38
+ return 0 if empty_text?
39
+ @_word_count ||= clear_words.size
21
40
  end
22
41
 
23
42
  def word_size_average
24
- @_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
43
+ return 0 if empty_text?
44
+ @_word_size_average ||= clear_words.map(&:size).sum / word_count
45
+ end
46
+
47
+ # Assume English
48
+ def unidentified_word_count
49
+ clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
50
+ end
51
+
52
+ def spelling_error_count
53
+ Spellchecker.check(clear_text).count
54
+ end
55
+
56
+ def count_of_issues
57
+ spelling_error_count + unidentified_word_count
25
58
  end
26
59
 
27
60
  private
28
61
 
62
+ def empty_text?
63
+ clear_text.nil? || clear_text == ''
64
+ end
65
+
66
+ def clear_words
67
+ @clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
68
+ end
69
+
70
+ def generate_clear_text
71
+ remove_lines
72
+ &.gsub(ASCII_ONLY, '')
73
+ &.gsub(NOISE_CHARACTERS, '')
74
+ &.gsub(DUPLICATE_WORDS, '')
75
+ end
76
+
29
77
  def remove_lines
30
- text.gsub("\n", ' ').gsub("\r", ' ').gsub(' ', '')
78
+ text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
31
79
  end
32
80
  end
33
81
  end
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.8"
3
3
  end
data/lib/ocr-file.rb CHANGED
@@ -2,6 +2,7 @@ require 'hexapdf'
2
2
  require 'hexapdf/cli/images'
3
3
  require 'rtesseract'
4
4
  require 'mini_magick'
5
+ require 'ruby-spellchecker'
5
6
 
6
7
  require 'ocr-file/version'
7
8
 
data/ocr-file.gemspec CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
32
32
  spec.add_dependency "hexapdf", "~> 0.23.0"
33
33
  spec.add_dependency "rtesseract", "~> 3.1.2"
34
34
  spec.add_dependency "mini_magick", "~> 4.11.0"
35
+ spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
35
36
 
36
37
  # Development Dependencies
37
38
  spec.add_development_dependency "pry", "~> 0.14.1"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-20 00:00:00.000000000 Z
11
+ date: 2022-07-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: console-style
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: 4.11.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: ruby-spellchecker
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.5
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.1.5
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: pry
85
99
  requirement: !ruby/object:Gem::Requirement