ocr-file 0.0.4 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
4
- data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
3
+ metadata.gz: 9ae0f4940b34df3280221cf8b26d86ba3498f8344ef5f0e27ea335ca651a8906
4
+ data.tar.gz: 5e790899721d25bb0f4dc0e8e276b39b62bbb2803549fdbc8ba148804885bec0
5
5
  SHA512:
6
- metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
7
- data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
6
+ metadata.gz: 6cd016ca7bba37866579cad59f01f41d190c0a191cd1ce27fa7037646da7bf4962923664c7b6295655936aed8714fac01b08301be65fdfef68403c8dd12c075b
7
+ data.tar.gz: f1581713a76e19f1b24d43f030cccbfb32b206bea8d1a5f07fed26fe4e0cfaa3f991c0c35b98bf1f222ca36b143e83700638ecf3b0520b9663d2fe4336cc5da2
data/Gemfile.lock CHANGED
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ocr-file (0.0.4)
4
+ ocr-file (0.0.8)
5
5
  active_attr (~> 0.15.4)
6
6
  console-style (~> 0.0.1)
7
7
  hexapdf (~> 0.23.0)
8
8
  mini_magick (~> 4.11.0)
9
9
  rtesseract (~> 3.1.2)
10
+ ruby-spellchecker (~> 0.1.5)
10
11
 
11
12
  GEM
12
13
  remote: https://rubygems.org/
@@ -60,7 +61,7 @@ GEM
60
61
  coderay (~> 1.1)
61
62
  method_source (~> 1.0)
62
63
  racc (1.6.0)
63
- rack (2.2.3.1)
64
+ rack (2.2.4)
64
65
  rack-test (1.1.0)
65
66
  rack (>= 1.0, < 3)
66
67
  rails-dom-testing (2.0.3)
@@ -69,6 +70,7 @@ GEM
69
70
  rails-html-sanitizer (1.4.3)
70
71
  loofah (~> 2.3)
71
72
  rtesseract (3.1.2)
73
+ ruby-spellchecker (0.1.5)
72
74
  tzinfo (2.0.4)
73
75
  concurrent-ruby (~> 1.0)
74
76
 
data/README.md CHANGED
@@ -49,9 +49,11 @@ You will need to install `tesseract` with your desired language on your system,
49
49
  optimise_pdf: true,
50
50
  extract_pdf_images: true, # if false will screenshot each PDF page
51
51
  temp_filename_prefix: 'image',
52
+ spelling_correction: true, # Will attempt to fix text at the end (not used for searchable pdf output)
53
+ keep_files: false,
52
54
  # Console Output
53
55
  verbose: true,
54
- timing: true,
56
+ timing: true
55
57
  }
56
58
 
57
59
  doc = OcrFile::Document.new(
@@ -76,9 +78,10 @@ You will need to install `tesseract` with your desired language on your system,
76
78
  doc.to_pdf
77
79
 
78
80
  # How to merge files into a single PDF:
79
- filepaths = []
80
- documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
81
- merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
81
+ # The files can be images or other PDFs
82
+ file_paths = []
83
+ merged_document = ::HexaPDF::Document.new
84
+ documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.insert_image(merged_document, path) }
82
85
  OcrFile::ImageEngines::PdfEngine.save_pdf(merged_document, save_file_path, optimise: true)
83
86
  ```
84
87
 
@@ -120,6 +123,11 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
120
123
  - Improve console output
121
124
  - Fix spaces in file names
122
125
  - Better verbosity
126
+ - Docker
127
+ - pdftk / pdf merge for text and bookmarks etc ...
128
+ - https://github.com/tesseract-ocr/tesseract/issues/660
129
+ - tesseract -c naked_pdf=true
130
+ -
123
131
 
124
132
  ### Tests
125
133
  To run tests execute:
@@ -29,9 +29,11 @@ module OcrFile
29
29
  optimise_pdf: true,
30
30
  extract_pdf_images: true, # if false will screenshot each PDF page
31
31
  temp_filename_prefix: 'image',
32
+ spelling_correction: true,
33
+ keep_files: false,
32
34
  # Console Output
33
35
  verbose: true,
34
- timing: true,
36
+ timing: true
35
37
  }
36
38
 
37
39
  attr_reader :original_file_path,
@@ -64,7 +66,7 @@ module OcrFile
64
66
 
65
67
  # Reports whether the original file should be handled as an image.
  # Always false when pdf? is true; otherwise true when the lower-cased
  # @original_file_path contains ".<type>" for any entry of ACCEPTED_IMAGE_TYPES.
  # Note this is a substring check on the whole path, not a strict extension match.
  def image?
66
68
  return false if pdf?
67
- ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
69
+ ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}") }
68
70
  end
69
71
 
70
72
  # Treat anything which isn't a PDF or image as text
@@ -75,7 +77,7 @@ module OcrFile
75
77
  # Trigger OCR pipeline
76
78
  def to_pdf
77
79
  @start_time = Time.now
78
- find_best_image_processing if config[:automatic_reprocess] && !text?
80
+ find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
79
81
 
80
82
  if pdf?
81
83
  ocr_pdf_to_searchable_pdf
@@ -117,7 +119,8 @@ module OcrFile
117
119
  end
118
120
 
119
121
  # Cleans up after processing: returns early when keep_files? is truthy,
  # otherwise passes @temp_folder_path to ::OcrFile::FileHelpers.clear_folder.
  def close
120
- # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
122
+ return if keep_files?
123
+ ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
121
124
  end
122
125
 
123
126
  private
@@ -141,6 +144,10 @@ module OcrFile
141
144
  end
142
145
  end
143
146
 
147
# Whether the temporary working files should be kept once processing is done.
#
# The option is looked up under the symbol key :keep_files, which is how the
# default config and every other option read in this class are keyed. The
# string key 'keep_files' is still accepted as a fallback so that a config
# built with string keys keeps working.
#
# @return [Boolean, nil] truthy when temp files must be preserved
def keep_files?
  config[:keep_files] || config['keep_files']
end
150
+
144
151
  def create_temp_folder
145
152
  date = Time.now.to_s.split(' ').first
146
153
 
@@ -171,6 +178,7 @@ module OcrFile
171
178
  pdfs_to_merge = []
172
179
 
173
180
  image_paths.each do |image_path|
181
+ puts image_path
174
182
  pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
175
183
  end
176
184
 
@@ -182,6 +190,8 @@ module OcrFile
182
190
 
183
191
  def text_to_pdf
184
192
  text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
193
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
194
+
185
195
  pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
186
196
 
187
197
  OcrFile::ImageEngines::PdfEngine
@@ -189,7 +199,7 @@ module OcrFile
189
199
  end
190
200
 
191
201
  def ocr_image_to_pdf
192
- find_best_image_processing if config[:automatic_reprocess]
202
+ find_best_image_processing(save: false) if config[:automatic_reprocess]
193
203
 
194
204
  pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
195
205
  OcrFile::ImageEngines::PdfEngine
@@ -203,7 +213,11 @@ module OcrFile
203
213
  text = ''
204
214
 
205
215
  image_paths.each do |image_path|
206
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
216
+ puts image_path
217
+ text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
218
+
219
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
220
+ text = "#{text}#{PAGE_BREAK}#{text}"
207
221
  end
208
222
 
209
223
  if save
@@ -215,7 +229,9 @@ module OcrFile
215
229
 
216
230
  def ocr_image_to_text(save:)
217
231
  create_temp_folder
232
+
218
233
  text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
234
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
219
235
 
220
236
  if save
221
237
  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -225,7 +241,7 @@ module OcrFile
225
241
  end
226
242
 
227
243
  def ocr_file_to_text(save:)
228
- if pdf? &&
244
+ if pdf?
229
245
  ocr_pdf_to_text(save: save)
230
246
  else # is an image
231
247
  ocr_image_to_text(save: save)
@@ -233,15 +249,29 @@ module OcrFile
233
249
  end
234
250
 
235
251
  def find_best_image_processing(save:)
236
- ocr_file_to_text(save: save) if !config[:automatic_reprocess]
252
+ ocr_file_to_text(save: save) unless config[:automatic_reprocess]
237
253
 
238
254
  text = ''
255
+ best_text_count = 0
256
+ best_effects = config[:effects]
257
+
239
258
  effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
240
259
  effects_to_test.each do |effect|
241
- config[:effects] = config[:effects] - [effect]
260
+ text = test_ocr_settings(effect)
261
+ processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
262
+
263
+ if processed_result.count_of_issues < best_text_count
264
+ best_text_count = processed_result.count_of_issues
265
+ best_effects = config[:effects]
266
+ end
267
+
268
+ break if processed_result.valid_words?
269
+ end
242
270
 
271
+ # Fallback
272
+ if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
273
+ config[:effects] = best_effects
243
274
  text = ocr_file_to_text(save: false)
244
- break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
245
275
  end
246
276
 
247
277
  # Adds in extra operations which is unfortunately inefficient
@@ -252,6 +282,11 @@ module OcrFile
252
282
  end
253
283
  end
254
284
 
285
# Runs one OCR pass with `effect` removed from the configured effects.
# The removal is written back to config[:effects], so it persists after the call.
# Returns whatever ocr_file_to_text(save: false) returns.
+ def test_ocr_settings(effect)
286
+ config[:effects] = config[:effects] - [effect]
287
+ ocr_file_to_text(save: false)
288
+ end
289
+
255
290
  def print_time
256
291
  puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
257
292
  end
@@ -5,7 +5,7 @@ module OcrFile
5
5
  # Conversion of image types
6
6
  # Rotation and detection of skew
7
7
 
8
- attr_reader :image_path, :image, :temp_path, :save_file_path, :config
8
+ attr_reader :image_path, :image, :temp_path, :save_file_path, :config, :width, :height
9
9
 
10
10
  def initialize(image_path:, temp_path:, save_file_path:, config:)
11
11
  @image_path = image_path
@@ -22,6 +22,9 @@ module OcrFile
22
22
  # end
23
23
 
24
24
  @image = MiniMagick::Image.open(image_path)
25
+
26
+ @width = @image[:width]
27
+ @height = @image[:height]
25
28
  end
26
29
 
27
30
  def convert!
@@ -39,6 +42,10 @@ module OcrFile
39
42
  @save_file_path
40
43
  end
41
44
 
45
# Calls resize on @image with a "<width>x<height>" geometry string built from
# the two arguments, and returns the result of that call.
# NOTE(review): @image appears to be the MiniMagick::Image opened in initialize — confirm.
+ def resize(width, height)
46
+ @image.resize("#{width}x#{height}")
47
+ end
48
+
42
49
  # Effects
43
50
  # http://www.imagemagick.org/script/command-line-options.php
44
51
  def bw
@@ -61,11 +61,53 @@ module OcrFile
61
61
  image_paths
62
62
  end
63
63
 
64
# Appends a new page to `document` and draws an image on it at the origin,
# sized to fill the page.
#
# document   - object whose pages.add is called with [0, 0, width, height].
# image_path - path handed to OcrFile::ImageEngines::ImageMagick and, when
#              @image is nil/false, also to page.canvas.image.
# dimensions - optional; when given, its elements [0] and [1] are used as
#              width and height. Otherwise the width/height readers of the
#              ImageMagick wrapper are used.
#
# Returns the result of page.canvas.image.
# NOTE(review): @temp_folder_path, @config and @image are not assigned in the
# visible part of this file — confirm they are set before this is called.
+ def insert_image(document, image_path, dimensions: nil)
65
+ image_processor = OcrFile::ImageEngines::ImageMagick.new(
66
+ image_path: image_path,
67
+ temp_path: @temp_folder_path,
68
+ save_file_path: '',
69
+ config: @config
70
+ )
71
+
72
+ if dimensions
73
+ width = dimensions[0]
74
+ height = dimensions[1]
75
+ else
76
+ width = image_processor.width
77
+ height = image_processor.height
78
+ end
79
+
80
+ page = document.pages.add([0, 0, width, height])
81
+ page.canvas.image(@image || image_path, at: [0, 0], width: width, height: height)
82
+ end
83
+
84
# Unfinished stub for pairing OCR text with a PDF of page images.
#
# It returns nil early unless pdf_of_images is a ::HexaPDF::Document, then
# derives one entry per page from `text` (text.pages for a HexaPDF document,
# otherwise text.split(PAGE_BREAK)) and returns nil again when the page counts
# differ. Both branches of the final if/else are empty, so every path through
# the method currently returns nil and nothing is combined yet.
+ def combine(text, pdf_of_images)
85
+ return unless pdf_of_images.is_a?(::HexaPDF::Document)
86
+
87
+ if text.is_a?(::HexaPDF::Document)
88
+ pages_of_text = text.pages
89
+ else # Assume raw text with PAGE_BREAK
90
+ pages_of_text = text.split(PAGE_BREAK)
91
+ end
92
+
93
+ return unless pages_of_text.size == pdf_of_images.pages.size
94
+
95
+ if text.is_a?(::HexaPDF::Document) # Keep the page structure
96
+
97
+ else # Just text to embed
98
+
99
+ end
100
+ end
101
+
64
102
  def merge(documents)
65
103
  target = ::HexaPDF::Document.new
66
104
 
67
105
  documents.each do |document|
68
- document.pages.each { |page| target.pages << target.import(page) }
106
+ if document.is_a?(::HexaPDF::Document)
107
+ document.pages.each { |page| target.pages << target.import(page) }
108
+ else # Assume an image
109
+ insert_image(target, document)
110
+ end
69
111
  end
70
112
 
71
113
  target
@@ -1,33 +1,81 @@
1
1
  module OcrFile
2
2
  module TextEngines
3
3
  class ResultProcessor
4
- MINIMUM_WORD_LENGTH = 3
4
+ MINIMUM_WORD_LENGTH = 4
5
+ ACCEPTABLE_NUMBER_OF_ERRORS = 8 # Random number I pulled out of nowhere
6
+ ACCEPTABLE_UNIDENTIFIED_WORDS = 8 # Random number I pulled out of nowhere
7
+
8
+ # REGEX
9
+ ASCII_ONLY = /[^\u{0000}-\u{007f}]/
10
+ NOISE_CHARACTERS = /[^\w\s\/-;:]/
11
+ DUPLICATE_WORDS = /\b(\w+)\s+\1\b/
12
+ EVERYTHING_BUT_CHARACTERS = /[^\w\s]|(\d)/
5
13
 
6
14
  attr_reader :text, :clear_text
7
15
 
8
16
  def initialize(text)
9
17
  @text = text
10
- @clear_text = remove_lines
18
+ @clear_text = generate_clear_text || text || ''
19
+ end
20
+
21
+ def correct
22
+ Spellchecker.correct(text.gsub(NOISE_CHARACTERS, '')).gsub("\n ", "\n").strip
11
23
  end
12
24
 
13
25
  # This is a very naive way of determining if we should re-do OCR with
14
26
  # shifted options
15
27
  def valid_words?
16
- word_size_average >= MINIMUM_WORD_LENGTH
28
+ word_size_average >= MINIMUM_WORD_LENGTH &&
29
+ spelling_error_count <= ACCEPTABLE_NUMBER_OF_ERRORS &&
30
+ unidentified_word_count <= ACCEPTABLE_UNIDENTIFIED_WORDS
31
+ end
32
+
33
+ def invalid_words?
34
+ !valid_words?
17
35
  end
18
36
 
19
37
  def word_count
20
- @_word_count ||= clear_text.split(' ').size
38
+ return 0 if empty_text?
39
+ @_word_count ||= clear_words.size
21
40
  end
22
41
 
23
42
  def word_size_average
24
- @_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
43
+ return 0 if empty_text?
44
+ @_word_size_average ||= clear_words.map(&:size).sum / word_count
45
+ end
46
+
47
+ # Assume English
48
+ def unidentified_word_count
49
+ clear_words.reject { |word| Spellchecker::Dictionaries::EnglishWords.include?(word) }.count
50
+ end
51
+
52
+ def spelling_error_count
53
+ Spellchecker.check(clear_text).count
54
+ end
55
+
56
+ def count_of_issues
57
+ spelling_error_count + unidentified_word_count
25
58
  end
26
59
 
27
60
  private
28
61
 
62
+ def empty_text?
63
+ clear_text.nil? || clear_text == ''
64
+ end
65
+
66
+ def clear_words
67
+ @clear_words ||= clear_text.gsub(EVERYTHING_BUT_CHARACTERS, '').split(' ')
68
+ end
69
+
70
+ def generate_clear_text
71
+ remove_lines
72
+ &.gsub(ASCII_ONLY, '')
73
+ &.gsub(NOISE_CHARACTERS, '')
74
+ &.gsub(DUPLICATE_WORDS, '')
75
+ end
76
+
29
77
  def remove_lines
30
- text.gsub("\n", ' ').gsub("\r", ' ').gsub(' ', '')
78
+ text&.gsub("\n", ' ')&.gsub("\r", ' ')&.gsub(' ', '')
31
79
  end
32
80
  end
33
81
  end
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.8"
3
3
  end
data/lib/ocr-file.rb CHANGED
@@ -2,6 +2,7 @@ require 'hexapdf'
2
2
  require 'hexapdf/cli/images'
3
3
  require 'rtesseract'
4
4
  require 'mini_magick'
5
+ require 'ruby-spellchecker'
5
6
 
6
7
  require 'ocr-file/version'
7
8
 
data/ocr-file.gemspec CHANGED
@@ -32,6 +32,7 @@ Gem::Specification.new do |spec|
32
32
  spec.add_dependency "hexapdf", "~> 0.23.0"
33
33
  spec.add_dependency "rtesseract", "~> 3.1.2"
34
34
  spec.add_dependency "mini_magick", "~> 4.11.0"
35
+ spec.add_dependency "ruby-spellchecker", "~> 0.1.5"
35
36
 
36
37
  # Development Dependencies
37
38
  spec.add_development_dependency "pry", "~> 0.14.1"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-20 00:00:00.000000000 Z
11
+ date: 2022-07-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: console-style
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: 4.11.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: ruby-spellchecker
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.5
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.1.5
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: pry
85
99
  requirement: !ruby/object:Gem::Requirement