ocr-file 0.0.1 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6b558ce36c35e74b410f42928eae1a987485d1bbd64da77750574062bc05b91e
4
- data.tar.gz: d906c620a02c5a2d139b3d89d05e9b3872ee6c929b4aa661b20f9033d8f3605f
3
+ metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
4
+ data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
5
5
  SHA512:
6
- metadata.gz: 81049908609ba3d622be2b6f99dabeca2960a455fa3d56ee1fca4c177c2ee4365281421c1128ec3fa5476d068daa53b3d7f7600c5fd1c31fcb5834ca688f9747
7
- data.tar.gz: 1a7dcd56a7196694371abf70633635545138bdc7bc0af2873fc5e7c22bdbfc97e9986ba1a18afc288b24e488caf999b644ec1f9d8889ce6e5efa6fcfe776c204
6
+ metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
7
+ data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ocr-file (0.0.1)
4
+ ocr-file (0.0.4)
5
5
  active_attr (~> 0.15.4)
6
6
  console-style (~> 0.0.1)
7
7
  hexapdf (~> 0.23.0)
data/README.md CHANGED
@@ -42,15 +42,16 @@ You will need to install `tesseract` with your desired language on your system,
42
42
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
43
43
  ocr_engine: 'tesseract', # 'cloud-vision'
44
44
  # Image Pre-Processing
45
- image_pre_preprocess: true,
46
- effects: ['bw', 'norm'],
47
- threshold: 0.25,
45
+ image_preprocess: true,
46
+ effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
47
+ automatic_reprocess: true, # May more than double the number of operations, but can produce better results automatically
48
48
  # PDF to Image Processing
49
49
  optimise_pdf: true,
50
50
  extract_pdf_images: true, # if false will screenshot each PDF page
51
51
  temp_filename_prefix: 'image',
52
52
  # Console Output
53
53
  verbose: true,
54
+ timing: true,
54
55
  }
55
56
 
56
57
  doc = OcrFile::Document.new(
@@ -84,7 +85,23 @@ You will need to install `tesseract` with your desired language on your system,
84
85
  ### Notes / Tips
85
86
  Set `extract_pdf_images` to `false` for higher quality OCR. However this will consume more temporary space per PDF page and also be considerably slower.
86
87
 
87
- Image pre-processing is not yet implemented.
88
+ Image pre-processing only thresholds (bw), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. This makes the end result black and white but gives far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
89
+
90
+ `automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
91
+
92
+ ### Simple CLI
93
+ Once installed you can use `ocr-file` as a CLI. It's currently a reduced set of options. These are subject to change in future versions.
94
+
95
+ ```
96
+ # Basic Usage with console output
97
+ ocr-file input_file_path output_folder_path
98
+
99
+ # Output to PDF
100
+ ocr-file input_file_path output_folder_path pdf
101
+
102
+ # Output to TXT
103
+ ocr-file input_file_path output_folder_path txt
104
+ ```
88
105
 
89
106
  ## Development
90
107
 
@@ -94,14 +111,15 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
94
111
 
95
112
  ### TODOs
96
113
  - input validation
97
- - CLI
98
- - image processing
114
+ - Better CLI
99
115
  - password
100
116
  - Base64 encoding
101
117
  - requirements checking (installed dependencies etc ...)
102
118
  - Tests
103
119
  - Configurable temp folder cleanup
104
120
  - Improve console output
121
+ - Fix spaces in file names
122
+ - Better verbosity
105
123
 
106
124
  ### Tests
107
125
  To run tests execute:
data/bin/ocr-file CHANGED
@@ -2,4 +2,4 @@
2
2
 
3
3
  require 'ocr-file'
4
4
 
5
- puts "Hello, world!"
5
+ OcrFile::Cli.new(ARGV).call
data/lib/ocr-file/cli.rb CHANGED
@@ -1,5 +1,41 @@
1
1
  module OcrFile
2
- module Cli
2
+ class Cli
3
+ attr_reader :args
3
4
 
5
+ def initialize(args)
6
+ @args = args
7
+ end
8
+
9
+ def valid?
10
+ return true if args.size == 2 || args.size == 3
11
+ false
12
+ end
13
+
14
+ def invalid?
15
+ !valid?
16
+ end
17
+
18
+ def call
19
+ # TODO: Use ConsoleStyle::Functions
20
+ # TODO: Heading and better CLI interface
21
+ # Simple cli for now
22
+ puts "OCR Tool © Jason Chalom 2022, Version: #{OcrFile::VERSION}"
23
+ abort "File path, Save Folder Paths, and output type (pdf, txt) are required!" if invalid?
24
+
25
+ # Using default config for now
26
+ original_file_path = args[0]
27
+ save_file_path = args[1]
28
+ output_type = args[2]
29
+
30
+ document = OcrFile::Document.new(original_file_path: original_file_path, save_file_path: save_file_path)
31
+
32
+ if output_type.to_s.downcase.include?('pdf')
33
+ document.to_pdf
34
+ elsif output_type.to_s.downcase.include?('txt') || output_type.to_s.downcase.include?('text')
35
+ document.to_text
36
+ else # Display in console
37
+ puts document.to_s
38
+ end
39
+ end
4
40
  end
5
41
  end
@@ -1,7 +1,11 @@
1
1
  module OcrFile
2
2
  class Document
3
+ # TODO: Skewness / text orientation detection
4
+ # TODO: Better handwriting analysis
5
+
3
6
  ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
4
7
  PAGE_BREAK = "\n\r\n" # TODO: Make configurable
8
+ EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
5
9
  DEFAULT_CONFIG = {
6
10
  # Images from PDF
7
11
  filetype: 'png',
@@ -18,15 +22,16 @@ module OcrFile
18
22
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
19
23
  ocr_engine: 'tesseract', # 'cloud-vision'
20
24
  # Image Pre-Processing
21
- image_pre_preprocess: true,
22
- effects: ['bw', 'norm'],
23
- threshold: 0.25,
25
+ image_preprocess: true,
26
+ effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
27
+ automatic_reprocess: true,
24
28
  # PDF to Image Processing
25
29
  optimise_pdf: true,
26
30
  extract_pdf_images: true, # if false will screenshot each PDF page
27
31
  temp_filename_prefix: 'image',
28
32
  # Console Output
29
33
  verbose: true,
34
+ timing: true,
30
35
  }
31
36
 
32
37
  attr_reader :original_file_path,
@@ -34,7 +39,9 @@ module OcrFile
34
39
  :save_file_path,
35
40
  :final_save_file,
36
41
  :config,
37
- :ocr_engine
42
+ :ocr_engine,
43
+ :start_time,
44
+ :end_time
38
45
 
39
46
  # save_file_path will also generate a tmp path for tmp files. Expected folder path
40
47
  # TODO: Add in more input validation
@@ -52,12 +59,12 @@ module OcrFile
52
59
  end
53
60
 
54
61
  def pdf?
55
- @original_file_path.include?('.pdf')
62
+ @original_file_path.downcase.include?('.pdf')
56
63
  end
57
64
 
58
65
  def image?
59
66
  return false if pdf?
60
- ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.include?(".#{type}")}
67
+ ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
61
68
  end
62
69
 
63
70
  # Treat anything which isn't a PDF or image as text
@@ -65,74 +72,52 @@ module OcrFile
65
72
  !pdf? && !image?
66
73
  end
67
74
 
75
+ # Trigger OCR pipeline
68
76
  def to_pdf
69
- if pdf?
70
- create_temp_folder
71
- image_paths = extract_image_paths_from_pdf(@original_file_path)
72
-
73
- pdfs_to_merge = []
74
-
75
- image_paths.each do |image_path|
76
- pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config)
77
- end
78
-
79
- merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
77
+ @start_time = Time.now
78
+ find_best_image_processing if config[:automatic_reprocess] && !text?
80
79
 
81
- OcrFile::ImageEngines::PdfEngine
82
- .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
83
-
84
- close
80
+ if pdf?
81
+ ocr_pdf_to_searchable_pdf
85
82
  elsif text?
86
- text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
87
- pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
88
-
89
- OcrFile::ImageEngines::PdfEngine
90
- .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
83
+ text_to_pdf
91
84
  else # is an image
92
85
  ocr_image_to_pdf
93
86
  end
87
+
88
+ close
89
+
90
+ @end_time = Time.now
91
+ print_time
94
92
  end
95
93
 
96
94
  def to_text
97
- if pdf?
98
- create_temp_folder
99
- image_paths = extract_image_paths_from_pdf(@original_file_path)
95
+ @start_time = Time.now
96
+ return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
100
97
 
101
- image_paths.each do |image_path|
102
- text = @ocr_engine.ocr_to_text(image_path, options: @config)
103
- ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
104
- end
98
+ find_best_image_processing(save: true)
99
+ close
105
100
 
106
- close
107
- elsif text?
108
- ::OcrFile::FileHelpers.open_text_file(@original_file_path)
109
- else # is an image
110
- ocr_image_to_text(save: true)
111
- end
101
+ @end_time = Time.now
102
+ print_time
112
103
  end
113
104
 
114
105
  def to_s
115
- if pdf?
116
- create_temp_folder
117
- image_paths = extract_image_paths_from_pdf(@original_file_path)
106
+ @start_time = Time.now
107
+ return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
118
108
 
119
- text = ''
109
+ text = find_best_image_processing(save: false)
120
110
 
121
- image_paths.each do |image_path|
122
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
123
- end
111
+ close
124
112
 
125
- close
126
- text
127
- elsif text?
128
- ::OcrFile::FileHelpers.open_text_file(@original_file_path)
129
- else # is an image
130
- ocr_image_to_text(save: false)
131
- end
113
+ @end_time = Time.now
114
+ print_time
115
+
116
+ text
132
117
  end
133
118
 
134
119
  def close
135
- ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
120
+ # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
136
121
  end
137
122
 
138
123
  private
@@ -157,19 +142,80 @@ module OcrFile
157
142
  end
158
143
 
159
144
  def create_temp_folder
160
- # TODO: Make this a bit more robust
161
- @temp_folder_path = "#{save_file_path}/temp/".gsub(' ', '\ ')
145
+ date = Time.now.to_s.split(' ').first
146
+
147
+ @temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ')
162
148
  ::OcrFile::FileHelpers.make_directory(@temp_folder_path)
163
149
  end
164
150
 
151
+ def process_image(path)
152
+ return path unless @config[:image_preprocess]
153
+
154
+ create_temp_folder
155
+ save_file_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}"
156
+
157
+ image_processor = OcrFile::ImageEngines::ImageMagick.new(
158
+ image_path: path,
159
+ temp_path: @temp_folder_path,
160
+ save_file_path: save_file_path,
161
+ config: @config
162
+ )
163
+
164
+ image_processor.convert!
165
+ end
166
+
167
+ def ocr_pdf_to_searchable_pdf
168
+ create_temp_folder
169
+ image_paths = extract_image_paths_from_pdf(@original_file_path)
170
+
171
+ pdfs_to_merge = []
172
+
173
+ image_paths.each do |image_path|
174
+ pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
175
+ end
176
+
177
+ merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
178
+
179
+ OcrFile::ImageEngines::PdfEngine
180
+ .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
181
+ end
182
+
183
+ def text_to_pdf
184
+ text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
185
+ pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
186
+
187
+ OcrFile::ImageEngines::PdfEngine
188
+ .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
189
+ end
190
+
165
191
  def ocr_image_to_pdf
166
- pdf_document = @ocr_engine.ocr_to_pdf(@original_file_path, options: @config)
192
+ find_best_image_processing if config[:automatic_reprocess]
193
+
194
+ pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
167
195
  OcrFile::ImageEngines::PdfEngine
168
196
  .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
169
197
  end
170
198
 
171
- def ocr_image_to_text(save: true)
172
- text = @ocr_engine.ocr_to_text(@original_file_path, options: @config)
199
+ def ocr_pdf_to_text(save:)
200
+ create_temp_folder
201
+ image_paths = extract_image_paths_from_pdf(@original_file_path)
202
+
203
+ text = ''
204
+
205
+ image_paths.each do |image_path|
206
+ text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
207
+ end
208
+
209
+ if save
210
+ ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
211
+ else
212
+ text
213
+ end
214
+ end
215
+
216
+ def ocr_image_to_text(save:)
217
+ create_temp_folder
218
+ text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
173
219
 
174
220
  if save
175
221
  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -178,6 +224,38 @@ module OcrFile
178
224
  end
179
225
  end
180
226
 
227
+ def ocr_file_to_text(save:)
228
+ if pdf? &&
229
+ ocr_pdf_to_text(save: save)
230
+ else # is an image
231
+ ocr_image_to_text(save: save)
232
+ end
233
+ end
234
+
235
+ def find_best_image_processing(save:)
236
+ ocr_file_to_text(save: save) if !config[:automatic_reprocess]
237
+
238
+ text = ''
239
+ effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
240
+ effects_to_test.each do |effect|
241
+ config[:effects] = config[:effects] - [effect]
242
+
243
+ text = ocr_file_to_text(save: false)
244
+ break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
245
+ end
246
+
247
+ # Adds in extra operations which is unfortunately inefficient
248
+ if save
249
+ ocr_file_to_text(save: save)
250
+ else
251
+ text
252
+ end
253
+ end
254
+
255
+ def print_time
256
+ puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
257
+ end
258
+
181
259
  def find_ocr_engine(engine_id)
182
260
  ocr_engine_constants
183
261
  .map { |c| ocr_module(c) }
@@ -1,14 +1,80 @@
1
1
  module OcrFile
2
2
  module ImageEngines
3
- module ImageMagick
4
- extend self
5
-
3
+ class ImageMagick
6
4
  # TODO:
7
- # B/W
8
- # Contrast
9
- # Image Norm
10
- # Threshold
11
5
  # Conversion of image types
6
+ # Rotation and detection of skew
7
+
8
+ attr_reader :image_path, :image, :temp_path, :save_file_path, :config
9
+
10
+ def initialize(image_path:, temp_path:, save_file_path:, config:)
11
+ @image_path = image_path
12
+ @config = config
13
+ @save_file_path = save_file_path
14
+
15
+ @temp_path = temp_path
16
+
17
+ # Will be available in the next version of MiniMagick > 4.11.0
18
+ # https://github.com/minimagick/minimagick/pull/541
19
+ # MiniMagick.configure do |config|
20
+ # # cli_version graphicsmagick? imagemagick7? imagemagick? version
21
+ # config.tmpdir = File.join(Dir.tmpdir, @temp_path)
22
+ # end
23
+
24
+ @image = MiniMagick::Image.open(image_path)
25
+ end
26
+
27
+ def convert!
28
+ return @image_path unless @config[:image_preprocess]
29
+
30
+ @config[:effects].each do |effect|
31
+ self.send(effect.to_sym)
32
+ end
33
+
34
+ save!
35
+ end
36
+
37
+ def save!
38
+ image.write(@save_file_path)
39
+ @save_file_path
40
+ end
41
+
42
+ # Effects
43
+ # http://www.imagemagick.org/script/command-line-options.php
44
+ def bw
45
+ @image.alpha('off')
46
+ @image.auto_threshold("otsu")
47
+ end
48
+
49
+ def enhance
50
+ @image.enhance
51
+ end
52
+
53
+ def norm
54
+ @image.equalize
55
+ end
56
+
57
+ # Most likely not going to be configurable because
58
+ # these are aggressive parameters used to optimise OCR results
59
+ # and not the final results of the PDFs
60
+ def sharpen
61
+ @image.sharpen('0x4') # radiusXsigma
62
+ end
63
+
64
+ # https://github.com/ImageMagick/ImageMagick/discussions/4145
65
+ def remove_shadow
66
+ @image.negate
67
+ @image.lat("20x20+10\%")
68
+ @image.negate
69
+ end
70
+
71
+ def deskew
72
+ @image.deskew('40%') # threshold recommended in the docs
73
+ end
74
+
75
+ def despeckle
76
+ @image.despeckle
77
+ end
12
78
  end
13
79
  end
14
80
  end
@@ -13,7 +13,7 @@ module OcrFile
13
13
  print 'Generating screenshots of each PDF page ... '
14
14
 
15
15
  if filetype == 'jpg'
16
- `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
16
+ `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} "#{pdf_path}" "#{save_path}/#{filename}"`
17
17
  else
18
18
  `pdftoppm -#{filetype} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
19
19
  end
@@ -0,0 +1,34 @@
1
+ module OcrFile
2
+ module TextEngines
3
+ class ResultProcessor
4
+ MINIMUM_WORD_LENGTH = 3
5
+
6
+ attr_reader :text, :clear_text
7
+
8
+ def initialize(text)
9
+ @text = text
10
+ @clear_text = remove_lines
11
+ end
12
+
13
+ # This is a very naive way of determining if we should re-do OCR with
14
+ # shifted options
15
+ def valid_words?
16
+ word_size_average >= MINIMUM_WORD_LENGTH
17
+ end
18
+
19
+ def word_count
20
+ @_word_count ||= clear_text.split(' ').size
21
+ end
22
+
23
+ def word_size_average
24
+ @_word_size_average ||= clear_text.split(' ').map(&:size).sum / word_count
25
+ end
26
+
27
+ private
28
+
29
+ def remove_lines
30
+ text.gsub("\n", ' ').gsub("\r", ' ').gsub(' ', '')
31
+ end
32
+ end
33
+ end
34
+ end
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.4"
3
3
  end
data/lib/ocr-file.rb CHANGED
@@ -10,6 +10,7 @@ require 'ocr-file/image_engines/image_magick'
10
10
  require 'ocr-file/image_engines/pdftoppm'
11
11
  require 'ocr-file/ocr_engines/tesseract'
12
12
  require 'ocr-file/ocr_engines/cloud_vision'
13
+ require 'ocr-file/text_engines/result_processor'
13
14
  require 'ocr-file/file_helpers'
14
15
  require 'ocr-file/document'
15
16
  require 'ocr-file/cli'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-19 00:00:00.000000000 Z
11
+ date: 2022-06-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: console-style
@@ -122,6 +122,7 @@ files:
122
122
  - lib/ocr-file/image_engines/pdftoppm.rb
123
123
  - lib/ocr-file/ocr_engines/cloud_vision.rb
124
124
  - lib/ocr-file/ocr_engines/tesseract.rb
125
+ - lib/ocr-file/text_engines/result_processor.rb
125
126
  - lib/ocr-file/version.rb
126
127
  - ocr-file.gemspec
127
128
  homepage: https://github.com/TRex22/ocr-file