ocr-file 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0e67553a31e82eba190368040d3475b812e113aedfb9994484043dda34a55053
4
- data.tar.gz: 6fe5e142fef4387fc98fce57d3fdb2b7a0c37199d1712bd1d85dced9a0e61274
3
+ metadata.gz: 3d87398395a2568088acbca6274482e8773761515031dd15cf20669deaaf1d4a
4
+ data.tar.gz: 7133a6a7481ed3918e57d22fab7fc6e264f816f79d71a6b778e4ff8b51142c86
5
5
  SHA512:
6
- metadata.gz: e5d06cf54a8bc96c90522ab67530310730230067ee226f6eb1143adde2ccb407dde25aef7b595836478ee944e4e9b3ff306b4df5a08ec14ab6623ab08daefa8b
7
- data.tar.gz: 45a7c3d06908c878f281db9baf4ec82310ecde20e12cad5ff4cc03d2f271167d46fa52145fe598f594a3360a525c926d955bb08d17e740ba78f97ec72f0f4b47
6
+ metadata.gz: 84b8516623b126b2db7e5bd6c6d2f2110b38f0165bbd67f9bdf2bdfd5beb1157a294469c4078ea641445627644607a75a69479736db503770f7a6758b0ba4f23
7
+ data.tar.gz: 77fd0e1cadc1080b4a079a9d9f48ebe3999a6750b51b5f73a335369fc9b842bbcc347b545cbcc88562a5a2755aaf2af5b671da27e8293b4ee20abd557ebc6cbe
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ocr-file (0.0.2)
4
+ ocr-file (0.0.4)
5
5
  active_attr (~> 0.15.4)
6
6
  console-style (~> 0.0.1)
7
7
  hexapdf (~> 0.23.0)
data/README.md CHANGED
@@ -44,12 +44,14 @@ You will need to install `tesseract` with your desired language on your system,
44
44
  # Image Pre-Processing
45
45
  image_preprocess: true,
46
46
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
47
+ automatic_reprocess: true, # May more than double the number of operations, but can automatically produce better results
47
48
  # PDF to Image Processing
48
49
  optimise_pdf: true,
49
50
  extract_pdf_images: true, # if false will screenshot each PDF page
50
51
  temp_filename_prefix: 'image',
51
52
  # Console Output
52
53
  verbose: true,
54
+ timing: true,
53
55
  }
54
56
 
55
57
  doc = OcrFile::Document.new(
@@ -85,6 +87,8 @@ Set `extract_pdf_images` to `false` for higher quality OCR. However this will co
85
87
 
86
88
  Image pre-processing only thresholds (bw), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. Will make the end result Black and White but have far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
87
89
 
90
+ `automatic_reprocess` is much slower as it has to re-do operations per image (in some cases) but will select the best result for each page.
91
+
88
92
  ### Simple CLI
89
93
  Once installed you can use `ocr-file` as a CLI. It currently supports a reduced set of options, which are subject to change in future versions.
90
94
 
@@ -108,7 +112,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
108
112
  ### TODOs
109
113
  - input validation
110
114
  - Better CLI
111
- - image processing
112
115
  - password
113
116
  - Base64 encoding
114
117
  - requirements checking (installed dependencies etc ...)
@@ -117,7 +120,6 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
117
120
  - Improve console output
118
121
  - Fix spaces in file names
119
122
  - Better verbosity
120
- - Timing
121
123
 
122
124
  ### Tests
123
125
  To run tests execute:
@@ -5,6 +5,7 @@ module OcrFile
5
5
 
6
6
  ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
7
7
  PAGE_BREAK = "\n\r\n" # TODO: Make configurable
8
+ EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
8
9
  DEFAULT_CONFIG = {
9
10
  # Images from PDF
10
11
  filetype: 'png',
@@ -23,12 +24,14 @@ module OcrFile
23
24
  # Image Pre-Processing
24
25
  image_preprocess: true,
25
26
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
27
+ automatic_reprocess: true,
26
28
  # PDF to Image Processing
27
29
  optimise_pdf: true,
28
30
  extract_pdf_images: true, # if false will screenshot each PDF page
29
31
  temp_filename_prefix: 'image',
30
32
  # Console Output
31
33
  verbose: true,
34
+ timing: true,
32
35
  }
33
36
 
34
37
  attr_reader :original_file_path,
@@ -36,7 +39,9 @@ module OcrFile
36
39
  :save_file_path,
37
40
  :final_save_file,
38
41
  :config,
39
- :ocr_engine
42
+ :ocr_engine,
43
+ :start_time,
44
+ :end_time
40
45
 
41
46
  # save_file_path will also generate a tmp path for tmp files. Expected folder path
42
47
  # TODO: Add in more input validation
@@ -69,76 +74,50 @@ module OcrFile
69
74
 
70
75
  # Trigger OCR pipeline
71
76
  def to_pdf
72
- if pdf?
73
- create_temp_folder
74
- image_paths = extract_image_paths_from_pdf(@original_file_path)
75
-
76
- pdfs_to_merge = []
77
-
78
- image_paths.each do |image_path|
79
- pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
80
- end
81
-
82
- merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
77
+ @start_time = Time.now
78
+ find_best_image_processing if config[:automatic_reprocess] && !text?
83
79
 
84
- OcrFile::ImageEngines::PdfEngine
85
- .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
80
+ if pdf?
81
+ ocr_pdf_to_searchable_pdf
86
82
  elsif text?
87
- text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
88
- pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
89
-
90
- OcrFile::ImageEngines::PdfEngine
91
- .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
83
+ text_to_pdf
92
84
  else # is an image
93
85
  ocr_image_to_pdf
94
86
  end
95
87
 
96
88
  close
89
+
90
+ @end_time = Time.now
91
+ print_time
97
92
  end
98
93
 
99
94
  def to_text
100
- if pdf?
101
- create_temp_folder
102
- image_paths = extract_image_paths_from_pdf(@original_file_path)
103
-
104
- image_paths.each do |image_path|
105
- text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
106
- ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
107
- end
108
- elsif text?
109
- ::OcrFile::FileHelpers.open_text_file(@original_file_path)
110
- else # is an image
111
- ocr_image_to_text(save: true)
112
- end
95
+ @start_time = Time.now
96
+ return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
113
97
 
98
+ find_best_image_processing(save: true)
114
99
  close
100
+
101
+ @end_time = Time.now
102
+ print_time
115
103
  end
116
104
 
117
105
  def to_s
118
- if pdf?
119
- create_temp_folder
120
- image_paths = extract_image_paths_from_pdf(@original_file_path)
106
+ @start_time = Time.now
107
+ return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
121
108
 
122
- text = ''
109
+ text = find_best_image_processing(save: false)
123
110
 
124
- image_paths.each do |image_path|
125
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
126
- end
111
+ close
127
112
 
128
- close
129
- text
130
- elsif text?
131
- ::OcrFile::FileHelpers.open_text_file(@original_file_path)
132
- else # is an image
133
- text = ocr_image_to_text(save: false)
113
+ @end_time = Time.now
114
+ print_time
134
115
 
135
- close
136
- text
137
- end
116
+ text
138
117
  end
139
118
 
140
119
  def close
141
- ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
120
+ # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
142
121
  end
143
122
 
144
123
  private
@@ -185,13 +164,57 @@ module OcrFile
185
164
  image_processor.convert!
186
165
  end
187
166
 
167
# OCRs a PDF source page by page and writes one merged PDF.
#
# Creates the temp folder, extracts one image path per page from
# @original_file_path, runs each processed image through the OCR engine's
# `ocr_to_pdf`, merges the per-page results and saves them to
# "<@final_save_file>.pdf", honouring @config[:optimise_pdf].
def ocr_pdf_to_searchable_pdf
  create_temp_folder

  page_pdfs = extract_image_paths_from_pdf(@original_file_path).map do |page_image_path|
    @ocr_engine.ocr_to_pdf(process_image(page_image_path), options: @config)
  end

  pdf_engine = OcrFile::ImageEngines::PdfEngine
  combined_pdf = pdf_engine.merge(page_pdfs)

  pdf_engine.save_pdf(combined_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
end
182
+
183
# Converts a plain-text source file into a PDF.
#
# Reads the text at @original_file_path, builds a PDF from it using @config
# and saves it to "<@final_save_file>.pdf", honouring @config[:optimise_pdf].
def text_to_pdf
  source_text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  pdf_engine = OcrFile::ImageEngines::PdfEngine

  pdf_engine.save_pdf(
    pdf_engine.pdf_from_text(source_text, @config),
    "#{@final_save_file}.pdf",
    optimise: @config[:optimise_pdf]
  )
end
190
+
188
191
  def ocr_image_to_pdf
192
+ find_best_image_processing if config[:automatic_reprocess]
193
+
189
194
  pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
190
195
  OcrFile::ImageEngines::PdfEngine
191
196
  .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
192
197
  end
193
198
 
194
- def ocr_image_to_text(save: true)
199
# OCRs every page of a PDF source into one string.
#
# Each page's text is prefixed with PAGE_BREAK, so the combined text starts
# with a page break.
#
# save: when true, appends the combined text plus a trailing PAGE_BREAK to
#       "<@final_save_file>.txt" and returns whatever `append_file` returns;
#       when false, returns the combined text.
def ocr_pdf_to_text(save:)
  create_temp_folder

  page_texts = extract_image_paths_from_pdf(@original_file_path).map do |page_image_path|
    "#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(page_image_path), options: @config)}"
  end
  document_text = page_texts.join

  return document_text unless save

  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{document_text}#{PAGE_BREAK}")
end
215
+
216
+ def ocr_image_to_text(save:)
217
+ create_temp_folder
195
218
  text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
196
219
 
197
220
  if save
@@ -201,6 +224,38 @@ module OcrFile
201
224
  end
202
225
  end
203
226
 
227
# Dispatches text OCR to the PDF or the image pipeline.
#
# save: forwarded unchanged to the chosen pipeline.
#
# Returns whatever the chosen pipeline returns. The PDF call is a real branch
# here: a trailing `&&` after `pdf?` would make it part of the condition and
# leave the branch body empty.
def ocr_file_to_text(save:)
  return ocr_pdf_to_text(save: save) if pdf?

  # Anything that is not a PDF is treated as an image.
  ocr_image_to_text(save: save)
end
234
+
235
# Runs OCR, dropping pre-processing effects one at a time until the result
# looks like real words.
#
# save: when true the final OCR pass is saved by `ocr_file_to_text`; when
#       false the text is returned. Defaults to false so callers that only
#       want config[:effects] tuned can call this without arguments.
#
# When config[:automatic_reprocess] is off, this is a single plain OCR pass
# and returns immediately (no effect search, no second pass).
#
# Otherwise config[:effects] is MUTATED: the first attempt uses the effects as
# configured (the leading '' removes nothing), and each later attempt removes
# one more entry of EFFECTS_TO_REMOVE that is present in the configuration.
# The search stops at the first result accepted by ResultProcessor#valid_words?.
def find_best_image_processing(save: false)
  return ocr_file_to_text(save: save) unless config[:automatic_reprocess]

  text = ''
  # Only effects that are actually configured are worth removing.
  effects_to_test = [''] + (EFFECTS_TO_REMOVE & config[:effects])

  effects_to_test.each do |effect|
    config[:effects] -= [effect]

    text = ocr_file_to_text(save: false)
    break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
  end

  # Saving re-runs OCR with the chosen effects, which is an extra pass.
  save ? ocr_file_to_text(save: save) : text
end
254
+
255
# Prints the elapsed time between start_time and end_time when
# config[:timing] is enabled; prints nothing otherwise.
def print_time
  return unless config[:timing]

  elapsed = end_time - start_time
  puts "Total Time: #{elapsed} secs.\n\n"
end
258
+
204
259
  def find_ocr_engine(engine_id)
205
260
  ocr_engine_constants
206
261
  .map { |c| ocr_module(c) }
@@ -0,0 +1,34 @@
1
module OcrFile
  module TextEngines
    # Scores a piece of OCR output with a simple word-length heuristic so the
    # caller can decide whether OCR should be retried with different options.
    class ResultProcessor
      # Minimum average word length for a result to count as valid.
      MINIMUM_WORD_LENGTH = 3

      # text:       the value passed to the constructor, untouched.
      # clear_text: the same text with all whitespace runs collapsed to single
      #             spaces and the ends trimmed.
      attr_reader :text, :clear_text

      # text: OCR output to score; nil is treated as empty text.
      def initialize(text)
        @text = text
        @clear_text = remove_lines
      end

      # This is a very naive way of determining if we should re-do OCR with
      # shifted options.
      #
      # Returns true when the average word length reaches MINIMUM_WORD_LENGTH.
      # Empty or whitespace-only text is never valid.
      def valid_words?
        word_size_average >= MINIMUM_WORD_LENGTH
      end

      # Number of whitespace-separated words in the text.
      def word_count
        @_word_count ||= words.size
      end

      # Average word length, rounded down (integer division). Returns 0 when
      # there are no words instead of dividing by zero.
      def word_size_average
        @_word_size_average ||= words.empty? ? 0 : words.sum(&:size) / word_count
      end

      private

      # Words of the cleaned text, split once and reused by both metrics.
      def words
        @_words ||= clear_text.split(' ')
      end

      # Collapses every run of whitespace (spaces, tabs, line breaks) into one
      # space so that word boundaries survive for `split`.
      def remove_lines
        text.to_s.gsub(/\s+/, ' ').strip
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
data/lib/ocr-file.rb CHANGED
@@ -10,6 +10,7 @@ require 'ocr-file/image_engines/image_magick'
10
10
  require 'ocr-file/image_engines/pdftoppm'
11
11
  require 'ocr-file/ocr_engines/tesseract'
12
12
  require 'ocr-file/ocr_engines/cloud_vision'
13
+ require 'ocr-file/text_engines/result_processor'
13
14
  require 'ocr-file/file_helpers'
14
15
  require 'ocr-file/document'
15
16
  require 'ocr-file/cli'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
@@ -122,6 +122,7 @@ files:
122
122
  - lib/ocr-file/image_engines/pdftoppm.rb
123
123
  - lib/ocr-file/ocr_engines/cloud_vision.rb
124
124
  - lib/ocr-file/ocr_engines/tesseract.rb
125
+ - lib/ocr-file/text_engines/result_processor.rb
125
126
  - lib/ocr-file/version.rb
126
127
  - ocr-file.gemspec
127
128
  homepage: https://github.com/TRex22/ocr-file