ocr-file 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6b558ce36c35e74b410f42928eae1a987485d1bbd64da77750574062bc05b91e
4
- data.tar.gz: d906c620a02c5a2d139b3d89d05e9b3872ee6c929b4aa661b20f9033d8f3605f
3
+ metadata.gz: f188bc0b29f4232b379e5e15d924c57a64a1758f04d8e168d2a44a744d20d1af
4
+ data.tar.gz: 5b54d844f01a5a5249572dd0abc270ae1fb37ff0070df9ad47eb84cf5f233fe7
5
5
  SHA512:
6
- metadata.gz: 81049908609ba3d622be2b6f99dabeca2960a455fa3d56ee1fca4c177c2ee4365281421c1128ec3fa5476d068daa53b3d7f7600c5fd1c31fcb5834ca688f9747
7
- data.tar.gz: 1a7dcd56a7196694371abf70633635545138bdc7bc0af2873fc5e7c22bdbfc97e9986ba1a18afc288b24e488caf999b644ec1f9d8889ce6e5efa6fcfe776c204
6
+ metadata.gz: c51ab724a77e8b22568dc0c7cefcf3ba28407f7050976d6900824954221d4f04e677b31b58ae644c87752e60024e1667194eda8b00c89dfab30f9a81d53ba1d5
7
+ data.tar.gz: 9b521be6e75808899398e77cf0c0b9dee842350a5c81c0ba513ad56125725607906c8c19e6b493201750ba331521db4ba247723a1c09d82dfb61e8caec857428
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ocr-file (0.0.1)
4
+ ocr-file (0.0.2)
5
5
  active_attr (~> 0.15.4)
6
6
  console-style (~> 0.0.1)
7
7
  hexapdf (~> 0.23.0)
data/README.md CHANGED
@@ -42,9 +42,8 @@ You will need to install `tesseract` with your desired language on your system,
42
42
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
43
43
  ocr_engine: 'tesseract', # 'cloud-vision'
44
44
  # Image Pre-Processing
45
- image_pre_preprocess: true,
46
- effects: ['bw', 'norm'],
47
- threshold: 0.25,
45
+ image_preprocess: true,
46
+ effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'], # Applies effects as listed. 'norm' is also available
48
47
  # PDF to Image Processing
49
48
  optimise_pdf: true,
50
49
  extract_pdf_images: true, # if false will screenshot each PDF page
@@ -84,7 +83,21 @@ You will need to install `tesseract` with your desired language on your system,
84
83
  ### Notes / Tips
85
84
  Set `extract_pdf_images` to `false` for higher quality OCR. However this will consume more temporary space per PDF page and also be considerably slower.
86
85
 
87
- Image pre-processing is not yet implemented.
86
+ Image pre-processing is currently limited to thresholding (bw), normalising the colour space, removing speckles and trying to straighten the image. It makes the end result black and white, but gives far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary.
87
+
88
+ ### Simple CLI
89
+ Once installed, you can use `ocr-file` as a CLI. It currently supports a reduced set of options, which are subject to change in future versions.
90
+
91
+ ```
92
+ # Basic Usage with console output
93
+ ocr-file input_file_path output_folder_path
94
+
95
+ # Output to PDF
96
+ ocr-file input_file_path output_folder_path pdf
97
+
98
+ # Output to TXT
99
+ ocr-file input_file_path output_folder_path txt
100
+ ```
88
101
 
89
102
  ## Development
90
103
 
@@ -94,7 +107,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
94
107
 
95
108
  ### TODOs
96
109
  - input validation
97
- - CLI
110
+ - Better CLI
98
111
  - image processing
99
112
  - password
100
113
  - Base64 encoding
@@ -102,6 +115,9 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
102
115
  - Tests
103
116
  - Configurable temp folder cleanup
104
117
  - Improve console output
118
+ - Fix spaces in file names
119
+ - Better verbosity
120
+ - Timing
105
121
 
106
122
  ### Tests
107
123
  To run tests execute:
data/bin/ocr-file CHANGED
@@ -2,4 +2,4 @@
2
2
 
3
3
  require 'ocr-file'
4
4
 
5
- puts "Hello, world!"
5
+ OcrFile::Cli.new(ARGV).call
data/lib/ocr-file/cli.rb CHANGED
@@ -1,5 +1,41 @@
1
1
  module OcrFile
2
- module Cli
2
+ class Cli
3
+ attr_reader :args
3
4
 
5
+ def initialize(args)
6
+ @args = args
7
+ end
8
+
9
+ def valid?
10
+ return true if args.size == 2 || args.size == 3
11
+ false
12
+ end
13
+
14
+ def invalid?
15
+ !valid?
16
+ end
17
+
18
+ def call
19
+ # TODO: Use ConsoleStyle::Functions
20
+ # TODO: Heading and better CLI interface
21
+ # Simple cli for now
22
+ puts "OCR Tool © Jason Chalom 2022, Version: #{OcrFile::VERSION}"
23
+ abort "File path, Save Folder Paths, and output type (pdf, txt) are required!" if invalid?
24
+
25
+ # Using default config for now
26
+ original_file_path = args[0]
27
+ save_file_path = args[1]
28
+ output_type = args[2]
29
+
30
+ document = OcrFile::Document.new(original_file_path: original_file_path, save_file_path: save_file_path)
31
+
32
+ if output_type.to_s.downcase.include?('pdf')
33
+ document.to_pdf
34
+ elsif output_type.to_s.downcase.include?('txt') || output_type.to_s.downcase.include?('text')
35
+ document.to_text
36
+ else # Display in console
37
+ puts document.to_s
38
+ end
39
+ end
4
40
  end
5
41
  end
@@ -1,5 +1,8 @@
1
1
  module OcrFile
2
2
  class Document
3
+ # TODO: Skewness / text orientation detection
4
+ # TODO: Better handwriting analysis
5
+
3
6
  ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
4
7
  PAGE_BREAK = "\n\r\n" # TODO: Make configurable
5
8
  DEFAULT_CONFIG = {
@@ -18,9 +21,8 @@ module OcrFile
18
21
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
19
22
  ocr_engine: 'tesseract', # 'cloud-vision'
20
23
  # Image Pre-Processing
21
- image_pre_preprocess: true,
22
- effects: ['bw', 'norm'],
23
- threshold: 0.25,
24
+ image_preprocess: true,
25
+ effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'],
24
26
  # PDF to Image Processing
25
27
  optimise_pdf: true,
26
28
  extract_pdf_images: true, # if false will screenshot each PDF page
@@ -52,12 +54,12 @@ module OcrFile
52
54
  end
53
55
 
54
56
  def pdf?
55
- @original_file_path.include?('.pdf')
57
+ @original_file_path.downcase.include?('.pdf')
56
58
  end
57
59
 
58
60
  def image?
59
61
  return false if pdf?
60
- ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.include?(".#{type}")}
62
+ ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
61
63
  end
62
64
 
63
65
  # Treat anything which isn't a PDF or image as text
@@ -65,6 +67,7 @@ module OcrFile
65
67
  !pdf? && !image?
66
68
  end
67
69
 
70
+ # Trigger OCR pipeline
68
71
  def to_pdf
69
72
  if pdf?
70
73
  create_temp_folder
@@ -73,15 +76,13 @@ module OcrFile
73
76
  pdfs_to_merge = []
74
77
 
75
78
  image_paths.each do |image_path|
76
- pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config)
79
+ pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
77
80
  end
78
81
 
79
82
  merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
80
83
 
81
84
  OcrFile::ImageEngines::PdfEngine
82
85
  .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
83
-
84
- close
85
86
  elsif text?
86
87
  text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
87
88
  pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
@@ -91,6 +92,8 @@ module OcrFile
91
92
  else # is an image
92
93
  ocr_image_to_pdf
93
94
  end
95
+
96
+ close
94
97
  end
95
98
 
96
99
  def to_text
@@ -99,16 +102,16 @@ module OcrFile
99
102
  image_paths = extract_image_paths_from_pdf(@original_file_path)
100
103
 
101
104
  image_paths.each do |image_path|
102
- text = @ocr_engine.ocr_to_text(image_path, options: @config)
105
+ text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
103
106
  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
104
107
  end
105
-
106
- close
107
108
  elsif text?
108
109
  ::OcrFile::FileHelpers.open_text_file(@original_file_path)
109
110
  else # is an image
110
111
  ocr_image_to_text(save: true)
111
112
  end
113
+
114
+ close
112
115
  end
113
116
 
114
117
  def to_s
@@ -119,7 +122,7 @@ module OcrFile
119
122
  text = ''
120
123
 
121
124
  image_paths.each do |image_path|
122
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
125
+ text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
123
126
  end
124
127
 
125
128
  close
@@ -127,7 +130,10 @@ module OcrFile
127
130
  elsif text?
128
131
  ::OcrFile::FileHelpers.open_text_file(@original_file_path)
129
132
  else # is an image
130
- ocr_image_to_text(save: false)
133
+ text = ocr_image_to_text(save: false)
134
+
135
+ close
136
+ text
131
137
  end
132
138
  end
133
139
 
@@ -157,19 +163,36 @@ module OcrFile
157
163
  end
158
164
 
159
165
  def create_temp_folder
160
- # TODO: Make this a bit more robust
161
- @temp_folder_path = "#{save_file_path}/temp/".gsub(' ', '\ ')
166
+ date = Time.now.to_s.split(' ').first
167
+
168
+ @temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ')
162
169
  ::OcrFile::FileHelpers.make_directory(@temp_folder_path)
163
170
  end
164
171
 
172
+ def process_image(path)
173
+ return path unless @config[:image_preprocess]
174
+
175
+ create_temp_folder
176
+ save_file_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}"
177
+
178
+ image_processor = OcrFile::ImageEngines::ImageMagick.new(
179
+ image_path: path,
180
+ temp_path: @temp_folder_path,
181
+ save_file_path: save_file_path,
182
+ config: @config
183
+ )
184
+
185
+ image_processor.convert!
186
+ end
187
+
165
188
  def ocr_image_to_pdf
166
- pdf_document = @ocr_engine.ocr_to_pdf(@original_file_path, options: @config)
189
+ pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
167
190
  OcrFile::ImageEngines::PdfEngine
168
191
  .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
169
192
  end
170
193
 
171
194
  def ocr_image_to_text(save: true)
172
- text = @ocr_engine.ocr_to_text(@original_file_path, options: @config)
195
+ text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
173
196
 
174
197
  if save
175
198
  ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
@@ -1,14 +1,73 @@
1
1
  module OcrFile
2
2
  module ImageEngines
3
- module ImageMagick
4
- extend self
5
-
3
+ class ImageMagick
6
4
  # TODO:
7
- # B/W
8
- # Contrast
9
- # Image Norm
10
- # Threshold
11
5
  # Conversion of image types
6
+ # Rotation and detection of skew
7
+
8
+ attr_reader :image_path, :image, :temp_path, :save_file_path, :config
9
+
10
+ def initialize(image_path:, temp_path:, save_file_path:, config:)
11
+ @image_path = image_path
12
+ @config = config
13
+ @save_file_path = save_file_path
14
+
15
+ @temp_path = temp_path
16
+
17
+ # Will be available in the next version of MiniMagick > 4.11.0
18
+ # https://github.com/minimagick/minimagick/pull/541
19
+ # MiniMagick.configure do |config|
20
+ # # cli_version graphicsmagick? imagemagick7? imagemagick? version
21
+ # config.tmpdir = File.join(Dir.tmpdir, @temp_path)
22
+ # end
23
+
24
+ @image = MiniMagick::Image.open(image_path)
25
+ end
26
+
27
+ def convert!
28
+ return @image_path unless @config[:image_preprocess]
29
+
30
+ @config[:effects].each do |effect|
31
+ self.send(effect.to_sym)
32
+ end
33
+
34
+ save!
35
+ end
36
+
37
+ def save!
38
+ image.write(@save_file_path)
39
+ @save_file_path
40
+ end
41
+
42
+ # Effects
43
+ # http://www.imagemagick.org/script/command-line-options.php
44
+ def bw
45
+ @image.alpha('off')
46
+ @image.auto_threshold("otsu")
47
+ end
48
+
49
+ def enhance
50
+ @image.enhance
51
+ end
52
+
53
+ def norm
54
+ @image.equalize
55
+ end
56
+
57
+ # Most likely not going to be configurable because
58
+ # these are aggressive parameters used to optimise OCR results
59
+ # and not the final results of the PDFs
60
+ def sharpen
61
+ @image.sharpen('0x4') # radiusXsigma
62
+ end
63
+
64
+ def deskew
65
+ @image.deskew('40%') # threshold recommended in the docs
66
+ end
67
+
68
+ def despeckle
69
+ @image.despeckle
70
+ end
12
71
  end
13
72
  end
14
73
  end
@@ -13,7 +13,7 @@ module OcrFile
13
13
  print 'Generating screenshots of each PDF page ... '
14
14
 
15
15
  if filetype == 'jpg'
16
- `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
16
+ `pdftoppm -jpeg -jpegopt quality=#{quality} -r #{dpi} "#{pdf_path}" "#{save_path}/#{filename}"`
17
17
  else
18
18
  `pdftoppm -#{filetype} -r #{dpi} #{pdf_path} #{save_path}/#{filename}`
19
19
  end
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-19 00:00:00.000000000 Z
11
+ date: 2022-06-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: console-style