ocr-file 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f188bc0b29f4232b379e5e15d924c57a64a1758f04d8e168d2a44a744d20d1af
4
- data.tar.gz: 5b54d844f01a5a5249572dd0abc270ae1fb37ff0070df9ad47eb84cf5f233fe7
3
+ metadata.gz: 0e67553a31e82eba190368040d3475b812e113aedfb9994484043dda34a55053
4
+ data.tar.gz: 6fe5e142fef4387fc98fce57d3fdb2b7a0c37199d1712bd1d85dced9a0e61274
5
5
  SHA512:
6
- metadata.gz: c51ab724a77e8b22568dc0c7cefcf3ba28407f7050976d6900824954221d4f04e677b31b58ae644c87752e60024e1667194eda8b00c89dfab30f9a81d53ba1d5
7
- data.tar.gz: 9b521be6e75808899398e77cf0c0b9dee842350a5c81c0ba513ad56125725607906c8c19e6b493201750ba331521db4ba247723a1c09d82dfb61e8caec857428
6
+ metadata.gz: e5d06cf54a8bc96c90522ab67530310730230067ee226f6eb1143adde2ccb407dde25aef7b595836478ee944e4e9b3ff306b4df5a08ec14ab6623ab08daefa8b
7
+ data.tar.gz: 45a7c3d06908c878f281db9baf4ec82310ecde20e12cad5ff4cc03d2f271167d46fa52145fe598f594a3360a525c926d955bb08d17e740ba78f97ec72f0f4b47
data/README.md CHANGED
@@ -43,7 +43,7 @@ You will need to install `tesseract` with your desired language on your system,
43
43
  ocr_engine: 'tesseract', # 'cloud-vision'
44
44
  # Image Pre-Processing
45
45
  image_preprocess: true,
46
- effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'], # Applies effects as listed. 'norm' is also available
46
+ effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
47
47
  # PDF to Image Processing
48
48
  optimise_pdf: true,
49
49
  extract_pdf_images: true, # if false will screenshot each PDF page
@@ -83,7 +83,7 @@ You will need to install `tesseract` with your desired language on your system,
83
83
  ### Notes / Tips
84
84
  Set `extract_pdf_images` to `false` for higher quality OCR. However, this will consume more temporary space per PDF page and also be considerably slower.
85
85
 
86
- Image pre-processing only thresholds (bw), normalises the colour space, removes speckles and tries to straighten the image. Will make the end result Black and White but have far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary.
86
+ Image pre-processing only thresholds (bw), normalises the colour space, removes speckles, removes shadows and tries to straighten the image. Will make the end result Black and White but have far more accurate OCR (PDFs). The order of operations is important, but steps can be removed when necessary. Expanding the colour dynamic range with `'norm'` can also be done but isn't recommended.
87
87
 
88
88
  ### Simple CLI
89
89
  Once installed, you can use `ocr-file` as a CLI. It's currently a reduced set of options. These are subject to change in future versions.
@@ -22,7 +22,7 @@ module OcrFile
22
22
  ocr_engine: 'tesseract', # 'cloud-vision'
23
23
  # Image Pre-Processing
24
24
  image_preprocess: true,
25
- effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'],
25
+ effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
26
26
  # PDF to Image Processing
27
27
  optimise_pdf: true,
28
28
  extract_pdf_images: true, # if false will screenshot each PDF page
@@ -61,6 +61,13 @@ module OcrFile
61
61
  @image.sharpen('0x4') # radiusXsigma
62
62
  end
63
63
 
64
+ # https://github.com/ImageMagick/ImageMagick/discussions/4145
65
+ def remove_shadow
66
+ @image.negate
67
+ @image.lat("20x20+10\%")
68
+ @image.negate
69
+ end
70
+
64
71
  def deskew
65
72
  @image.deskew('40%') # threshold recommended in the docs
66
73
  end
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22