ocr-file 0.0.7 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +5 -3
- data/lib/ocr-file/document.rb +2 -1
- data/lib/ocr-file/image_engines/image_magick.rb +12 -1
- data/lib/ocr-file/image_engines/pdf_engine.rb +20 -3
- data/lib/ocr-file/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e77eefa085a14282b42584bd4bf6796a99e10589552b52858cb8f5dd75c84b97
|
4
|
+
data.tar.gz: 9fde9adae0c252ecc56937a314c676903a6b2a6ababe51030cbcd9ab3ee1ba81
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f6c9cf596d6a78ccea7e1fb45543826e1b95dab74449700eb0f0bed4bace802fbc15fba118a982fbf7daed9ebb188e417876f1e6a9cb6f620eb3630a4aaed7af
|
7
|
+
data.tar.gz: 9aab569d476170d8c7b405f65a1629d8ec789f008f734a7cc7f49c5716c89c7a7b5010ae3096ed7e4b3358fe650c221da658801a893cd76f6183f23b19696349
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -45,6 +45,7 @@ You will need to install `tesseract` with your desired language on your system,
|
|
45
45
|
image_preprocess: true,
|
46
46
|
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
|
47
47
|
automatic_reprocess: true, # Will possibly do double + the operations but can produce better results automatically
|
48
|
+
dimensions: [width, height], # Optional (can be nil); when set, images are locked to these dimensions
|
48
49
|
# PDF to Image Processing
|
49
50
|
optimise_pdf: true,
|
50
51
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
@@ -79,9 +80,10 @@ You will need to install `tesseract` with your desired language on your system,
|
|
79
80
|
|
80
81
|
# How to merge files into a single PDF:
|
81
82
|
# The files can be images or other PDFs
|
82
|
-
|
83
|
-
|
84
|
-
|
83
|
+
file_paths = []
|
84
|
+
merged_document = ::HexaPDF::Document.new
|
85
|
+
dimensions = [width, height] # or nil to maintain dimensions
|
86
|
+
documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.insert_image(merged_document, path, dimensions: dimensions) }
|
85
87
|
OcrFile::ImageEngines::PdfEngine.save_pdf(merged_document, save_file_path, optimise: true)
|
86
88
|
```
|
87
89
|
|
data/lib/ocr-file/document.rb
CHANGED
@@ -25,6 +25,7 @@ module OcrFile
|
|
25
25
|
image_preprocess: true,
|
26
26
|
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
|
27
27
|
automatic_reprocess: true,
|
28
|
+
dimensions: nil, # width, height. Will lock images to these dimensions
|
28
29
|
# PDF to Image Processing
|
29
30
|
optimise_pdf: true,
|
30
31
|
extract_pdf_images: true, # if false will screenshot each PDF page
|
@@ -66,7 +67,7 @@ module OcrFile
|
|
66
67
|
|
67
68
|
def image?
|
68
69
|
return false if pdf?
|
69
|
-
ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
|
70
|
+
ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}") }
|
70
71
|
end
|
71
72
|
|
72
73
|
# Treat anything which isn't a PDF or image as text
|
@@ -5,7 +5,7 @@ module OcrFile
|
|
5
5
|
# Conversion of image types
|
6
6
|
# Rotation and detection of skew
|
7
7
|
|
8
|
-
attr_reader :image_path, :image, :temp_path, :save_file_path, :config
|
8
|
+
attr_reader :image_path, :image, :temp_path, :save_file_path, :config, :width, :height
|
9
9
|
|
10
10
|
def initialize(image_path:, temp_path:, save_file_path:, config:)
|
11
11
|
@image_path = image_path
|
@@ -22,11 +22,18 @@ module OcrFile
|
|
22
22
|
# end
|
23
23
|
|
24
24
|
@image = MiniMagick::Image.open(image_path)
|
25
|
+
|
26
|
+
@width = @image[:width]
|
27
|
+
@height = @image[:height]
|
25
28
|
end
|
26
29
|
|
27
30
|
def convert!
|
28
31
|
return @image_path unless @config[:image_preprocess]
|
29
32
|
|
33
|
+
if @config[:dimensions].is_a?(Array) && @config[:dimensions].size == 2
|
34
|
+
resize(width, height)
|
35
|
+
end
|
36
|
+
|
30
37
|
@config[:effects].each do |effect|
|
31
38
|
self.send(effect.to_sym)
|
32
39
|
end
|
@@ -39,6 +46,10 @@ module OcrFile
|
|
39
46
|
@save_file_path
|
40
47
|
end
|
41
48
|
|
49
|
+
def resize(width, height)
|
50
|
+
@image.resize("#{width}x#{height}")
|
51
|
+
end
|
52
|
+
|
42
53
|
# Effects
|
43
54
|
# http://www.imagemagick.org/script/command-line-options.php
|
44
55
|
def bw
|
@@ -61,9 +61,26 @@ module OcrFile
|
|
61
61
|
image_paths
|
62
62
|
end
|
63
63
|
|
64
|
-
def insert_image(document, image_path)
|
65
|
-
|
66
|
-
|
64
|
+
def insert_image(document, image_path, dimensions: nil)
|
65
|
+
image_processor = OcrFile::ImageEngines::ImageMagick.new(
|
66
|
+
image_path: image_path,
|
67
|
+
temp_path: @temp_folder_path,
|
68
|
+
save_file_path: '',
|
69
|
+
config: @config
|
70
|
+
)
|
71
|
+
|
72
|
+
if dimensions
|
73
|
+
width = dimensions[0]
|
74
|
+
height = dimensions[1]
|
75
|
+
else
|
76
|
+
width = image_processor.width
|
77
|
+
height = image_processor.height
|
78
|
+
end
|
79
|
+
|
80
|
+
page = document.pages.add([0, 0, width, height])
|
81
|
+
page.canvas.image(@image || image_path, at: [0, 0], width: width, height: height)
|
82
|
+
|
83
|
+
document
|
67
84
|
end
|
68
85
|
|
69
86
|
def combine(text, pdf_of_images)
|
data/lib/ocr-file/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ocr-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-07-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: console-style
|