mindee 3.13.0 → 3.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/docs/bill_of_lading_v1.md +50 -1
- data/docs/energy_bill_fra_v1.md +61 -1
- data/docs/expense_receipts_v5.md +4 -4
- data/docs/financial_document_v1.md +14 -0
- data/docs/invoices_v4.md +16 -2
- data/docs/nutrition_facts_v1.md +80 -1
- data/docs/payslip_fra_v2.md +77 -1
- data/docs/us_mail_v2.md +1 -1
- data/examples/auto_invoice_splitter_extraction.rb +36 -31
- data/examples/auto_multi_receipts_detector_extraction.rb +31 -0
- data/lib/mindee/client.rb +1 -0
- data/lib/mindee/extraction/common/extracted_image.rb +1 -2
- data/lib/mindee/extraction/common/image_extractor.rb +147 -159
- data/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb +22 -16
- data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +3 -1
- data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +1 -0
- data/lib/mindee/geometry/point.rb +2 -1
- data/lib/mindee/image/image_compressor.rb +29 -0
- data/lib/mindee/image/image_utils.rb +104 -0
- data/lib/mindee/image.rb +4 -0
- data/lib/mindee/input/sources.rb +36 -0
- data/lib/mindee/parsing/standard/date_field.rb +4 -0
- data/lib/mindee/parsing/standard/position_field.rb +3 -0
- data/lib/mindee/pdf/pdf_compressor.rb +117 -0
- data/lib/mindee/pdf/{pdf_processing.rb → pdf_processor.rb} +17 -0
- data/lib/mindee/pdf/pdf_tools.rb +100 -0
- data/lib/mindee/pdf.rb +3 -1
- data/lib/mindee/product/financial_document/financial_document_v1_document.rb +11 -1
- data/lib/mindee/product/financial_document/financial_document_v1_page.rb +1 -1
- data/lib/mindee/product/invoice/invoice_v4_document.rb +11 -1
- data/lib/mindee/product/invoice/invoice_v4_page.rb +1 -1
- data/lib/mindee/version.rb +1 -1
- data/lib/mindee.rb +10 -0
- data/mindee.gemspec +2 -1
- metadata +32 -7
@@ -9,180 +9,168 @@ require_relative 'extracted_image'
|
|
9
9
|
|
10
10
|
module Mindee
|
11
11
|
# Image Extraction Module.
|
12
|
-
module
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
#
|
32
|
-
# @param [Mindee::Input::Source::LocalInputSource] input_source
|
33
|
-
# @param [Integer] page_id ID of the Page to extract from.
|
34
|
-
# @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
|
35
|
-
# to extract.
|
36
|
-
# @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
|
37
|
-
def extract_multiple_images_from_source(input_source, page_id, polygons)
|
38
|
-
new_stream = load_doc(input_source, page_id)
|
39
|
-
new_stream.seek(0)
|
40
|
-
|
41
|
-
extract_images_from_polygons(input_source, new_stream, page_id, polygons)
|
42
|
-
end
|
12
|
+
module Extraction
|
13
|
+
# Image Extraction wrapper class.
|
14
|
+
module ImageExtractor
|
15
|
+
def self.attach_image_as_new_file(input_buffer, format: 'jpg')
|
16
|
+
# Attaches an image as a new page in a PdfDocument object.
|
17
|
+
#
|
18
|
+
# @param [StringIO] input_buffer Input buffer. Only supports JPEG.
|
19
|
+
# @return [Origami::PDF] A PdfDocument handle.
|
20
|
+
|
21
|
+
magick_image = MiniMagick::Image.read(input_buffer)
|
22
|
+
# NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
|
23
|
+
# converted.
|
24
|
+
magick_image.format(format)
|
25
|
+
original_density = magick_image.resolution
|
26
|
+
scale_factor = original_density[0].to_f / 4.166666 # No clue why the resolution needs to be reduced for
|
27
|
+
# the pdf otherwise the resulting image shrinks.
|
28
|
+
magick_image.format('pdf', 0, { density: scale_factor.to_s })
|
29
|
+
Origami::PDF.read(StringIO.new(magick_image.to_blob))
|
30
|
+
end
|
43
31
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
32
|
+
# Extracts multiple images from a given local input source.
|
33
|
+
#
|
34
|
+
# @param [Mindee::Input::Source::LocalInputSource] input_source
|
35
|
+
# @param [Integer] page_id ID of the Page to extract from.
|
36
|
+
# @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
|
37
|
+
# to extract.
|
38
|
+
# @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
|
39
|
+
def self.extract_multiple_images_from_source(input_source, page_id, polygons)
|
40
|
+
new_stream = load_input_source_pdf_page_as_image(input_source, page_id)
|
41
|
+
new_stream.seek(0)
|
42
|
+
|
43
|
+
extract_images_from_polygons(input_source, new_stream, page_id, polygons)
|
44
|
+
end
|
51
45
|
|
52
|
-
|
53
|
-
|
54
|
-
|
46
|
+
# Extracts images from their positions on a file (as polygons).
|
47
|
+
#
|
48
|
+
# @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
|
49
|
+
# @param [StringIO] pdf_stream Buffer of the PDF.
|
50
|
+
# @param [Integer] page_id Page ID.
|
51
|
+
# @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
|
52
|
+
# @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
|
53
|
+
def self.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
|
54
|
+
extracted_elements = []
|
55
|
+
|
56
|
+
polygons.each_with_index do |polygon, element_id|
|
57
|
+
polygon = normalize_polygon(polygon)
|
58
|
+
page_content = read_page_content(pdf_stream)
|
59
|
+
|
60
|
+
min_max_x = Geometry.get_min_max_x([
|
61
|
+
polygon.top_left,
|
62
|
+
polygon.bottom_right,
|
63
|
+
polygon.top_right,
|
64
|
+
polygon.bottom_left,
|
65
|
+
])
|
66
|
+
min_max_y = Geometry.get_min_max_y([
|
67
|
+
polygon.top_left,
|
68
|
+
polygon.bottom_right,
|
69
|
+
polygon.top_right,
|
70
|
+
polygon.bottom_left,
|
71
|
+
])
|
72
|
+
file_extension = determine_file_extension(input_source)
|
73
|
+
cropped_image = crop_image(page_content, min_max_x, min_max_y)
|
74
|
+
if file_extension == 'pdf'
|
75
|
+
cropped_image.format('jpg')
|
76
|
+
else
|
77
|
+
cropped_image.format(file_extension)
|
78
|
+
end
|
79
|
+
|
80
|
+
buffer = StringIO.new
|
81
|
+
write_image_to_buffer(cropped_image, buffer)
|
82
|
+
file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"
|
83
|
+
|
84
|
+
extracted_elements << create_extracted_image(buffer, file_name, page_id, element_id)
|
85
|
+
end
|
55
86
|
|
56
|
-
|
57
|
-
|
87
|
+
extracted_elements
|
88
|
+
end
|
58
89
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
# @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
|
66
|
-
def extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
|
67
|
-
extracted_elements = []
|
68
|
-
|
69
|
-
polygons.each_with_index do |polygon, element_id|
|
70
|
-
polygon = normalize_polygon(polygon)
|
71
|
-
page_content = read_page_content(pdf_stream)
|
72
|
-
|
73
|
-
min_max_x = Geometry.get_min_max_x([
|
74
|
-
polygon.top_left,
|
75
|
-
polygon.bottom_right,
|
76
|
-
polygon.top_right,
|
77
|
-
polygon.bottom_left,
|
78
|
-
])
|
79
|
-
min_max_y = Geometry.get_min_max_y([
|
80
|
-
polygon.top_left,
|
81
|
-
polygon.bottom_right,
|
82
|
-
polygon.top_right,
|
83
|
-
polygon.bottom_left,
|
84
|
-
])
|
85
|
-
file_extension = determine_file_extension(input_source)
|
86
|
-
cropped_image = crop_image(page_content, min_max_x, min_max_y)
|
87
|
-
if file_extension == 'pdf'
|
88
|
-
cropped_image.format('jpg')
|
90
|
+
# Retrieves the bounding box of a polygon.
|
91
|
+
#
|
92
|
+
# @param [Array<Point>, Mindee::Geometry::Polygon] polygon
|
93
|
+
def self.normalize_polygon(polygon)
|
94
|
+
if polygon.is_a?(Mindee::Geometry::Polygon)
|
95
|
+
Mindee::Geometry.get_bounding_box(polygon)
|
89
96
|
else
|
90
|
-
|
97
|
+
polygon
|
91
98
|
end
|
92
|
-
|
93
|
-
buffer = StringIO.new
|
94
|
-
write_image_to_buffer(cropped_image, buffer)
|
95
|
-
file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"
|
96
|
-
|
97
|
-
extracted_elements << create_extracted_image(buffer, file_name, page_id, element_id)
|
98
99
|
end
|
99
100
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
if polygon.is_a?(Mindee::Geometry::Polygon)
|
108
|
-
Mindee::Geometry.get_bounding_box(polygon)
|
109
|
-
else
|
110
|
-
polygon
|
101
|
+
# Loads a buffer into a MiniMagick Image.
|
102
|
+
#
|
103
|
+
# @param [StringIO] pdf_stream Buffer containing the PDF
|
104
|
+
# @return [MiniMagick::Image] a valid MiniMagick image handle.
|
105
|
+
def self.read_page_content(pdf_stream)
|
106
|
+
pdf_stream.rewind
|
107
|
+
MiniMagick::Image.read(pdf_stream)
|
111
108
|
end
|
112
|
-
end
|
113
109
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
height = image[:height].to_i
|
131
|
-
|
132
|
-
image.format('jpg')
|
133
|
-
new_width = (min_max_x.max - min_max_x.min) * width
|
134
|
-
new_height = (min_max_y.max - min_max_y.min) * height
|
135
|
-
image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}")
|
136
|
-
|
137
|
-
image
|
138
|
-
end
|
110
|
+
# Crops a MiniMagick Image from the given bounding box.
|
111
|
+
#
|
112
|
+
# @param [MiniMagick::Image] image Input Image.
|
113
|
+
# @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
|
114
|
+
# @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
|
115
|
+
def self.crop_image(image, min_max_x, min_max_y)
|
116
|
+
width = image[:width].to_i
|
117
|
+
height = image[:height].to_i
|
118
|
+
|
119
|
+
image.format('jpg')
|
120
|
+
new_width = (min_max_x.max - min_max_x.min) * width
|
121
|
+
new_height = (min_max_y.max - min_max_y.min) * height
|
122
|
+
image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}")
|
123
|
+
|
124
|
+
image
|
125
|
+
end
|
139
126
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
127
|
+
# Writes a MiniMagick::Image to a buffer.
|
128
|
+
#
|
129
|
+
# @param [MiniMagick::Image] image a valid MiniMagick image.
|
130
|
+
# @param [StringIO] buffer
|
131
|
+
def self.write_image_to_buffer(image, buffer)
|
132
|
+
image.write(buffer)
|
133
|
+
end
|
147
134
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
135
|
+
# Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.
|
136
|
+
#
|
137
|
+
# @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
|
138
|
+
# @return [String] A valid file extension.
|
139
|
+
def self.determine_file_extension(input_source)
|
140
|
+
if input_source.pdf? || input_source.filename.downcase.end_with?('pdf')
|
141
|
+
'jpg'
|
142
|
+
else
|
143
|
+
File.extname(input_source.filename).strip.downcase[1..]
|
144
|
+
end
|
157
145
|
end
|
158
|
-
end
|
159
146
|
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
147
|
+
# Generates an ExtractedImage.
|
148
|
+
#
|
149
|
+
# @param [StringIO] buffer Buffer containing the image.
|
150
|
+
# @param [String] file_name Name for the file.
|
151
|
+
# @param [Object] page_id ID of the page the file was generated from.
|
152
|
+
# @param [Object] element_id ID of the element of a given page.
|
153
|
+
def self.create_extracted_image(buffer, file_name, page_id, element_id)
|
154
|
+
buffer.rewind
|
155
|
+
ExtractedImage.new(
|
156
|
+
Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name),
|
157
|
+
page_id,
|
158
|
+
element_id
|
159
|
+
)
|
160
|
+
end
|
174
161
|
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
162
|
+
# Loads a single page from an image file or a pdf document.
|
163
|
+
#
|
164
|
+
# @param input_file [LocalInputSource] Local input.
|
165
|
+
# @param [Integer] page_id Page ID.
|
166
|
+
# @return [MiniMagick::Image] A valid PdfDocument handle.
|
167
|
+
def self.load_input_source_pdf_page_as_image(input_file, page_id)
|
168
|
+
input_file.io_stream.rewind
|
169
|
+
if input_file.pdf?
|
170
|
+
Mindee::PDF::PdfProcessor.get_page(Origami::PDF.read(input_file.io_stream), page_id)
|
171
|
+
else
|
172
|
+
input_file.io_stream
|
173
|
+
end
|
186
174
|
end
|
187
175
|
end
|
188
176
|
end
|
@@ -1,26 +1,32 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require_relative '../common/image_extractor'
|
4
|
+
|
3
5
|
module Mindee
|
4
6
|
# Image Extraction Module.
|
5
|
-
module
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
7
|
+
module Extraction
|
8
|
+
# Multi-receipts extraction class wrapper.
|
9
|
+
class MultiReceiptsExtractor
|
10
|
+
def self.extract_receipts(input_source, inference)
|
11
|
+
# Extracts individual receipts from multi-receipts documents.
|
12
|
+
#
|
13
|
+
# @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
|
14
|
+
# @param inference [Inference] Results of the inference.
|
15
|
+
# @return [Array<ExtractedImage>] Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
|
12
16
|
|
13
|
-
|
14
|
-
|
17
|
+
images = []
|
18
|
+
raise 'No possible receipts candidates found for MultiReceipts extraction.' unless inference.prediction.receipts
|
15
19
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
20
|
+
(0...input_source.count_pdf_pages).each do |page_id|
|
21
|
+
receipt_positions = inference.pages[page_id].prediction.receipts.map(&:bounding_box)
|
22
|
+
images.concat(
|
23
|
+
Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, page_id + 1,
|
24
|
+
receipt_positions)
|
25
|
+
)
|
26
|
+
end
|
22
27
|
|
23
|
-
|
28
|
+
images
|
29
|
+
end
|
24
30
|
end
|
25
31
|
end
|
26
32
|
end
|
@@ -13,7 +13,7 @@ module Mindee
|
|
13
13
|
if local_input.pdf?
|
14
14
|
@source_pdf = local_input.io_stream
|
15
15
|
else
|
16
|
-
pdf_image =
|
16
|
+
pdf_image = Extraction::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
|
17
17
|
io_buffer = StringIO.new
|
18
18
|
pdf_image.save(io_buffer)
|
19
19
|
|
@@ -65,6 +65,7 @@ module Mindee
|
|
65
65
|
|
66
66
|
# rubocop:disable Metrics/CyclomaticComplexity
|
67
67
|
# rubocop:disable Metrics/PerceivedComplexity
|
68
|
+
|
68
69
|
# Extracts invoices as complete PDFs from the document.
|
69
70
|
# @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
|
70
71
|
# @param strict [Boolean]
|
@@ -99,6 +100,7 @@ module Mindee
|
|
99
100
|
end
|
100
101
|
extract_sub_documents(correct_page_indexes)
|
101
102
|
end
|
103
|
+
|
102
104
|
# rubocop:enable Metrics/CyclomaticComplexity
|
103
105
|
# rubocop:enable Metrics/PerceivedComplexity
|
104
106
|
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
# Image processing module.
|
5
|
+
module Image
|
6
|
+
# Image compressor module to handle image compression.
|
7
|
+
module ImageCompressor
|
8
|
+
# Resize and/or compress an image. This assumes the ratio was provided beforehand.
|
9
|
+
# @param image [MiniMagick::Image, StringIO] Input image.
|
10
|
+
# @param quality [Integer, nil] Quality of the final file.
|
11
|
+
# @param max_width [Integer, nil] Maximum width. If not specified, the horizontal ratio will remain the same.
|
12
|
+
# @param max_height [Integer] Maximum height. If not specified, the vertical ratio will remain the same.
|
13
|
+
# @return [StringIO]
|
14
|
+
def self.compress_image(image, quality: 85, max_width: nil, max_height: nil)
|
15
|
+
processed_image = ImageUtils.to_image(image)
|
16
|
+
processed_image.format 'jpg'
|
17
|
+
final_width, final_height = ImageUtils.calculate_new_dimensions(
|
18
|
+
processed_image,
|
19
|
+
max_width: max_width,
|
20
|
+
max_height: max_height
|
21
|
+
)
|
22
|
+
ImageUtils.resize_image(processed_image, final_width, final_height) if final_width || final_height
|
23
|
+
ImageUtils.compress_image_quality(processed_image, quality)
|
24
|
+
|
25
|
+
ImageUtils.image_to_stringio(processed_image)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
# Image processing module.
|
5
|
+
module Image
|
6
|
+
# Miscellaneous image operations.
|
7
|
+
module ImageUtils
|
8
|
+
# Resizes a provided MiniMagick Image with the given width & height, if present.
|
9
|
+
# @param image [MiniMagick::Image] MiniMagick image handle.
|
10
|
+
# @param width [Integer] Width to comply with.
|
11
|
+
# @param height [Integer] Height to comply with.
|
12
|
+
def self.resize_image(image, width, height)
|
13
|
+
if width && height
|
14
|
+
image.resize "#{width}x#{height}"
|
15
|
+
elsif width
|
16
|
+
image.resize width.to_s
|
17
|
+
elsif height
|
18
|
+
image.resize "x#{height}"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# Compresses the quality of the provided MiniMagick image.
|
23
|
+
# @param image [MiniMagick::Image] MiniMagick image handle.
|
24
|
+
# @param quality [Integer] Quality to apply to the image. This is independent of a JPG's base quality.
|
25
|
+
def self.compress_image_quality(image, quality)
|
26
|
+
image.quality quality.to_s
|
27
|
+
end
|
28
|
+
|
29
|
+
# Mostly here so that IDEs don't get confused on the type (@type annotation fails sometimes.)
|
30
|
+
# @param [MiniMagick::Image, StringIO, File, Tempfile] image The input image
|
31
|
+
# @return [MiniMagick::Image]
|
32
|
+
def self.to_image(image)
|
33
|
+
if image.respond_to?(:read) && image.respond_to?(:rewind)
|
34
|
+
image.rewind
|
35
|
+
MiniMagick::Image.read(image)
|
36
|
+
elsif image.is_a?(MiniMagick::Image)
|
37
|
+
image
|
38
|
+
else
|
39
|
+
raise "Expected an I/O object or a MiniMagick::Image. '#{image.class}' given instead."
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
# Converts a MiniMagick image into a StringIO containing the encoded image.
|
44
|
+
# @param image [MiniMagick::Image] the input image.
|
45
|
+
# @param format [String] Format parameter, left open for the future, but should be JPEG for current use-cases.
|
46
|
+
# @return [StringIO]
|
47
|
+
def self.image_to_stringio(image, format = 'JPEG')
|
48
|
+
image.format format
|
49
|
+
blob = image.to_blob
|
50
|
+
stringio = StringIO.new(blob)
|
51
|
+
stringio.rewind
|
52
|
+
|
53
|
+
stringio
|
54
|
+
end
|
55
|
+
|
56
|
+
# Computes the new dimensions for a given MiniMagick image, scaled to fit within the
|
57
|
+
# provided bounds.
|
58
|
+
# @param [MiniMagick::Image] original Input MiniMagick image.
|
59
|
+
# @param max_width [Integer] Maximum width. If not specified, the horizontal ratio will remain the same.
|
60
|
+
# @param max_height [Integer] Maximum height. If not specified, the vertical ratio will remain the same.
|
61
|
+
def self.calculate_new_dimensions(original, max_width: nil, max_height: nil)
|
62
|
+
raise 'Provided image could not be processed for resizing.' if original.nil?
|
63
|
+
|
64
|
+
return [original.width, original.height] if max_width.nil? && max_height.nil?
|
65
|
+
|
66
|
+
width_ratio = max_width ? max_width.to_f / original.width : Float::INFINITY
|
67
|
+
height_ratio = max_height ? max_height.to_f / original.height : Float::INFINITY
|
68
|
+
|
69
|
+
scale_factor = [width_ratio, height_ratio].min
|
70
|
+
|
71
|
+
new_width = (original.width * scale_factor).to_i
|
72
|
+
new_height = (original.height * scale_factor).to_i
|
73
|
+
|
74
|
+
[new_width, new_height]
|
75
|
+
end
|
76
|
+
|
77
|
+
# Computes the Height & Width from a page's media box. Falls back to the size of the initial image.
|
78
|
+
# @param image [MiniMagick::Image] The initial image that will fit into the page.
|
79
|
+
# @param media_box [Array<Integer>, nil]
|
80
|
+
# @return [Array<Integer>]
|
81
|
+
def self.calculate_dimensions_from_media_box(image, media_box)
|
82
|
+
if !media_box.nil? && media_box.any?
|
83
|
+
[
|
84
|
+
media_box[2]&.to_i || image[:width].to_i,
|
85
|
+
media_box[3]&.to_i || image[:height].to_i,
|
86
|
+
]
|
87
|
+
else
|
88
|
+
[image[:width].to_i, image[:height].to_i]
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
# Transforms a PDF into a MagickImage. This is currently used for single-page PDFs.
|
93
|
+
# @param pdf_stream [StringIO] Input stream.
|
94
|
+
# @param image_quality [Integer] Quality to apply to the image.
|
95
|
+
# @return [MiniMagick::Image]
|
96
|
+
def self.pdf_to_magick_image(pdf_stream, image_quality)
|
97
|
+
compressed_image = MiniMagick::Image.read(pdf_stream.read)
|
98
|
+
compressed_image.format('jpg')
|
99
|
+
compressed_image.quality image_quality.to_s
|
100
|
+
compressed_image
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
data/lib/mindee/image.rb
ADDED
data/lib/mindee/input/sources.rb
CHANGED
@@ -4,6 +4,7 @@ require 'stringio'
|
|
4
4
|
require 'marcel'
|
5
5
|
|
6
6
|
require_relative '../pdf'
|
7
|
+
require_relative '../image'
|
7
8
|
|
8
9
|
module Mindee
|
9
10
|
module Input
|
@@ -126,6 +127,41 @@ module Mindee
|
|
126
127
|
pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
|
127
128
|
pdf_processor.pages.size
|
128
129
|
end
|
130
|
+
|
131
|
+
# Compresses the file, according to the provided info.
|
132
|
+
# @param [Integer] quality Quality of the output file.
|
133
|
+
# @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
|
134
|
+
# @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
|
135
|
+
# @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
|
136
|
+
# This will attempt to re-render PDF text over the rasterized original. If disabled, ignores the operation.
|
137
|
+
# WARNING: this operation is strongly discouraged.
|
138
|
+
# @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
|
139
|
+
# not. Needs force_source_text to work.
|
140
|
+
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
|
141
|
+
buffer = if pdf?
|
142
|
+
Mindee::PDF::PDFCompressor.compress_pdf(
|
143
|
+
@io_stream,
|
144
|
+
quality: quality,
|
145
|
+
force_source_text_compression: force_source_text,
|
146
|
+
disable_source_text: disable_source_text
|
147
|
+
)
|
148
|
+
else
|
149
|
+
Mindee::Image::ImageCompressor.compress_image(
|
150
|
+
@io_stream,
|
151
|
+
quality: quality,
|
152
|
+
max_width: max_width,
|
153
|
+
max_height: max_height
|
154
|
+
)
|
155
|
+
end
|
156
|
+
@io_stream = buffer
|
157
|
+
@io_stream.rewind
|
158
|
+
end
|
159
|
+
|
160
|
+
# Checks whether the file has source text if it is a PDF. False otherwise.
|
161
|
+
# @return [Boolean] True if the file is a PDF and has source text.
|
162
|
+
def source_text?
|
163
|
+
Mindee::PDF::PDFTools.source_text?(@io_stream)
|
164
|
+
end
|
129
165
|
end
|
130
166
|
|
131
167
|
# Load a document from a path.
|
@@ -18,11 +18,15 @@ module Mindee
|
|
18
18
|
# The textual representation of the date as found on the document.
|
19
19
|
# @return [String, nil]
|
20
20
|
attr_reader :raw
|
21
|
+
# Whether the field was computed or retrieved directly from the document.
|
22
|
+
# @return [Boolean, nil]
|
23
|
+
attr_reader :is_computed
|
21
24
|
|
22
25
|
# @param prediction [Hash]
|
23
26
|
# @param page_id [Integer, nil]
|
24
27
|
def initialize(prediction, page_id)
|
25
28
|
super
|
29
|
+
@is_computed = prediction['is_computed']
|
26
30
|
return unless @value
|
27
31
|
|
28
32
|
@date_object = Date.parse(@value)
|
@@ -31,6 +31,8 @@ module Mindee
|
|
31
31
|
|
32
32
|
# rubocop:disable Metrics/CyclomaticComplexity
|
33
33
|
# rubocop:disable Metrics/PerceivedComplexity
|
34
|
+
|
35
|
+
# String representation.
|
34
36
|
# @return [String]
|
35
37
|
def to_s
|
36
38
|
return "Polygon with #{@polygon.size} points." if @polygon&.size&.positive?
|
@@ -40,6 +42,7 @@ module Mindee
|
|
40
42
|
|
41
43
|
''
|
42
44
|
end
|
45
|
+
|
43
46
|
# rubocop:enable Metrics/CyclomaticComplexity
|
44
47
|
# rubocop:enable Metrics/PerceivedComplexity
|
45
48
|
|