mindee 3.13.0 → 3.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/docs/bill_of_lading_v1.md +50 -1
  4. data/docs/energy_bill_fra_v1.md +61 -1
  5. data/docs/expense_receipts_v5.md +4 -4
  6. data/docs/financial_document_v1.md +14 -0
  7. data/docs/invoices_v4.md +16 -2
  8. data/docs/nutrition_facts_v1.md +80 -1
  9. data/docs/payslip_fra_v2.md +77 -1
  10. data/docs/us_mail_v2.md +1 -1
  11. data/examples/auto_invoice_splitter_extraction.rb +36 -31
  12. data/examples/auto_multi_receipts_detector_extraction.rb +31 -0
  13. data/lib/mindee/client.rb +1 -0
  14. data/lib/mindee/extraction/common/extracted_image.rb +1 -2
  15. data/lib/mindee/extraction/common/image_extractor.rb +147 -159
  16. data/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb +22 -16
  17. data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +3 -1
  18. data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +1 -0
  19. data/lib/mindee/geometry/point.rb +2 -1
  20. data/lib/mindee/image/image_compressor.rb +29 -0
  21. data/lib/mindee/image/image_utils.rb +104 -0
  22. data/lib/mindee/image.rb +4 -0
  23. data/lib/mindee/input/sources.rb +36 -0
  24. data/lib/mindee/parsing/standard/date_field.rb +4 -0
  25. data/lib/mindee/parsing/standard/position_field.rb +3 -0
  26. data/lib/mindee/pdf/pdf_compressor.rb +117 -0
  27. data/lib/mindee/pdf/{pdf_processing.rb → pdf_processor.rb} +17 -0
  28. data/lib/mindee/pdf/pdf_tools.rb +100 -0
  29. data/lib/mindee/pdf.rb +3 -1
  30. data/lib/mindee/product/financial_document/financial_document_v1_document.rb +11 -1
  31. data/lib/mindee/product/financial_document/financial_document_v1_page.rb +1 -1
  32. data/lib/mindee/product/invoice/invoice_v4_document.rb +11 -1
  33. data/lib/mindee/product/invoice/invoice_v4_page.rb +1 -1
  34. data/lib/mindee/version.rb +1 -1
  35. data/lib/mindee.rb +10 -0
  36. data/mindee.gemspec +2 -1
  37. metadata +32 -7
@@ -9,180 +9,168 @@ require_relative 'extracted_image'
9
9
 
10
10
  module Mindee
11
11
  # Image Extraction Module.
12
- module ImageExtraction
13
- def self.attach_image_as_new_file(input_buffer)
14
- # Attaches an image as a new page in a PdfDocument object.
15
- #
16
- # @param [StringIO] input_buffer Input buffer. Only supports JPEG.
17
- # @return [Origami::PDF] A PdfDocument handle.
18
-
19
- magick_image = MiniMagick::Image.read(input_buffer)
20
- # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
21
- # converted.
22
- magick_image.format('jpg')
23
- original_density = magick_image.resolution
24
- scale_factor = original_density[0].to_f / 4.166666 # No clue why bit the resolution needs to be reduced for
25
- # the pdf otherwise the resulting image shrinks.
26
- magick_image.format('pdf', 0, { density: scale_factor.to_s })
27
- Origami::PDF.read(StringIO.new(magick_image.to_blob))
28
- end
29
-
30
- # Extracts multiple images from a given local input source.
31
- #
32
- # @param [Mindee::Input::Source::LocalInputSource] input_source
33
- # @param [Integer] page_id ID of the Page to extract from.
34
- # @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
35
- # to extract.
36
- # @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
37
- def extract_multiple_images_from_source(input_source, page_id, polygons)
38
- new_stream = load_doc(input_source, page_id)
39
- new_stream.seek(0)
40
-
41
- extract_images_from_polygons(input_source, new_stream, page_id, polygons)
42
- end
12
+ module Extraction
13
+ # Image Extraction wrapper module.
14
+ module ImageExtractor
15
+ def self.attach_image_as_new_file(input_buffer, format: 'jpg')
16
+ # Attaches an image as a new page in a PdfDocument object.
17
+ #
18
+ # @param [StringIO] input_buffer Input buffer. Only supports JPEG.
19
+ # @return [Origami::PDF] A PdfDocument handle.
20
+
21
+ magick_image = MiniMagick::Image.read(input_buffer)
22
+ # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
23
+ # converted.
24
+ magick_image.format(format)
25
+ original_density = magick_image.resolution
26
+ scale_factor = original_density[0].to_f / 4.166666 # No clue why the resolution needs to be reduced for
27
+ # the pdf otherwise the resulting image shrinks.
28
+ magick_image.format('pdf', 0, { density: scale_factor.to_s })
29
+ Origami::PDF.read(StringIO.new(magick_image.to_blob))
30
+ end
43
31
 
44
- # Retrieves a PDF document's page.
45
- #
46
- # @param [Origami::PDF] pdf_doc Origami PDF handle.
47
- # @param [Integer] page_id Page ID.
48
- def get_page(pdf_doc, page_id)
49
- stream = StringIO.new
50
- pdf_doc.save(stream)
32
+ # Extracts multiple images from a given local input source.
33
+ #
34
+ # @param [Mindee::Input::Source::LocalInputSource] input_source
35
+ # @param [Integer] page_id ID of the Page to extract from.
36
+ # @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
37
+ # to extract.
38
+ # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
39
+ def self.extract_multiple_images_from_source(input_source, page_id, polygons)
40
+ new_stream = load_input_source_pdf_page_as_image(input_source, page_id)
41
+ new_stream.seek(0)
42
+
43
+ extract_images_from_polygons(input_source, new_stream, page_id, polygons)
44
+ end
51
45
 
52
- options = {
53
- page_indexes: [page_id - 1],
54
- }
46
+ # Extracts images from their positions on a file (as polygons).
47
+ #
48
+ # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
49
+ # @param [StringIO] pdf_stream Buffer of the PDF.
50
+ # @param [Integer] page_id Page ID.
51
+ # @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
52
+ # @return [Array<Mindee::Extraction::ExtractedImage>] Extracted Images.
53
+ def self.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
54
+ extracted_elements = []
55
+
56
+ polygons.each_with_index do |polygon, element_id|
57
+ polygon = normalize_polygon(polygon)
58
+ page_content = read_page_content(pdf_stream)
59
+
60
+ min_max_x = Geometry.get_min_max_x([
61
+ polygon.top_left,
62
+ polygon.bottom_right,
63
+ polygon.top_right,
64
+ polygon.bottom_left,
65
+ ])
66
+ min_max_y = Geometry.get_min_max_y([
67
+ polygon.top_left,
68
+ polygon.bottom_right,
69
+ polygon.top_right,
70
+ polygon.bottom_left,
71
+ ])
72
+ file_extension = determine_file_extension(input_source)
73
+ cropped_image = crop_image(page_content, min_max_x, min_max_y)
74
+ if file_extension == 'pdf'
75
+ cropped_image.format('jpg')
76
+ else
77
+ cropped_image.format(file_extension)
78
+ end
79
+
80
+ buffer = StringIO.new
81
+ write_image_to_buffer(cropped_image, buffer)
82
+ file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"
83
+
84
+ extracted_elements << create_extracted_image(buffer, file_name, page_id, element_id)
85
+ end
55
86
 
56
- Mindee::PDF::PdfProcessor.parse(stream, options)
57
- end
87
+ extracted_elements
88
+ end
58
89
 
59
- # Extracts images from their positions on a file (as polygons).
60
- #
61
- # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
62
- # @param [StringIO] pdf_stream Buffer of the PDF.
63
- # @param [Integer] page_id Page ID.
64
- # @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
65
- # @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
66
- def extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
67
- extracted_elements = []
68
-
69
- polygons.each_with_index do |polygon, element_id|
70
- polygon = normalize_polygon(polygon)
71
- page_content = read_page_content(pdf_stream)
72
-
73
- min_max_x = Geometry.get_min_max_x([
74
- polygon.top_left,
75
- polygon.bottom_right,
76
- polygon.top_right,
77
- polygon.bottom_left,
78
- ])
79
- min_max_y = Geometry.get_min_max_y([
80
- polygon.top_left,
81
- polygon.bottom_right,
82
- polygon.top_right,
83
- polygon.bottom_left,
84
- ])
85
- file_extension = determine_file_extension(input_source)
86
- cropped_image = crop_image(page_content, min_max_x, min_max_y)
87
- if file_extension == 'pdf'
88
- cropped_image.format('jpg')
90
+ # Retrieves the bounding box of a polygon.
91
+ #
92
+ # @param [Array<Point>, Mindee::Geometry::Polygon] polygon
93
+ def self.normalize_polygon(polygon)
94
+ if polygon.is_a?(Mindee::Geometry::Polygon)
95
+ Mindee::Geometry.get_bounding_box(polygon)
89
96
  else
90
- cropped_image.format(file_extension)
97
+ polygon
91
98
  end
92
-
93
- buffer = StringIO.new
94
- write_image_to_buffer(cropped_image, buffer)
95
- file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"
96
-
97
- extracted_elements << create_extracted_image(buffer, file_name, page_id, element_id)
98
99
  end
99
100
 
100
- extracted_elements
101
- end
102
-
103
- # Retrieves the bounding box of a polygon.
104
- #
105
- # @param [Array<Point>, Mindee::Geometry::Polygon] polygon
106
- def normalize_polygon(polygon)
107
- if polygon.is_a?(Mindee::Geometry::Polygon)
108
- Mindee::Geometry.get_bounding_box(polygon)
109
- else
110
- polygon
101
+ # Loads a buffer into a MiniMagick Image.
102
+ #
103
+ # @param [StringIO] pdf_stream Buffer containing the PDF
104
+ # @return [MiniMagick::Image] a valid MiniMagick image handle.
105
+ def self.read_page_content(pdf_stream)
106
+ pdf_stream.rewind
107
+ MiniMagick::Image.read(pdf_stream)
111
108
  end
112
- end
113
109
 
114
- # Loads a buffer into a MiniMagick Image.
115
- #
116
- # @param [StringIO] pdf_stream Buffer containg the PDF
117
- # @return [MiniMagick::Image] a valid MiniMagick image handle.
118
- def read_page_content(pdf_stream)
119
- pdf_stream.rewind
120
- MiniMagick::Image.read(pdf_stream)
121
- end
122
-
123
- # Crops a MiniMagick Image from a the given bounding box.
124
- #
125
- # @param [MiniMagick::Image] image Input Image.
126
- # @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
127
- # @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
128
- def crop_image(image, min_max_x, min_max_y)
129
- width = image[:width].to_i
130
- height = image[:height].to_i
131
-
132
- image.format('jpg')
133
- new_width = (min_max_x.max - min_max_x.min) * width
134
- new_height = (min_max_y.max - min_max_y.min) * height
135
- image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}")
136
-
137
- image
138
- end
110
+ # Crops a MiniMagick Image to the given bounding box.
111
+ #
112
+ # @param [MiniMagick::Image] image Input Image.
113
+ # @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
114
+ # @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
115
+ def self.crop_image(image, min_max_x, min_max_y)
116
+ width = image[:width].to_i
117
+ height = image[:height].to_i
118
+
119
+ image.format('jpg')
120
+ new_width = (min_max_x.max - min_max_x.min) * width
121
+ new_height = (min_max_y.max - min_max_y.min) * height
122
+ image.crop("#{new_width}x#{new_height}+#{min_max_x.min * width}+#{min_max_y.min * height}")
123
+
124
+ image
125
+ end
139
126
 
140
- # Writes a MiniMagick::Image to a buffer.
141
- #
142
- # @param [MiniMagick::Image] image a valid MiniMagick image.
143
- # @param [StringIO] buffer
144
- def write_image_to_buffer(image, buffer)
145
- image.write(buffer)
146
- end
127
+ # Writes a MiniMagick::Image to a buffer.
128
+ #
129
+ # @param [MiniMagick::Image] image a valid MiniMagick image.
130
+ # @param [StringIO] buffer
131
+ def self.write_image_to_buffer(image, buffer)
132
+ image.write(buffer)
133
+ end
147
134
 
148
- # Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.
149
- #
150
- # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
151
- # @return [String] A valid file extension.
152
- def determine_file_extension(input_source)
153
- if input_source.pdf? || input_source.filename.downcase.end_with?('pdf')
154
- 'jpg'
155
- else
156
- File.extname(input_source.filename).strip.downcase[1..]
135
+ # Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.
136
+ #
137
+ # @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
138
+ # @return [String] A valid file extension.
139
+ def self.determine_file_extension(input_source)
140
+ if input_source.pdf? || input_source.filename.downcase.end_with?('pdf')
141
+ 'jpg'
142
+ else
143
+ File.extname(input_source.filename).strip.downcase[1..]
144
+ end
157
145
  end
158
- end
159
146
 
160
- # Generates an ExtractedImage.
161
- #
162
- # @param [StringIO] buffer Buffer containing the image.
163
- # @param [String] file_name Name for the file.
164
- # @param [Object] page_id ID of the page the file was generated from.
165
- # @param [Object] element_id ID of the element of a given page.
166
- def create_extracted_image(buffer, file_name, page_id, element_id)
167
- buffer.rewind
168
- ExtractedImage.new(
169
- Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name),
170
- page_id,
171
- element_id
172
- )
173
- end
147
+ # Generates an ExtractedImage.
148
+ #
149
+ # @param [StringIO] buffer Buffer containing the image.
150
+ # @param [String] file_name Name for the file.
151
+ # @param [Object] page_id ID of the page the file was generated from.
152
+ # @param [Object] element_id ID of the element of a given page.
153
+ def self.create_extracted_image(buffer, file_name, page_id, element_id)
154
+ buffer.rewind
155
+ ExtractedImage.new(
156
+ Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name),
157
+ page_id,
158
+ element_id
159
+ )
160
+ end
174
161
 
175
- # Loads a single_page from an image file or a pdf document.
176
- #
177
- # @param input_file [LocalInputSource] Local input.
178
- # @param [Integer] page_id Page ID.
179
- # @return [MiniMagick::Image] A valid PdfDocument handle.
180
- def load_doc(input_file, page_id)
181
- input_file.io_stream.rewind
182
- if input_file.pdf?
183
- get_page(Origami::PDF.read(input_file.io_stream), page_id)
184
- else
185
- input_file.io_stream
162
+ # Loads a single_page from an image file or a pdf document.
163
+ #
164
+ # @param input_file [LocalInputSource] Local input.
165
+ # @param [Integer] page_id Page ID.
166
+ # @return [IO, StringIO] Stream holding the requested PDF page, or the input's own I/O stream for non-PDF files.
167
+ def self.load_input_source_pdf_page_as_image(input_file, page_id)
168
+ input_file.io_stream.rewind
169
+ if input_file.pdf?
170
+ Mindee::PDF::PdfProcessor.get_page(Origami::PDF.read(input_file.io_stream), page_id)
171
+ else
172
+ input_file.io_stream
173
+ end
186
174
  end
187
175
  end
188
176
  end
@@ -1,26 +1,32 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative '../common/image_extractor'
4
+
3
5
  module Mindee
4
6
  # Image Extraction Module.
5
- module ImageExtraction
6
- def extract_receipts(input_source, inference)
7
- # Extracts individual receipts from multi-receipts documents.
8
- #
9
- # @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
10
- # @param inference [Inference] Results of the inference.
11
- # @return [Array<ExtractedImage>] Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
7
+ module Extraction
8
+ # Multi-receipts extraction class wrapper.
9
+ class MultiReceiptsExtractor
10
+ def self.extract_receipts(input_source, inference)
11
+ # Extracts individual receipts from multi-receipts documents.
12
+ #
13
+ # @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
14
+ # @param inference [Inference] Results of the inference.
15
+ # @return [Array<ExtractedImage>] Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
12
16
 
13
- images = []
14
- raise 'No possible receipts candidates found for MultiReceipts extraction.' unless inference.prediction.receipts
17
+ images = []
18
+ raise 'No possible receipts candidates found for MultiReceipts extraction.' unless inference.prediction.receipts
15
19
 
16
- (0...input_source.count_pdf_pages).each do |page_id|
17
- receipt_positions = inference.pages[page_id].prediction.receipts.map(&:bounding_box)
18
- images.concat(
19
- extract_multiple_images_from_source(input_source, page_id + 1, receipt_positions)
20
- )
21
- end
20
+ (0...input_source.count_pdf_pages).each do |page_id|
21
+ receipt_positions = inference.pages[page_id].prediction.receipts.map(&:bounding_box)
22
+ images.concat(
23
+ Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, page_id + 1,
24
+ receipt_positions)
25
+ )
26
+ end
22
27
 
23
- images
28
+ images
29
+ end
24
30
  end
25
31
  end
26
32
  end
@@ -13,7 +13,7 @@ module Mindee
13
13
  if local_input.pdf?
14
14
  @source_pdf = local_input.io_stream
15
15
  else
16
- pdf_image = ImageExtraction.attach_image_as_new_file(local_input.io_stream)
16
+ pdf_image = Extraction::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
17
17
  io_buffer = StringIO.new
18
18
  pdf_image.save(io_buffer)
19
19
 
@@ -65,6 +65,7 @@ module Mindee
65
65
 
66
66
  # rubocop:disable Metrics/CyclomaticComplexity
67
67
  # rubocop:disable Metrics/PerceivedComplexity
68
+
68
69
  # Extracts invoices as complete PDFs from the document.
69
70
  # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
70
71
  # @param strict [Boolean]
@@ -99,6 +100,7 @@ module Mindee
99
100
  end
100
101
  extract_sub_documents(correct_page_indexes)
101
102
  end
103
+
102
104
  # rubocop:enable Metrics/CyclomaticComplexity
103
105
  # rubocop:enable Metrics/PerceivedComplexity
104
106
 
@@ -271,6 +271,7 @@ module Mindee
271
271
  end
272
272
  candidates
273
273
  end
274
+
274
275
  # rubocop:enable Metrics/CyclomaticComplexity
275
276
  # rubocop:enable Metrics/PerceivedComplexity
276
277
 
@@ -10,9 +10,10 @@ module Mindee
10
10
  # @return [Float]
11
11
  attr_accessor :y
12
12
 
13
+ # rubocop:disable Naming/MethodParameterName
14
+
13
15
  # @param x [Float]
14
16
  # @param y [Float]
15
- # rubocop:disable Naming/MethodParameterName
16
17
  def initialize(x, y)
17
18
  @x = x
18
19
  @y = y
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Image processing module.
5
+ module Image
6
+ # Image compressor module to handle image compression.
7
+ module ImageCompressor
8
+ # Resize and/or compress an image. This assumes the ratio was provided beforehand.
9
+ # @param image [MiniMagick::Image, StringIO] Input image.
10
+ # @param quality [Integer, nil] Quality of the final file.
11
+ # @param max_width [Integer, nil] Maximum width. If not specified, the horizontal ratio will remain the same.
12
+ # @param max_height [Integer] Maximum height. If not specified, the vertical ratio will remain the same.
13
+ # @return [StringIO]
14
+ def self.compress_image(image, quality: 85, max_width: nil, max_height: nil)
15
+ processed_image = ImageUtils.to_image(image)
16
+ processed_image.format 'jpg'
17
+ final_width, final_height = ImageUtils.calculate_new_dimensions(
18
+ processed_image,
19
+ max_width: max_width,
20
+ max_height: max_height
21
+ )
22
+ ImageUtils.resize_image(processed_image, final_width, final_height) if final_width || final_height
23
+ ImageUtils.compress_image_quality(processed_image, quality)
24
+
25
+ ImageUtils.image_to_stringio(processed_image)
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Image processing module.
5
+ module Image
6
+ # Miscellaneous image operations.
7
+ module ImageUtils
8
+ # Resizes a provided MiniMagick Image with the given width & height, if present.
9
+ # @param image [MiniMagick::Image] MiniMagick image handle.
10
+ # @param width [Integer] Width to comply with.
11
+ # @param height [Integer] Height to comply with.
12
+ def self.resize_image(image, width, height)
13
+ if width && height
14
+ image.resize "#{width}x#{height}"
15
+ elsif width
16
+ image.resize width.to_s
17
+ elsif height
18
+ image.resize "x#{height}"
19
+ end
20
+ end
21
+
22
+ # Compresses the quality of the provided MiniMagick image.
23
+ # @param image [MiniMagick::Image] MiniMagick image handle.
24
+ # @param quality [Integer] Quality to apply to the image. This is independent of a JPG's base quality.
25
+ def self.compress_image_quality(image, quality)
26
+ image.quality quality.to_s
27
+ end
28
+
29
+ # Mostly here so that IDEs don't get confused on the type (@type annotation fails sometimes.)
30
+ # @param [MiniMagick::Image, StringIO, File, Tempfile] image The input image
31
+ # @return [MiniMagick::Image]
32
+ def self.to_image(image)
33
+ if image.respond_to?(:read) && image.respond_to?(:rewind)
34
+ image.rewind
35
+ MiniMagick::Image.read(image)
36
+ elsif image.is_a?(MiniMagick::Image)
37
+ image
38
+ else
39
+ raise "Expected an I/O object or a MiniMagick::Image. '#{image.class}' given instead."
40
+ end
41
+ end
42
+
43
+ # Converts a MiniMagick image into a StringIO containing the image data.
44
+ # @param image [MiniMagick::Image] the input image.
45
+ # @param format [String] Format parameter, left open for the future, but should be JPEG for current use-cases.
46
+ # @return [StringIO]
47
+ def self.image_to_stringio(image, format = 'JPEG')
48
+ image.format format
49
+ blob = image.to_blob
50
+ stringio = StringIO.new(blob)
51
+ stringio.rewind
52
+
53
+ stringio
54
+ end
55
+
56
+ # Computes the new dimensions for a given MiniMagick image, scaled proportionally relative to the
57
+ # provided bounds.
58
+ # @param [MiniMagick::Image] original Input MiniMagick image.
59
+ # @param max_width [Integer] Maximum width. If not specified, the horizontal ratio will remain the same.
60
+ # @param max_height [Integer] Maximum height. If not specified, the vertical ratio will remain the same.
61
+ def self.calculate_new_dimensions(original, max_width: nil, max_height: nil)
62
+ raise 'Provided image could not be processed for resizing.' if original.nil?
63
+
64
+ return [original.width, original.height] if max_width.nil? && max_height.nil?
65
+
66
+ width_ratio = max_width ? max_width.to_f / original.width : Float::INFINITY
67
+ height_ratio = max_height ? max_height.to_f / original.height : Float::INFINITY
68
+
69
+ scale_factor = [width_ratio, height_ratio].min
70
+
71
+ new_width = (original.width * scale_factor).to_i
72
+ new_height = (original.height * scale_factor).to_i
73
+
74
+ [new_width, new_height]
75
+ end
76
+
77
+ # Computes the Height & Width from a page's media box. Falls back to the size of the initial image.
78
+ # @param image [MiniMagick::Image] The initial image that will fit into the page.
79
+ # @param media_box [Array<Integer>, nil]
80
+ # @return [Array<Integer>]
81
+ def self.calculate_dimensions_from_media_box(image, media_box)
82
+ if !media_box.nil? && media_box.any?
83
+ [
84
+ media_box[2]&.to_i || image[:width].to_i,
85
+ media_box[3]&.to_i || image[:height].to_i,
86
+ ]
87
+ else
88
+ [image[:width].to_i, image[:height].to_i]
89
+ end
90
+ end
91
+
92
+ # Transforms a PDF into a MagickImage. This is currently used for single-page PDFs.
93
+ # @param pdf_stream [StringIO] Input stream.
94
+ # @param image_quality [Integer] Quality to apply to the image.
95
+ # @return [MiniMagick::Image]
96
+ def self.pdf_to_magick_image(pdf_stream, image_quality)
97
+ compressed_image = MiniMagick::Image.read(pdf_stream.read)
98
+ compressed_image.format('jpg')
99
+ compressed_image.quality image_quality.to_s
100
+ compressed_image
101
+ end
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'image/image_compressor'
4
+ require_relative 'image/image_utils'
@@ -4,6 +4,7 @@ require 'stringio'
4
4
  require 'marcel'
5
5
 
6
6
  require_relative '../pdf'
7
+ require_relative '../image'
7
8
 
8
9
  module Mindee
9
10
  module Input
@@ -126,6 +127,41 @@ module Mindee
126
127
  pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
127
128
  pdf_processor.pages.size
128
129
  end
130
+
131
+ # Compresses the file, according to the provided info.
132
+ # @param [Integer] quality Quality of the output file.
133
+ # @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
134
+ # @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
135
+ # @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
136
+ # This will attempt to re-render PDF text over the rasterized original. If disabled, the operation is ignored.
137
+ # WARNING: this operation is strongly discouraged.
138
+ # @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
139
+ # not. Needs force_source_text to work.
140
+ def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
141
+ buffer = if pdf?
142
+ Mindee::PDF::PDFCompressor.compress_pdf(
143
+ @io_stream,
144
+ quality: quality,
145
+ force_source_text_compression: force_source_text,
146
+ disable_source_text: disable_source_text
147
+ )
148
+ else
149
+ Mindee::Image::ImageCompressor.compress_image(
150
+ @io_stream,
151
+ quality: quality,
152
+ max_width: max_width,
153
+ max_height: max_height
154
+ )
155
+ end
156
+ @io_stream = buffer
157
+ @io_stream.rewind
158
+ end
159
+
160
+ # Checks whether the file has source text if it is a pdf. False otherwise
161
+ # @return [Boolean] True if the file is a PDF and has source text.
162
+ def source_text?
163
+ Mindee::PDF::PDFTools.source_text?(@io_stream)
164
+ end
129
165
  end
130
166
 
131
167
  # Load a document from a path.
@@ -18,11 +18,15 @@ module Mindee
18
18
  # The textual representation of the date as found on the document.
19
19
  # @return [String, nil]
20
20
  attr_reader :raw
21
+ # Whether the field was computed or retrieved directly from the document.
22
+ # @return [Boolean, nil]
23
+ attr_reader :is_computed
21
24
 
22
25
  # @param prediction [Hash]
23
26
  # @param page_id [Integer, nil]
24
27
  def initialize(prediction, page_id)
25
28
  super
29
+ @is_computed = prediction['is_computed']
26
30
  return unless @value
27
31
 
28
32
  @date_object = Date.parse(@value)
@@ -31,6 +31,8 @@ module Mindee
31
31
 
32
32
  # rubocop:disable Metrics/CyclomaticComplexity
33
33
  # rubocop:disable Metrics/PerceivedComplexity
34
+
35
+ # String representation.
34
36
  # @return [String]
35
37
  def to_s
36
38
  return "Polygon with #{@polygon.size} points." if @polygon&.size&.positive?
@@ -40,6 +42,7 @@ module Mindee
40
42
 
41
43
  ''
42
44
  end
45
+
43
46
  # rubocop:enable Metrics/CyclomaticComplexity
44
47
  # rubocop:enable Metrics/PerceivedComplexity
45
48