mindee 3.10.0 → 3.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -0
  3. data/bin/mindee.rb +7 -1
  4. data/docs/code_samples/financial_document_v1_async.txt +19 -0
  5. data/docs/code_samples/us_healthcare_cards_v1_async.txt +19 -0
  6. data/docs/expense_receipts_v5.md +12 -10
  7. data/docs/financial_document_v1.md +51 -22
  8. data/docs/invoices_v4.md +4 -3
  9. data/docs/us_healthcare_cards_v1.md +204 -0
  10. data/lib/mindee/extraction/ocr_extractor.rb +110 -0
  11. data/lib/mindee/extraction/tax_extractor.rb +322 -0
  12. data/lib/mindee/extraction.rb +3 -0
  13. data/lib/mindee/geometry/utils.rb +19 -0
  14. data/lib/mindee/image_extraction/common/extracted_image.rb +73 -0
  15. data/lib/mindee/image_extraction/common/image_extractor.rb +191 -0
  16. data/lib/mindee/image_extraction/common.rb +3 -0
  17. data/lib/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.rb +26 -0
  18. data/lib/mindee/image_extraction/multi_receipts_extractor.rb +3 -0
  19. data/lib/mindee/image_extraction.rb +4 -0
  20. data/lib/mindee/input/sources.rb +8 -0
  21. data/lib/mindee/parsing/common/api_response.rb +1 -1
  22. data/lib/mindee/parsing/common/ocr/mvision_v1.rb +16 -0
  23. data/lib/mindee/parsing/common/ocr/ocr.rb +10 -0
  24. data/lib/mindee/parsing/standard/company_registration_field.rb +17 -0
  25. data/lib/mindee/product/financial_document/financial_document_v1_document.rb +3 -1
  26. data/lib/mindee/product/financial_document/financial_document_v1_line_item.rb +7 -0
  27. data/lib/mindee/product/financial_document/financial_document_v1_page.rb +1 -1
  28. data/lib/mindee/product/international_id/international_id_v2_document.rb +1 -1
  29. data/lib/mindee/product/international_id/international_id_v2_page.rb +1 -1
  30. data/lib/mindee/product/invoice/invoice_v4_document.rb +3 -1
  31. data/lib/mindee/product/invoice/invoice_v4_line_item.rb +7 -0
  32. data/lib/mindee/product/invoice/invoice_v4_page.rb +1 -1
  33. data/lib/mindee/product/multi_receipts_detector/multi_receipts_detector_v1_document.rb +1 -1
  34. data/lib/mindee/product/multi_receipts_detector/multi_receipts_detector_v1_page.rb +1 -1
  35. data/lib/mindee/product/receipt/receipt_v5_document.rb +1 -1
  36. data/lib/mindee/product/receipt/receipt_v5_page.rb +1 -1
  37. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1.rb +41 -0
  38. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_copay.rb +57 -0
  39. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_document.rb +127 -0
  40. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_page.rb +34 -0
  41. data/lib/mindee/product.rb +1 -0
  42. data/lib/mindee/version.rb +1 -1
  43. data/lib/mindee.rb +5 -0
  44. data/mindee.gemspec +1 -0
  45. metadata +32 -2
@@ -0,0 +1,322 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'ocr_extractor'
4
+
5
+ module Mindee
6
+ module Extraction
7
+ # Tax extractor class
8
+ class TaxExtractor < OcrExtractor
9
# Selects the highest-scoring candidate among those whose code matches a known tax name.
# When no candidate qualifies, the first candidate of the list is returned.
# @param candidates [Array<Hash>] candidates for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Hash, nil] the retained candidate, or nil when there are no candidates at all.
def self.pick_best(candidates, tax_names)
  return nil if candidates.empty?
  return candidates[0] if candidates.size == 1

  best_index = 0
  best_score = 0

  candidates.each_with_index do |candidate, position|
    next unless valid_candidate?(candidate, tax_names)

    score = calculate_score(candidate, position)
    next unless best_score < score

    best_score = score
    best_index = position
  end

  candidates[best_index]
end
33
+
34
# Tells whether the candidate's code is one of the accepted tax names, ignoring case and accents.
# Shouldn't fail except in case of very specific regex breaks due to unsupported diacritics.
# @param candidate [Hash] A candidate for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Boolean]
def self.valid_candidate?(candidate, tax_names)
  return false if tax_names.empty? || candidate.nil? || candidate['code'].nil?

  normalized_code = remove_accents(candidate['code'].downcase)
  tax_names.any? { |tax_name| remove_accents(tax_name.downcase) == normalized_code }
end
47
+
48
# [Experimental] computes the score of a valid candidate for a tax.
# The score starts at `index + 1`, so candidates found later start higher. A rate adds one point
# when it is at most 30, nothing between 30 and 100, and costs two points above 100. A value adds
# four points and a base adds one.
# @param candidate [Hash] A candidate for the tax.
# @param index [Integer] Position of the candidate in the candidates list.
# @return [Integer]
def self.calculate_score(candidate, index)
  rate = candidate['rate']
  rate_points =
    if rate.nil? then 0
    elsif rate > 100 then -2
    elsif rate > 30 then 0
    else
      1
    end
  value_points = candidate['value'].nil? ? 0 : 4
  base_points = candidate['base'].nil? ? 0 : 1

  index + 1 + rate_points + value_points + base_points
end
62
+
63
# Curates tax values based on simple rules to avoid improbable data.
# The given hash is left untouched; a new hash with the keys 'code', 'page_id', 'rate', 'base'
# and 'value' is always returned.
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash]
def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
  reconstructed_hash = { 'code' => nil, 'page_id' => nil, 'rate' => nil, 'base' => nil, 'value' => nil }
  return reconstructed_hash if found_hash.nil?

  # Work on a copy so the caller's candidate is not modified by the corrections below.
  found_hash = found_hash.dup

  # Keep track of the page the tax was found on, otherwise the created field has no page.
  reconstructed_hash['page_id'] = found_hash['page_id']
  # Strip trailing dots and whitespace from the code.
  reconstructed_hash['code'] = found_hash['code']&.sub(%r{\s*\.*\s*$}, '')

  # A rate strictly between 0 and 1 is read as a fraction and converted to a percentage.
  rate = found_hash['rate']
  found_hash['rate'] = rate * 100 if rate && rate < 1 && rate.positive?

  found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  found_hash = decimate_rates_if_needed(found_hash)
  found_hash = fix_rate(found_hash)
  reconstructed_hash['rate'] = found_hash['rate']
  set_base_and_value(reconstructed_hash, found_hash)
end
85
+
86
# Exchanges an out-of-bounds rate with the base, or failing that the value, when that other
# figure is itself a valid percentage. The hash is modified in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash]
def self.swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  rate = found_hash['rate']
  return found_hash unless rate && (rate > max_rate_percentage || rate < min_rate_percentage)

  # The base is tried first; the value is only considered when the base is not a valid percentage.
  swap_key = %w[base value].find do |key|
    valid_percentage?(found_hash[key], min_rate_percentage, max_rate_percentage)
  end
  found_hash['rate'], found_hash[swap_key] = found_hash[swap_key], rate if swap_key

  found_hash
end
101
+
102
# Rates can't be negative if set: replaces the rate with its absolute value, in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash]
def self.fix_rate(found_hash)
  rate = found_hash['rate']
  return found_hash if rate.nil?

  found_hash['rate'] = rate.abs
  found_hash
end
108
+
109
# When the rate exceeds 100, exchanges it with the base, or failing that the value, provided
# that other figure is set and smaller than the rate. The hash is modified in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash]
def self.decimate_rates_if_needed(found_hash)
  rate = found_hash['rate']
  return found_hash unless rate && rate > 100

  # The base is tried first; the value is only considered when the base does not qualify.
  swap_key = %w[base value].find { |key| !found_hash[key].nil? && rate > found_hash[key] }
  found_hash['rate'], found_hash[swap_key] = found_hash[swap_key], rate if swap_key

  found_hash
end
122
+
123
# Copies the base and the value into the reconstructed hash.
# When both are set and the base is smaller than the value, the two are exchanged so that the
# base ends up being the larger figure. A missing base or value is copied as nil.
# @param reconstructed_hash [Hash] Hash being reconstructed with new values
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the reconstructed hash.
def self.set_base_and_value(reconstructed_hash, found_hash)
  base = found_hash['base']
  value = found_hash['value']
  # Only compare when both figures exist: comparing a number with nil raises an ArgumentError.
  base, value = value, base if !base.nil? && !value.nil? && base < value

  reconstructed_hash['base'] = base
  reconstructed_hash['value'] = value
  reconstructed_hash
end
139
+
140
# Extracts a single custom type of tax.
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
# The horizontal reading of the OCR lines is tried first; the vertical reading is used when it
# yields no value.
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
# @param tax_names [Array<String>] list of all possible names the tax can have. Not modified.
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Mindee::Parsing::Standard::TaxField, nil] nil when no OCR result or no tax name is given.
def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_rate_percentage = 100)
  return nil if tax_names.nil? || tax_names.empty?
  # The extraction reads `ocr_result.mvision_v1`; anything that can't provide it (nil included) is rejected.
  return nil unless ocr_result.respond_to?(:mvision_v1)

  # Sort a copy: the caller's array may be frozen or reused.
  sorted_names = tax_names.sort
  found_hash = pick_best(extract_horizontal_tax(ocr_result, sorted_names), sorted_names)
  # a tax is considered found horizontally if it has a value, otherwise it is vertical
  if found_hash.nil? || found_hash['value'].nil?
    found_hash = extract_vertical_tax(ocr_result, sorted_names, found_hash)
  end
  found_hash = curate_values(found_hash, min_rate_percentage, max_rate_percentage)

  return if found_hash.nil? || found_hash.empty?

  create_tax_field(found_hash)
end
163
+
164
# Builds a tax field out of the retrieved values, passing along the page ID stored in the hash
# (nil when the hash holds none).
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Mindee::Parsing::Standard::TaxField]
def self.create_tax_field(found_hash)
  page_id = found_hash.fetch('page_id', nil)
  Mindee::Parsing::Standard::TaxField.new(found_hash, page_id)
end
173
+
174
# Reads the tax code and rate out of the first two capture groups and stores them in found_hash.
# Which group holds which depends on whether the percentage precedes the tax name.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash]
def self.extract_percentage_from_tax(matches, found_hash, percent_first)
  rate_group, code_group = percent_first ? [1, 2] : [2, 1]
  code = matches[code_group]
  rate = matches[rate_group]

  found_hash['code'] = code.strip unless code.nil?
  found_hash['rate'] = parse_amount(rate.gsub('%', '')) unless rate.nil?
  found_hash
end
189
+
190
+ # rubocop:disable Metrics/CyclomaticComplexity
191
+ # rubocop:disable Metrics/PerceivedComplexity
192
+
193
# Reads the basis and value of a tax from capture groups 3 and 4.
# A single captured amount is stored as the value, whichever group it sits in. When both groups
# are captured, group 3 is the base and group 4 the value.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash]
def self.extract_basis_and_value(matches, found_hash)
  first_amount = matches[3]
  second_amount = matches[4]
  return found_hash if first_amount.nil? && second_amount.nil?

  if second_amount.nil?
    found_hash['value'] = parse_amount(first_amount)
  elsif first_amount.nil?
    found_hash['value'] = parse_amount(second_amount)
  else
    found_hash['base'] = parse_amount(first_amount)
    found_hash['value'] = parse_amount(second_amount)
  end
  found_hash
end
208
+
209
+ # rubocop:enable Metrics/CyclomaticComplexity
210
+ # rubocop:enable Metrics/PerceivedComplexity
211
+
212
# Extracts tax information from a horizontal line.
# The returned hash always carries the page ID; the other entries are only filled in when the
# pattern matches the line.
# @param line [String] Line to be processed.
# @param pattern [Regexp] RegEx pattern to search the line with.
# @param page_id [Integer] ID of the page the line was found on.
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash]
def self.extract_tax_from_horizontal_line(line, pattern, page_id, percent_first)
  matches = line.match(pattern)
  found_hash = { 'page_id' => page_id }
  return found_hash if matches.nil?

  with_rate = extract_percentage_from_tax(matches, found_hash, percent_first)
  extract_basis_and_value(matches, with_rate)
end
230
+
231
+ # rubocop:disable Metrics/CyclomaticComplexity
232
+ # rubocop:disable Metrics/PerceivedComplexity
233
+
234
# Scans every OCR line of every page for a tax name and collects the candidates read from it.
# The list always starts with an empty placeholder candidate. Each line holding a tax name is
# read once with the "name then percentage" pattern, starting at the name, and once with the
# "percentage then name" pattern, starting at the percentage when the line contains a '%'.
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] Processed OCR results.
# @param tax_names [Array<String>] Possible tax names candidates.
# @return [Array<Hash>]
def self.extract_horizontal_tax(ocr_result, tax_names)
  linear_pattern_percent_first = %r{
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x
  linear_pattern_percent_second = %r{
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x
  candidates = [{ 'code' => nil, 'value' => nil, 'base' => nil, 'rate' => nil }]

  ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
    page.all_lines.each do |line|
      # Drop brackets and stray punctuation, then currency symbols, then squeeze dot leaders and spaces.
      unpunctuated = line.to_s.scrub.gsub(%r{[+(\[)\]¿?*_]}, '')
      clean_line = remove_currency_symbols(unpunctuated).gsub(%r{\.{2,}}, ' ').gsub(%r{ +}, ' ').strip

      name_index = match_index(clean_line, tax_names)
      next if name_index.nil?

      if clean_line.match(linear_pattern_percent_second)
        candidates << extract_tax_from_horizontal_line(clean_line[name_index..],
                                                       linear_pattern_percent_second, page_id, false)
      end
      next unless clean_line.match(linear_pattern_percent_first)

      percent_first_line =
        if clean_line.include?('%')
          clean_line[clean_line.index(%r{\d*[.,]?\d* ?%})..]
        else
          clean_line
        end
      candidates << extract_tax_from_horizontal_line(percent_first_line,
                                                     linear_pattern_percent_first, page_id, true)
    end
  end
  candidates
end
274
+ # rubocop:enable Metrics/CyclomaticComplexity
275
+ # rubocop:enable Metrics/PerceivedComplexity
276
+
277
# Reads the amounts found on a vertically reconstructed line and stores them in found_hash.
# A lone amount becomes the value when the hash has no 'value' key yet. Otherwise the first
# amount fills a missing rate and the second fills a missing value.
# @param line [Mindee::Parsing::Common::Ocr::OcrLine] Reconstructed line, iterated word by word.
# @param found_hash [Hash] Hash containing previously found values, if any.
# @return [Hash]
def self.extract_vertical_tax_values(line, found_hash)
  amounts = []
  line.each do |reconstructed_word|
    amount = parse_amount(reconstructed_word.text)
    amounts << amount unless amount.nil?
  end

  if amounts.size == 1 && !found_hash.key?('value')
    found_hash['value'] = amounts.first
    return found_hash
  end

  found_hash['rate'] = amounts[0] if found_hash['rate'].nil?
  found_hash['value'] = amounts[1] if found_hash['value'].nil?
  found_hash
end
294
+
295
# Looks for tax data laid out vertically: for every word matching a tax name, the line is
# reconstructed vertically from the word's polygon and its amounts are read.
# The first matching word sets the page ID and the code when these are still missing.
# @param ocr_result [Mindee::Parsing::Common::Ocr] OCR raw results
# @param tax_names [Array<String>] Array of possible names a tax can have
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @return [Hash]
def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
  collected = found_hash.nil? ? { 'code' => nil, 'page_id' => nil } : found_hash

  ocr_result.mvision_v1.pages.each_with_index do |page, page_index|
    page.all_words.each do |word|
      next if match_index(word.text, tax_names).nil?

      vertical_line = ocr_result.reconstruct_vertically(word.polygon, page_index)
      collected['page_id'] = page_index if collected['page_id'].nil?
      collected['code'] = word.text.strip if collected['code'].nil?
      collected = extract_vertical_tax_values(vertical_line, collected)
    end
  end
  collected
end
314
+
315
# Every class method other than extract_custom_tax is an internal helper.
private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
                     :extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
                     :create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
                     :decimate_rates_if_needed, :set_base_and_value, :valid_candidate?,
                     :swap_rates_if_needed
320
+ end
321
+ end
322
+ end
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'extraction/tax_extractor'
@@ -77,5 +77,24 @@ module Mindee
77
77
  coords = points.map(&:x)
78
78
  MinMax.new(coords.min, coords.max)
79
79
  end
80
+
81
+ # Checks whether a set of coordinates is below another on the page, with a slight margin for the lateral value.
82
+ # @param candidate [Array<Mindee::Geometry::Point] Polygon to check
83
+ # @param anchor [Array<Mindee::Geometry::Point] Reference polygon
84
+ # @param margin_left [Float] Margin tolerance on the left of the anchor
85
+ # @param margin_right [Float] Margin tolerance on the right of the anchor
86
+ def self.below?(candidate, anchor, margin_left, margin_right)
87
+ return false if Geometry.get_min_max_y(candidate).min < Geometry.get_min_max_y(anchor).min
88
+ if Geometry.get_min_max_x(candidate).min <
89
+ Geometry.get_min_max_x(anchor).min - (Geometry.get_min_max_x(anchor).min * margin_left)
90
+ return false
91
+ end
92
+ if Geometry.get_min_max_x(candidate).max >
93
+ Geometry.get_min_max_x(anchor).max + (Geometry.get_min_max_x(anchor).max * margin_right)
94
+ return false
95
+ end
96
+
97
+ true
98
+ end
80
99
  end
81
100
  end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../input/sources'
4
+
5
+ module Mindee
6
+ # Image Extraction Module.
7
+ module ImageExtraction
8
# Generic class for image extraction.
# Holds the bytes of an extracted image together with the page and element it comes from.
class ExtractedImage
  # Id of the page the image was extracted from.
  attr_reader :page_id

  # Id of the element on a given page.
  attr_reader :element_id

  # Buffer object of the file's content.
  attr_reader :buffer

  # Internal name for the file.
  attr_reader :internal_file_name

  # Initializes the ExtractedImage with a buffer and an internal file name.
  # The internal name is "<source filename>_p<page_id>_<element_id>.<extension>", where the
  # extension is 'jpg' for PDF sources and the source's own extension otherwise.
  #
  # @param input_source [LocalInputSource] Local source for input.
  # @param page_id [Integer] ID of the page the element was found on.
  # @param element_id [Integer, nil] ID of the element in a page. Defaults to 0 when nil.
  def initialize(input_source, page_id, element_id)
    source_stream = input_source.io_stream
    # Read from the start, whatever was consumed from the stream beforehand.
    source_stream.rewind
    @buffer = StringIO.new(source_stream.read)
    @buffer.rewind
    @page_id = page_id
    @element_id = element_id.nil? ? 0 : element_id
    # `File.extname` keeps the leading dot, which the name template below already provides.
    extension = input_source.pdf? ? 'jpg' : File.extname(input_source.filename).delete_prefix('.')
    @internal_file_name = "#{input_source.filename}_p#{@page_id}_#{@element_id}.#{extension}"
  end

  # Saves the document to a file.
  #
  # @param output_path [String] Path to save the file to.
  # @param file_format [String, nil] Optional MiniMagick-compatible format for the file. Inferred from file
  # extension if not provided.
  # @raise [RuntimeError] If an invalid path or filename is provided, or if the file can't be written.
  def save_to_file(output_path, file_format = nil)
    # `expand_path` is used rather than `realpath`: the latter fails when the target doesn't exist yet.
    resolved_path = Pathname.new(File.expand_path(output_path))
    if file_format.nil?
      inferred_format = resolved_path.extname.delete('.')
      raise ArgumentError, 'Invalid file format.' if inferred_format.empty?

      file_format = inferred_format.upcase
    end
    @buffer.rewind
    image = MiniMagick::Image.read(@buffer)
    image.format file_format.downcase
    image.write resolved_path.to_s
    # NOTE(review): `logger` is not defined in this class; confirm it is provided elsewhere,
    # otherwise the rescue below reports a failure after the file has been written.
    logger.info("File saved successfully to '#{resolved_path}'.")
  rescue TypeError
    raise 'Invalid path/filename provided.'
  rescue StandardError
    raise "Could not save file #{Pathname.new(output_path).basename}."
  end

  # Return the file as a Mindee-compatible bytes input source, named after the internal file name.
  #
  # @return [Mindee::Input::Source::BytesInputSource]
  def as_source
    @buffer.rewind
    Mindee::Input::Source::BytesInputSource.new(@buffer.read, @internal_file_name)
  end
end
72
+ end
73
+ end
@@ -0,0 +1,191 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mini_magick'
4
+ require 'origami'
5
+ require 'stringio'
6
+ require 'tempfile'
7
+ require_relative '../../input/sources'
8
+ require_relative 'extracted_image'
9
+
10
+ module Mindee
11
+ # Image Extraction Module.
12
+ module ImageExtraction
13
# Attaches an image as a new page in a PdfDocument object.
# The image is normalized to JPEG, converted to PDF, and the result is read back with Origami.
#
# @param [StringIO] input_buffer Input buffer. Only supports JPEG.
# @return [Origami::PDF] A PdfDocument handle.
def attach_image_as_new_file(input_buffer)
  magick_image = MiniMagick::Image.read(input_buffer)
  # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
  # converted.
  magick_image.format('jpg')
  original_density = magick_image.resolution
  # No clue why, but the resolution needs to be reduced for the pdf, otherwise the resulting image shrinks.
  # NOTE(review): 4.166666 is close to 300/72 — presumably a DPI ratio, to be confirmed.
  scale_factor = original_density[0].to_f / 4.166666
  magick_image.format('pdf', 0, { density: scale_factor.to_s })
  io_buffer = StringIO.new
  magick_image.write(io_buffer)
  # Writing leaves the cursor at the end of the buffer; go back to the start before parsing it.
  io_buffer.rewind
  Origami::PDF.read(io_buffer)
end
31
+
32
# Loads the requested page of a local input source, then extracts one image per polygon from it.
#
# @param [Mindee::Input::Source::LocalInputSource] input_source
# @param [Integer] page_id ID of the Page to extract from.
# @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
# to extract.
# @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
def extract_multiple_images_from_source(input_source, page_id, polygons)
  page_stream = load_doc(input_source, page_id).tap { |stream| stream.seek(0) }

  extract_images_from_polygons(input_source, page_stream, page_id, polygons)
end
45
+
46
# Retrieves a single page of a PDF document.
# The document is saved to an in-memory buffer, which is then handed to the PDF processor
# together with the zero-based index of the requested page.
#
# @param [Origami::PDF] pdf_doc Origami PDF handle.
# @param [Integer] page_id Page ID, starting at 1.
# @return the result of `Mindee::PDF::PdfProcessor.parse` for that page.
def get_page(pdf_doc, page_id)
  pdf_buffer = StringIO.new
  pdf_doc.save(pdf_buffer)

  Mindee::PDF::PdfProcessor.parse(pdf_buffer, { page_indexes: [page_id - 1] })
end
60
+
61
# Extracts images from their positions on a file (as polygons).
# For each polygon the page is read afresh, cropped to the polygon's extent, written to a new
# buffer and wrapped into an extracted image. The element ID is the polygon's index in the list.
#
# @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
# @param [StringIO] pdf_stream Buffer of the PDF.
# @param [Integer] page_id Page ID.
# @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
# @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
def extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
  polygons.each_with_index.map do |polygon, element_id|
    box = normalize_polygon(polygon)
    page_content = read_page_content(pdf_stream)

    corners = [box.top_left, box.bottom_right, box.top_right, box.bottom_left]
    min_max_x = Geometry.get_min_max_x(corners)
    min_max_y = Geometry.get_min_max_y(corners)

    file_extension = determine_file_extension(input_source)
    cropped_image = crop_image(page_content, min_max_x, min_max_y)
    cropped_image.format(file_extension == 'pdf' ? 'jpg' : file_extension)

    buffer = StringIO.new
    write_image_to_buffer(cropped_image, buffer)
    file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"

    create_extracted_image(buffer, file_name, page_id, element_id)
  end
end
104
+
105
# Replaces a polygon by its bounding box. Anything that isn't a Mindee::Geometry::Polygon is
# returned unchanged.
#
# @param [Array<Point>, Mindee::Geometry::Polygon] polygon
def normalize_polygon(polygon)
  return polygon unless polygon.is_a?(Mindee::Geometry::Polygon)

  Mindee::Geometry.get_bounding_box(polygon)
end
115
+
116
# Loads a buffer into a MiniMagick Image, reading it from its start.
#
# @param [StringIO] pdf_stream Buffer containing the PDF
# @return [MiniMagick::Image] a valid MiniMagick image handle.
def read_page_content(pdf_stream)
  MiniMagick::Image.read(pdf_stream.tap(&:rewind))
end
124
+
125
# Crops a MiniMagick Image to the given bounding box, converting it to JPEG first.
# The min/max values are multiplied by the image's width and height to obtain the crop geometry.
#
# @param [MiniMagick::Image] image Input Image. Modified in place.
# @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
# @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
# @return [MiniMagick::Image] the same image handle.
def crop_image(image, min_max_x, min_max_y)
  page_width = image[:width].to_i
  page_height = image[:height].to_i

  image.format('jpg')
  crop_width = (min_max_x.max - min_max_x.min) * page_width
  crop_height = (min_max_y.max - min_max_y.min) * page_height
  x_offset = min_max_x.min * page_width
  y_offset = min_max_y.min * page_height
  image.crop("#{crop_width}x#{crop_height}+#{x_offset}+#{y_offset}")

  image
end
141
+
142
# Writes a MiniMagick::Image to a buffer by delegating to the image's own `write`.
#
# @param [MiniMagick::Image] image a valid MiniMagick image.
# @param [StringIO] buffer Destination buffer; its cursor is not rewound here.
def write_image_to_buffer(image, buffer)
  image.write(buffer)
end
149
+
150
# Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.
# The extension is lower-cased and returned without its leading dot; a filename without an
# extension yields nil.
#
# @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
# @return [String, nil] A file extension.
def determine_file_extension(input_source)
  return 'jpg' if input_source.pdf? || input_source.filename.downcase.end_with?('pdf')

  File.extname(input_source.filename).strip.downcase[1..]
end
161
+
162
# Generates an ExtractedImage from the full content of a buffer, wrapped in a bytes input source.
#
# @param [StringIO] buffer Buffer containing the image.
# @param [String] file_name Name for the file.
# @param [Object] page_id ID of the page the file was generated from.
# @param [Object] element_id ID of the element of a given page.
# @return [ExtractedImage]
def create_extracted_image(buffer, file_name, page_id, element_id)
  buffer.rewind
  image_source = Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name)

  ExtractedImage.new(image_source, page_id, element_id)
end
176
+
177
# Loads a single page from an image file or a pdf document.
# The input's stream is rewound first. Non-PDF inputs are returned as that stream; PDF inputs
# are opened with Origami and narrowed down to the requested page.
#
# @param input_file [LocalInputSource] Local input.
# @param [Integer] page_id Page ID.
# @return the input's own stream, or the result of `get_page` for PDF inputs.
def load_doc(input_file, page_id)
  input_file.io_stream.rewind
  return input_file.io_stream unless input_file.pdf?

  get_page(Origami::PDF.read(input_file.io_stream), page_id)
end
190
+ end
191
+ end
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'common/image_extractor'
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Image Extraction Module.
5
+ module ImageExtraction
6
# Extracts individual receipts from multi-receipts documents.
# Pages of the source for which the inference holds no page are skipped.
#
# @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
# @param inference [Inference] Results of the inference.
# @return [Array<ExtractedImage>] Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
# @raise [RuntimeError] When the inference's prediction holds no receipts.
def extract_receipts(input_source, inference)
  images = []
  raise 'No possible receipts candidates found for MultiReceipts extraction.' unless inference.prediction.receipts

  (0...input_source.count_pdf_pages).each do |page_id|
    page = inference.pages[page_id]
    # The source can hold more pages than the inference does.
    next if page.nil?

    receipt_positions = page.prediction.receipts.map(&:bounding_box)
    # Extraction expects page IDs starting at 1.
    images.concat(
      extract_multiple_images_from_source(input_source, page_id + 1, receipt_positions)
    )
  end

  images
end
25
+ end
26
+ end
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'multi_receipts_extractor/multi_receipts_extractor'
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'image_extraction/common'
4
+ require_relative 'image_extraction/multi_receipts_extractor'
@@ -118,6 +118,14 @@ module Mindee
118
118
  @io_stream.close if close
119
119
  ['document', data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
120
120
  end
121
+
122
# Counts the pages of the input. Anything that isn't a PDF counts as a single page.
# For PDFs the stream is read from its start by the PDF processor.
# @return [Integer]
def count_pdf_pages
  return 1 unless pdf?

  @io_stream.seek(0)
  Mindee::PDF::PdfProcessor.open_pdf(@io_stream).pages.size
end
121
129
  end
122
130
 
123
131
  # Load a document from a path.