mindee 3.10.0 → 3.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/bin/mindee.rb +7 -1
- data/docs/code_samples/financial_document_v1_async.txt +19 -0
- data/docs/code_samples/us_healthcare_cards_v1_async.txt +19 -0
- data/docs/expense_receipts_v5.md +12 -10
- data/docs/financial_document_v1.md +51 -22
- data/docs/invoices_v4.md +4 -3
- data/docs/us_healthcare_cards_v1.md +204 -0
- data/lib/mindee/extraction/ocr_extractor.rb +110 -0
- data/lib/mindee/extraction/tax_extractor.rb +322 -0
- data/lib/mindee/extraction.rb +3 -0
- data/lib/mindee/geometry/utils.rb +19 -0
- data/lib/mindee/image_extraction/common/extracted_image.rb +73 -0
- data/lib/mindee/image_extraction/common/image_extractor.rb +191 -0
- data/lib/mindee/image_extraction/common.rb +3 -0
- data/lib/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.rb +26 -0
- data/lib/mindee/image_extraction/multi_receipts_extractor.rb +3 -0
- data/lib/mindee/image_extraction.rb +4 -0
- data/lib/mindee/input/sources.rb +8 -0
- data/lib/mindee/parsing/common/api_response.rb +1 -1
- data/lib/mindee/parsing/common/ocr/mvision_v1.rb +16 -0
- data/lib/mindee/parsing/common/ocr/ocr.rb +10 -0
- data/lib/mindee/parsing/standard/company_registration_field.rb +17 -0
- data/lib/mindee/product/financial_document/financial_document_v1_document.rb +3 -1
- data/lib/mindee/product/financial_document/financial_document_v1_line_item.rb +7 -0
- data/lib/mindee/product/financial_document/financial_document_v1_page.rb +1 -1
- data/lib/mindee/product/international_id/international_id_v2_document.rb +1 -1
- data/lib/mindee/product/international_id/international_id_v2_page.rb +1 -1
- data/lib/mindee/product/invoice/invoice_v4_document.rb +3 -1
- data/lib/mindee/product/invoice/invoice_v4_line_item.rb +7 -0
- data/lib/mindee/product/invoice/invoice_v4_page.rb +1 -1
- data/lib/mindee/product/multi_receipts_detector/multi_receipts_detector_v1_document.rb +1 -1
- data/lib/mindee/product/multi_receipts_detector/multi_receipts_detector_v1_page.rb +1 -1
- data/lib/mindee/product/receipt/receipt_v5_document.rb +1 -1
- data/lib/mindee/product/receipt/receipt_v5_page.rb +1 -1
- data/lib/mindee/product/us/healthcare_card/healthcare_card_v1.rb +41 -0
- data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_copay.rb +57 -0
- data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_document.rb +127 -0
- data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_page.rb +34 -0
- data/lib/mindee/product.rb +1 -0
- data/lib/mindee/version.rb +1 -1
- data/lib/mindee.rb +5 -0
- data/mindee.gemspec +1 -0
- metadata +32 -2
@@ -0,0 +1,322 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'ocr_extractor'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Extraction
|
7
|
+
# Tax extractor class
|
8
|
+
class TaxExtractor < OcrExtractor
|
9
|
+
# Picks the highest-scoring candidate among those whose code matches a known tax name.
# A lone candidate is returned as-is, without validation. When several candidates are given and
# none of them scores above zero, the first candidate is returned.
# @param candidates [Array<Hash>] candidate hashes for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Hash, nil] the selected candidate, or nil when no candidates were given.
def self.pick_best(candidates, tax_names)
  return candidates[0] if candidates.size == 1
  return nil if candidates.empty?

  best_index = 0
  best_score = 0

  candidates.each.with_index do |candidate, position|
    next unless valid_candidate?(candidate, tax_names)

    score = calculate_score(candidate, position)
    next unless best_score < score

    best_score = score
    best_index = position
  end

  candidates[best_index]
end
|
33
|
+
|
34
|
+
# Tells whether a candidate's code equals one of the accepted tax names, ignoring case and accents.
# Per the original note, a mismatch should only happen on very specific regex breaks caused by
# unsupported diacritics.
# @param candidate [Hash] A candidate for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Boolean] false when there are no names, no candidate or no code.
def self.valid_candidate?(candidate, tax_names)
  return false if tax_names.empty? || candidate.nil? || candidate['code'].nil?

  tax_names.any? do |tax_name|
    remove_accents(tax_name.downcase) == remove_accents(candidate['code'].downcase)
  end
end
|
47
|
+
|
48
|
+
# [Experimental] Scores a candidate: later candidates and more complete ones score higher.
# The score starts at index + 1, gains 1 for a rate, 4 for a value and 1 for a base, and loses
# points for large rates (1 above 30, and 2 more above 100).
# @param candidate [Hash] A candidate for the tax.
# @param index [Integer] Position of the candidate in the candidate list.
# @return [Integer]
def self.calculate_score(candidate, index)
  rate = candidate['rate']
  score = index + 1

  if !rate.nil?
    score += 1
    score -= 2 if rate > 100
    score -= 1 if rate > 30
  end

  score += 4 if !candidate['value'].nil?
  score += 1 if !candidate['base'].nil?
  score
end
|
62
|
+
|
63
|
+
# Curates tax values based on simple rules to avoid improbable data.
# Rebuilds a clean hash from the raw one: trims trailing dots/whitespace from the code, carries the
# page id over, converts fractional rates (between 0 and 1, exclusive) to percentages, then runs
# the rate-swapping and sign-fixing helpers before copying base and value.
# Note: the helpers update `found_hash` in place, so the hash passed in is modified.
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash] hash with the keys 'code', 'page_id', 'rate', 'base' and 'value'.
def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
  reconstructed_hash = { 'code' => nil, 'page_id' => nil, 'rate' => nil, 'base' => nil, 'value' => nil }
  return reconstructed_hash if found_hash.nil?

  reconstructed_hash['code'] = found_hash['code']&.sub(%r{\s*\.*\s*$}, '')
  # Keep the page the tax was found on: the tax field is later built from the 'page_id' of this hash.
  reconstructed_hash['page_id'] = found_hash['page_id']

  rate = found_hash['rate']
  found_hash['rate'] = rate * 100 if rate && rate < 1 && rate.positive?

  found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  found_hash = decimate_rates_if_needed(found_hash)
  found_hash = fix_rate(found_hash)
  reconstructed_hash['rate'] = found_hash['rate']
  set_base_and_value(reconstructed_hash, found_hash)
end
|
85
|
+
|
86
|
+
# Exchanges the rate with the base, or failing that with the value, when the rate lies outside
# the allowed bounds and the other field is itself a valid percentage. The base is tried first.
# The hash is updated in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash] the same hash instance.
def self.swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  rate = found_hash['rate']
  return found_hash unless rate

  out_of_bounds = rate > max_rate_percentage || rate < min_rate_percentage
  return found_hash unless out_of_bounds

  swap_key = %w[base value].find do |key|
    valid_percentage?(found_hash[key], min_rate_percentage, max_rate_percentage)
  end
  if swap_key
    found_hash['rate'] = found_hash[swap_key]
    found_hash[swap_key] = rate
  end

  found_hash
end
|
101
|
+
|
102
|
+
# Replaces a set rate with its absolute value, since rates can't be negative.
# The hash is updated in place; a missing or nil rate is left untouched.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the same hash instance.
def self.fix_rate(found_hash)
  rate = found_hash['rate']
  return found_hash if rate.nil?

  found_hash['rate'] = rate.abs
  found_hash
end
|
108
|
+
|
109
|
+
# When the rate exceeds 100, exchanges it with the first smaller field: the base if it is set and
# lower than the rate, otherwise the value under the same condition. The hash is updated in place.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the same hash instance.
def self.decimate_rates_if_needed(found_hash)
  rate = found_hash['rate']
  return found_hash unless rate && rate > 100

  swap_key = %w[base value].find { |key| !found_hash[key].nil? && rate > found_hash[key] }
  if swap_key
    found_hash['rate'] = found_hash[swap_key]
    found_hash[swap_key] = rate
  end

  found_hash
end
|
122
|
+
|
123
|
+
# Copies the base and value into the reconstructed hash.
# When both are present and the base is smaller than the value, the two are exchanged
# (presumably because a tax amount should not exceed its taxable base — confirm with the
# extraction rules). A missing base or value is copied as nil; nil is never compared.
# @param reconstructed_hash [Hash] Hash being reconstructed with new values
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the reconstructed hash.
def self.set_base_and_value(reconstructed_hash, found_hash)
  base = found_hash['base']
  value = found_hash['value']

  # Only compare when both amounts exist: comparing a number with nil raises.
  base, value = value, base if !base.nil? && !value.nil? && base < value

  reconstructed_hash['base'] = base
  reconstructed_hash['value'] = value
  reconstructed_hash
end
|
139
|
+
|
140
|
+
# Extracts a single custom type of tax.
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
# The horizontal (same line) layout is tried first; the vertical layout is used when no value was
# found that way. The given list of names is not modified.
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Mindee::Parsing::Standard::TaxField, nil]
def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_rate_percentage = 100)
  # NOTE(review): this guard returns nil when ocr_result IS a Mindee::Parsing::Common::Ocr, while the
  # documented argument type is Ocr::Ocr — confirm whether the check is meant to be negated.
  return nil if ocr_result.is_a?(Mindee::Parsing::Common::Ocr) || tax_names.empty?

  # Sort a copy: sorting in place would reorder the caller's array and fail on a frozen one.
  sorted_names = tax_names.sort
  found_hash = pick_best(extract_horizontal_tax(ocr_result, sorted_names), sorted_names)
  # a tax is considered found horizontally if it has a value, otherwise it is vertical
  if found_hash.nil? || found_hash['value'].nil?
    found_hash = extract_vertical_tax(ocr_result, sorted_names, found_hash)
  end
  found_hash = curate_values(found_hash, min_rate_percentage, max_rate_percentage)

  return if found_hash.nil? || found_hash.empty?

  create_tax_field(found_hash)
end
|
163
|
+
|
164
|
+
# Builds a tax field out of a hash of extracted values.
# The page id handed to the field is the hash's 'page_id' entry, or nil when that key is absent.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Mindee::Parsing::Standard::TaxField]
def self.create_tax_field(found_hash)
  page_id = found_hash.fetch('page_id', nil)
  Mindee::Parsing::Standard::TaxField.new(found_hash, page_id)
end
|
173
|
+
|
174
|
+
# Reads the tax code and rate out of the first two capture groups and stores them in found_hash.
# Group 1 holds the rate and group 2 the code when the percentage comes first; the groups are
# reversed otherwise. Groups that did not match leave the hash untouched.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash] the same hash instance.
def self.extract_percentage_from_tax(matches, found_hash, percent_first)
  rate_group, code_group = percent_first ? [1, 2] : [2, 1]
  raw_code = matches[code_group]
  raw_rate = matches[rate_group]

  found_hash['code'] = raw_code.strip unless raw_code.nil?
  found_hash['rate'] = parse_amount(raw_rate.gsub('%', '')) unless raw_rate.nil?
  found_hash
end
|
189
|
+
|
190
|
+
# Reads the basis and value of a tax from capture groups 3 and 4, independent of the order.
# When only one of the two groups matched it is stored as the value; when both matched, group 3
# is the base and group 4 the value. Nothing is stored when neither matched.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] the same hash instance.
def self.extract_basis_and_value(matches, found_hash)
  first_amount = matches[3]
  second_amount = matches[4]

  if first_amount.nil?
    found_hash['value'] = parse_amount(second_amount) unless second_amount.nil?
  elsif second_amount.nil?
    found_hash['value'] = parse_amount(first_amount)
  else
    found_hash['base'] = parse_amount(first_amount)
    found_hash['value'] = parse_amount(second_amount)
  end

  found_hash
end
|
211
|
+
|
212
|
+
# Extracts tax information from a horizontal line.
# The returned hash always carries the page id; code, rate, base and value are added by the
# extraction helpers when the pattern matches the line.
# @param line [String] Line to be processed.
# @param pattern [Regexp] RegEx pattern to search the line with.
# @param page_id [Integer] ID of the page the line was read from.
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash]
def self.extract_tax_from_horizontal_line(line, pattern, page_id, percent_first)
  # Edge case for when the tax is split-up between two pages, we'll consider that
  # the answer belongs to the first one.
  found_hash = { 'page_id' => page_id }

  matches = line.match(pattern)
  return found_hash if matches.nil?

  with_rate = extract_percentage_from_tax(matches, found_hash, percent_first)
  extract_basis_and_value(matches, with_rate)
end
|
230
|
+
|
231
|
+
# rubocop:disable Metrics/CyclomaticComplexity
# rubocop:disable Metrics/PerceivedComplexity

# Scans every OCR line that mentions one of the tax names and collects tax candidates from it.
# Each such line is read twice: once from the tax name onwards (name before percentage), and once
# with the percentage-first pattern, starting at the percentage when the line contains a '%'.
# The returned list always starts with an all-nil placeholder candidate.
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] Processed OCR results.
# @param tax_names [Array<String>] Possible tax names candidates.
# @return [Array<Hash>]
def self.extract_horizontal_tax(ocr_result, tax_names)
  percent_first_pattern = %r{
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x
  name_first_pattern = %r{
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x
  candidates = [{ 'code' => nil, 'value' => nil, 'base' => nil, 'rate' => nil }]

  ocr_result.mvision_v1.pages.each_with_index do |page, page_id|
    page.all_lines.each do |line|
      # Drop decorative characters and currency symbols, then collapse dot leaders and repeated spaces.
      without_symbols = remove_currency_symbols(line.to_s.scrub.gsub(%r{[+(\[)\]¿?*_]}, ''))
      clean_line = without_symbols.gsub(%r{\.{2,}}, ' ').gsub(%r{ +}, ' ').strip

      name_start = match_index(clean_line, tax_names)
      next if name_start.nil?

      if clean_line.match(name_first_pattern)
        candidates << extract_tax_from_horizontal_line(clean_line[name_start..], name_first_pattern, page_id, false)
      end
      next unless clean_line.match(percent_first_pattern)

      percent_line = if clean_line.include?('%')
                       clean_line[clean_line.index(%r{\d*[.,]?\d* ?%})..]
                     else
                       clean_line
                     end
      candidates << extract_tax_from_horizontal_line(percent_line, percent_first_pattern, page_id, true)
    end
  end

  candidates
end
# rubocop:enable Metrics/CyclomaticComplexity
# rubocop:enable Metrics/PerceivedComplexity
|
276
|
+
|
277
|
+
# Collects the amounts found on a vertically reconstructed line and stores them in found_hash.
# A single amount becomes the value when the hash has no 'value' key yet; in every other case the
# first amount fills a missing rate and the second amount fills a missing value.
# @param line [Mindee::Parsing::Common::Ocr::OcrLine] Reconstructed line of OCR words.
# @param found_hash [Hash] Hash containing previously found values, if any.
# @return [Hash] the same hash instance.
def self.extract_vertical_tax_values(line, found_hash)
  amounts = []
  line.each do |reconstructed_word|
    amount = parse_amount(reconstructed_word.text)
    amounts << amount unless amount.nil?
  end

  single_value = amounts.length == 1 && !found_hash.key?('value')
  if single_value
    found_hash['value'] = amounts[0]
    return found_hash
  end

  found_hash['rate'] = amounts[0] if found_hash['rate'].nil?
  found_hash['value'] = amounts[1] if found_hash['value'].nil?
  found_hash
end
|
294
|
+
|
295
|
+
# Extracts tax data from vertically reconstructed rows.
# For every OCR word matching one of the tax names, the column under that word is rebuilt and its
# amounts are merged into the hash. The page id and code are taken from the first match only.
# @param ocr_result [Mindee::Parsing::Common::Ocr] OCR raw results
# @param tax_names [Array<String>] Array of possible names a tax can have
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @return [Hash]
def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
  result = found_hash.nil? ? { 'code' => nil, 'page_id' => nil } : found_hash

  ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
    page.all_words.each do |word|
      next if match_index(word.text, tax_names).nil?

      column = ocr_result.reconstruct_vertically(word.polygon, page_id)
      result['page_id'] = page_id if result['page_id'].nil?
      result['code'] = word.text.strip if result['code'].nil?
      result = extract_vertical_tax_values(column, result)
    end
  end

  result
end
|
314
|
+
|
315
|
+
# Every helper is private; extract_custom_tax is the only class method left public.
private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
                     :extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
                     :create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
                     :decimate_rates_if_needed, :set_base_and_value, :valid_candidate?,
                     :swap_rates_if_needed
|
320
|
+
end
|
321
|
+
end
|
322
|
+
end
|
@@ -77,5 +77,24 @@ module Mindee
|
|
77
77
|
coords = points.map(&:x)
|
78
78
|
MinMax.new(coords.min, coords.max)
|
79
79
|
end
|
80
|
+
|
81
|
+
# Checks whether a set of coordinates is below another on the page, with a slight margin for the lateral value.
# The candidate must not start higher than the anchor, and its horizontal extent must stay within
# the anchor's extent widened by the margins. Each margin is applied as a fraction of the
# corresponding anchor x coordinate.
# @param candidate [Array<Mindee::Geometry::Point>] Polygon to check
# @param anchor [Array<Mindee::Geometry::Point>] Reference polygon
# @param margin_left [Float] Margin tolerance on the left of the anchor
# @param margin_right [Float] Margin tolerance on the right of the anchor
# @return [Boolean]
def self.below?(candidate, anchor, margin_left, margin_right)
  return false if Geometry.get_min_max_y(candidate).min < Geometry.get_min_max_y(anchor).min

  candidate_x = Geometry.get_min_max_x(candidate)
  anchor_x = Geometry.get_min_max_x(anchor)

  left_limit = anchor_x.min - (anchor_x.min * margin_left)
  return false if candidate_x.min < left_limit

  right_limit = anchor_x.max + (anchor_x.max * margin_right)
  return false if candidate_x.max > right_limit

  true
end
|
80
99
|
end
|
81
100
|
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../input/sources'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
# Image Extraction Module.
|
7
|
+
module ImageExtraction
|
8
|
+
# Generic class for image extraction.
# Holds an in-memory copy of an extracted image, along with the page and element it comes from.
class ExtractedImage
  # Id of the page the image was extracted from.
  attr_reader :page_id

  # Id of the element on a given page.
  attr_reader :element_id

  # Buffer object of the file's content.
  attr_reader :buffer

  # Internal name for the file.
  attr_reader :internal_file_name

  # Initializes the ExtractedImage with a buffer and an internal file name.
  # The whole input stream is read into a private buffer. The internal name is built as
  # "<filename>_p<page_id>_<element_id>.<extension>", using 'jpg' for PDF sources.
  #
  # @param input_source [LocalInputSource] Local source for input.
  # @param page_id [Integer] ID of the page the element was found on.
  # @param element_id [Integer, nil] ID of the element in a page. Defaults to 0 when nil.
  def initialize(input_source, page_id, element_id)
    @buffer = StringIO.new(input_source.io_stream.read)
    @buffer.rewind
    @page_id = page_id
    @element_id = element_id.nil? ? 0 : element_id
    # File.extname keeps the leading dot: drop it so the name does not end in "..ext".
    extension = input_source.pdf? ? 'jpg' : File.extname(input_source.filename).delete('.')
    @internal_file_name = "#{input_source.filename}_p#{page_id}_#{@element_id}.#{extension}"
  end

  # Saves the document to a file.
  #
  # @param output_path [String] Path to save the file to.
  # @param file_format [String, nil] Optional MiniMagick-compatible format for the file. Inferred from file
  # extension if not provided.
  # @raise [RuntimeError] If an invalid path or filename is provided, or if the file could not be saved.
  def save_to_file(output_path, file_format = nil)
    # realdirpath accepts a last path component that does not exist yet, which is the normal
    # case for a destination file; realpath would raise for it.
    resolved_path = Pathname.new(output_path).realdirpath
    if file_format.nil?
      raise ArgumentError, 'Invalid file format.' if resolved_path.extname.delete('.').empty?

      file_format = resolved_path.extname.delete('.').upcase
    end
    @buffer.rewind
    image = MiniMagick::Image.read(@buffer)
    image.format file_format.downcase
    image.write resolved_path.to_s
    # NOTE(review): `logger` is not defined in this class; confirm it is provided elsewhere,
    # otherwise this call raises after the file was written and is reported as a save failure.
    logger.info("File saved successfully to '#{resolved_path}'.")
  rescue TypeError
    raise 'Invalid path/filename provided.'
  rescue StandardError
    raise "Could not save file #{Pathname.new(output_path).basename}."
  end

  # Return the file as a Mindee-compatible BufferInput source.
  #
  # @return [Mindee::Input::Source::BytesInputSource] A bytes source named after the internal file name.
  def as_source
    @buffer.rewind
    Mindee::Input::Source::BytesInputSource.new(@buffer.read, @internal_file_name)
  end
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,191 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'mini_magick'
|
4
|
+
require 'origami'
|
5
|
+
require 'stringio'
|
6
|
+
require 'tempfile'
|
7
|
+
require_relative '../../input/sources'
|
8
|
+
require_relative 'extracted_image'
|
9
|
+
|
10
|
+
module Mindee
|
11
|
+
# Image Extraction Module.
|
12
|
+
module ImageExtraction
|
13
|
+
# Converts an image into a PDF document and opens it with Origami.
#
# @param [StringIO] input_buffer Input buffer. Only supports JPEG.
# @return [Origami::PDF] A PdfDocument handle.
def attach_image_as_new_file(input_buffer)
  magick_image = MiniMagick::Image.read(input_buffer)
  # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
  # converted.
  magick_image.format('jpg')
  original_density = magick_image.resolution
  # No clue why, but the resolution needs to be reduced for the pdf, otherwise the resulting image shrinks.
  scale_factor = original_density[0].to_f / 4.166666
  magick_image.format('pdf', 0, { density: scale_factor.to_s })
  io_buffer = StringIO.new
  magick_image.write(io_buffer)
  # Go back to the start of the buffer so the PDF is read from its first byte.
  io_buffer.rewind
  Origami::PDF.read(io_buffer)
end
|
31
|
+
|
32
|
+
# Loads the requested page of a local input source and extracts one image per polygon from it.
#
# @param [Mindee::Input::Source::LocalInputSource] input_source
# @param [Integer] page_id ID of the Page to extract from.
# @param [Array<Array<Mindee::Geometry::Point>>, Array<Mindee::Geometry::Quadrangle>] polygons List of coordinates
# to extract.
# @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
def extract_multiple_images_from_source(input_source, page_id, polygons)
  page_stream = load_doc(input_source, page_id)
  page_stream.seek(0)

  extract_images_from_polygons(input_source, page_stream, page_id, polygons)
end
|
45
|
+
|
46
|
+
# Saves the PDF document to an in-memory stream and has the PDF processor parse a single page
# out of it.
#
# @param [Origami::PDF] pdf_doc Origami PDF handle.
# @param [Integer] page_id Page ID; page_id - 1 is the index handed to the processor.
def get_page(pdf_doc, page_id)
  serialized = StringIO.new
  pdf_doc.save(serialized)

  Mindee::PDF::PdfProcessor.parse(serialized, { page_indexes: [page_id - 1] })
end
|
60
|
+
|
61
|
+
# Extracts images from their positions on a file (as polygons).
# For each polygon the page is read again, cropped to the polygon's extent, converted to the
# source's file extension and wrapped in an ExtractedImage.
#
# @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
# @param [StringIO] pdf_stream Buffer of the PDF.
# @param [Integer] page_id Page ID.
# @param [Array<Mindee::Geometry::Point, Mindee::Geometry::Polygon, Mindee::Geometry::Quadrangle>] polygons
# @return [Array<Mindee::ImageExtraction::ExtractedImage>] Extracted Images.
def extract_images_from_polygons(input_source, pdf_stream, page_id, polygons)
  polygons.each_with_index.map do |polygon, element_id|
    box = normalize_polygon(polygon)
    page_content = read_page_content(pdf_stream)

    min_max_x = Geometry.get_min_max_x([box.top_left, box.bottom_right, box.top_right, box.bottom_left])
    min_max_y = Geometry.get_min_max_y([box.top_left, box.bottom_right, box.top_right, box.bottom_left])
    file_extension = determine_file_extension(input_source)
    cropped_image = crop_image(page_content, min_max_x, min_max_y)
    cropped_image.format(file_extension == 'pdf' ? 'jpg' : file_extension)

    buffer = StringIO.new
    write_image_to_buffer(cropped_image, buffer)
    file_name = "#{input_source.filename}_page#{page_id}-#{element_id}.#{file_extension}"

    create_extracted_image(buffer, file_name, page_id, element_id)
  end
end
|
104
|
+
|
105
|
+
# Turns a Polygon into its bounding box; any other input is returned unchanged.
#
# @param [Array<Point>, Mindee::Geometry::Polygon] polygon
def normalize_polygon(polygon)
  return polygon unless polygon.is_a?(Mindee::Geometry::Polygon)

  Mindee::Geometry.get_bounding_box(polygon)
end
|
115
|
+
|
116
|
+
# Rewinds the stream and loads its content into a MiniMagick Image.
#
# @param [StringIO] pdf_stream Buffer containing the PDF
# @return [MiniMagick::Image] a valid MiniMagick image handle.
def read_page_content(pdf_stream)
  MiniMagick::Image.read(pdf_stream.tap(&:rewind))
end
|
124
|
+
|
125
|
+
# Converts the image to jpg and crops it in place to the given bounding box.
# The min/max values are multiplied by the image's width and height to build the crop geometry.
#
# @param [MiniMagick::Image] image Input Image.
# @param [Mindee::Geometry::MinMax] min_max_x minimum & maximum values for the x coordinates.
# @param [Mindee::Geometry::MinMax] min_max_y minimum & maximum values for the y coordinates.
# @return [MiniMagick::Image] the same image handle.
def crop_image(image, min_max_x, min_max_y)
  full_width = image[:width].to_i
  full_height = image[:height].to_i
  image.format('jpg')

  crop_width = (min_max_x.max - min_max_x.min) * full_width
  crop_height = (min_max_y.max - min_max_y.min) * full_height
  offset_x = min_max_x.min * full_width
  offset_y = min_max_y.min * full_height
  image.crop("#{crop_width}x#{crop_height}+#{offset_x}+#{offset_y}")

  image
end
|
141
|
+
|
142
|
+
# Writes the image's content into the given buffer.
#
# @param [MiniMagick::Image] image a valid MiniMagick image.
# @param [StringIO] buffer Destination buffer.
def write_image_to_buffer(image, buffer)
  image.write(buffer)
end
|
149
|
+
|
150
|
+
# Retrieves the file extension from the main file to apply it to the extracted images. Note: coerces pdf as jpg.
# The extension is lowercased and returned without its leading dot; a filename without any
# extension yields nil.
#
# @param [Mindee::Input::Source::LocalInputSource] input_source Local input source.
# @return [String, nil] The file extension.
def determine_file_extension(input_source)
  return 'jpg' if input_source.pdf? || input_source.filename.downcase.end_with?('pdf')

  File.extname(input_source.filename).strip.downcase[1..]
end
|
161
|
+
|
162
|
+
# Wraps the buffer's content in a bytes input source and builds an ExtractedImage from it.
#
# @param [StringIO] buffer Buffer containing the image.
# @param [String] file_name Name for the file.
# @param [Object] page_id ID of the page the file was generated from.
# @param [Object] element_id ID of the element of a given page.
# @return [ExtractedImage]
def create_extracted_image(buffer, file_name, page_id, element_id)
  buffer.rewind
  image_source = Mindee::Input::Source::BytesInputSource.new(buffer.read, file_name)
  ExtractedImage.new(image_source, page_id, element_id)
end
|
176
|
+
|
177
|
+
# Loads a single page from an image file or a pdf document.
# The input stream is rewound first. PDF inputs are opened with Origami and reduced to the
# requested page; any other input is returned as its own stream.
#
# @param input_file [LocalInputSource] Local input.
# @param [Integer] page_id Page ID.
# @return [Object] The result of get_page for PDFs, the input's io_stream otherwise.
def load_doc(input_file, page_id)
  input_file.io_stream.rewind
  return input_file.io_stream unless input_file.pdf?

  get_page(Origami::PDF.read(input_file.io_stream), page_id)
end
|
190
|
+
end
|
191
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
# Image Extraction Module.
|
5
|
+
module ImageExtraction
|
6
|
+
# Extracts individual receipts from multi-receipts documents.
# Every page of the input is visited, and the bounding box of each receipt predicted on that
# page is extracted as an image. Pages are passed on to the extraction with a 1-based ID.
#
# @param input_source [LocalInputSource] Local Input Source to extract sub-receipts from.
# @param inference [Inference] Results of the inference.
# @return [Array<ExtractedImage>] Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
# @raise [RuntimeError] When the document-level prediction holds no receipts list.
def extract_receipts(input_source, inference)
  # NOTE(review): an empty receipts list is truthy in Ruby, so only a nil/false list raises here.
  raise 'No possible receipts candidates found for MultiReceipts extraction.' unless inference.prediction.receipts

  (0...input_source.count_pdf_pages).flat_map do |page_index|
    receipt_positions = inference.pages[page_index].prediction.receipts.map(&:bounding_box)
    extract_multiple_images_from_source(input_source, page_index + 1, receipt_positions)
  end
end
|
25
|
+
end
|
26
|
+
end
|
data/lib/mindee/input/sources.rb
CHANGED
@@ -118,6 +118,14 @@ module Mindee
|
|
118
118
|
@io_stream.close if close
|
119
119
|
['document', data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
|
120
120
|
end
|
121
|
+
|
122
|
+
# Counts the pages of the input. Anything that is not a PDF counts as a single page.
# For PDFs, the stream is moved back to its start before being opened by the PDF processor.
# @return [Integer]
def count_pdf_pages
  return 1 unless pdf?

  @io_stream.seek(0)
  Mindee::PDF::PdfProcessor.open_pdf(@io_stream).pages.size
end
|
121
129
|
end
|
122
130
|
|
123
131
|
# Load a document from a path.
|