mindee 3.12.0 → 3.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/CHANGELOG.md +18 -0
- data/README.md +23 -23
- data/Rakefile +5 -0
- data/docs/bank_account_details_v2.md +5 -1
- data/docs/bank_check_v1.md +6 -2
- data/docs/bank_statement_fr_v1.md +3 -0
- data/docs/barcode_reader_v1.md +5 -1
- data/docs/bill_of_lading_v1.md +202 -0
- data/docs/carte_grise_v1.md +5 -1
- data/docs/carte_vitale_v1.md +5 -1
- data/docs/code_samples/bill_of_lading_v1_async.txt +19 -0
- data/docs/code_samples/energy_bill_fra_v1_async.txt +19 -0
- data/docs/code_samples/invoices_v4_async.txt +19 -0
- data/docs/code_samples/nutrition_facts_v1_async.txt +19 -0
- data/docs/code_samples/payslip_fra_v2_async.txt +19 -0
- data/docs/cropper_v1.md +6 -2
- data/docs/custom_v1.md +5 -3
- data/docs/energy_bill_fra_v1.md +249 -0
- data/docs/eu_driver_license_v1.md +6 -2
- data/docs/expense_receipts_v5.md +26 -1
- data/docs/financial_document_v1.md +29 -1
- data/docs/generated_v1.md +3 -0
- data/docs/getting_started.md +3 -0
- data/docs/idcard_fr_v2.md +15 -2
- data/docs/international_id_v2.md +13 -1
- data/docs/invoice_splitter_v1.md +16 -13
- data/docs/invoices_v4.md +54 -21
- data/docs/license_plates_v1.md +5 -1
- data/docs/multi_receipts_detector_v1.md +5 -1
- data/docs/nutrition_facts_v1.md +295 -0
- data/docs/passport_v1.md +5 -1
- data/docs/payslip_fra_v2.md +218 -0
- data/docs/proof_of_address_v1.md +5 -1
- data/docs/resume_v1.md +24 -1
- data/docs/us_driver_license_v1.md +6 -2
- data/docs/us_healthcare_cards_v1.md +5 -1
- data/docs/us_mail_v2.md +5 -1
- data/docs/us_w9_v1.md +6 -2
- data/examples/auto_invoice_splitter_extraction.rb +43 -0
- data/lib/mindee/client.rb +20 -8
- data/lib/mindee/{image_extraction → extraction}/common/image_extractor.rb +2 -4
- data/lib/mindee/{image_extraction → extraction}/common.rb +1 -0
- data/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +55 -0
- data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +111 -0
- data/lib/mindee/extraction/pdf_extractor.rb +4 -0
- data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +322 -0
- data/lib/mindee/extraction/tax_extractor.rb +1 -320
- data/lib/mindee/extraction.rb +3 -0
- data/lib/mindee/http/endpoint.rb +18 -6
- data/lib/mindee/parsing/common/api_response.rb +1 -1
- data/lib/mindee/parsing/common/document.rb +31 -1
- data/lib/mindee/parsing/common/extras/cropper_extra.rb +29 -0
- data/lib/mindee/parsing/common/extras/extras.rb +50 -0
- data/lib/mindee/parsing/common/extras/full_text_ocr_extra.rb +32 -0
- data/lib/mindee/parsing/common/extras.rb +5 -0
- data/lib/mindee/parsing/common/page.rb +5 -0
- data/lib/mindee/parsing/standard/base_field.rb +1 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1.rb +39 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier.rb +52 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier_item.rb +95 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_consignee.rb +58 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_document.rb +136 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_notify_party.rb +58 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_page.rb +32 -0
- data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_shipper.rb +58 -0
- data/lib/mindee/product/financial_document/financial_document_v1_line_item.rb +15 -1
- data/lib/mindee/product/fr/bank_account_details/bank_account_details_v2_bban.rb +4 -15
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1.rb +41 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_document.rb +235 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_consumer.rb +48 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_supplier.rb +48 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_usage.rb +97 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_meter_detail.rb +54 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_page.rb +34 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_subscription.rb +97 -0
- data/lib/mindee/product/fr/energy_bill/energy_bill_v1_taxes_and_contribution.rb +97 -0
- data/lib/mindee/product/fr/payslip/payslip_v2.rb +41 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_bank_account_detail.rb +54 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_document.rb +128 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_employee.rb +78 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_employer.rb +78 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_employment.rb +72 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_page.rb +34 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_pay_detail.rb +100 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_pay_period.rb +66 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_pto.rb +56 -0
- data/lib/mindee/product/fr/payslip/payslip_v2_salary_detail.rb +81 -0
- data/lib/mindee/product/invoice/invoice_v4_line_item.rb +15 -1
- data/lib/mindee/product/invoice_splitter/invoice_splitter_v1_document.rb +1 -1
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1.rb +39 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_added_sugar.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_calorie.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_cholesterol.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_dietary_fiber.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_document.rb +173 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_nutrient.rb +87 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_page.rb +32 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_protein.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_saturated_fat.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_serving_size.rb +46 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_sodium.rb +58 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_carbohydrate.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_fat.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_sugar.rb +52 -0
- data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_trans_fat.rb +52 -0
- data/lib/mindee/product/receipt/receipt_v5_line_item.rb +11 -1
- data/lib/mindee/product/resume/resume_v1_certificate.rb +11 -1
- data/lib/mindee/product/resume/resume_v1_education.rb +14 -1
- data/lib/mindee/product/resume/resume_v1_language.rb +9 -1
- data/lib/mindee/product/resume/resume_v1_professional_experience.rb +15 -1
- data/lib/mindee/product/resume/resume_v1_social_networks_url.rb +9 -1
- data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_copay.rb +9 -1
- data/lib/mindee/product/us/us_mail/us_mail_v2_recipient_address.rb +14 -1
- data/lib/mindee/product/us/us_mail/us_mail_v2_sender_address.rb +5 -17
- data/lib/mindee/product.rb +5 -1
- data/lib/mindee/version.rb +1 -1
- metadata +70 -9
- data/lib/mindee/image_extraction.rb +0 -4
- /data/lib/mindee/{image_extraction → extraction}/common/extracted_image.rb +0 -0
- /data/lib/mindee/{image_extraction → extraction}/multi_receipts_extractor/multi_receipts_extractor.rb +0 -0
- /data/lib/mindee/{image_extraction → extraction}/multi_receipts_extractor.rb +0 -0
- /data/lib/mindee/extraction/{ocr_extractor.rb → tax_extractor/ocr_extractor.rb} +0 -0
@@ -1,322 +1,3 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative 'ocr_extractor'
|
4
|
-
|
5
|
-
module Mindee
|
6
|
-
module Extraction
|
7
|
-
# Tax extractor class
|
8
|
-
class TaxExtractor < OcrExtractor
|
9
|
-
# Extracts the most relevant candidate.
|
10
|
-
# @param candidates [Array<Hash>] a candidate for the tax.
|
11
|
-
# @param tax_names [Array<String>] list of all possible names the tax can have.
|
12
|
-
# @return [Hash, nil]
|
13
|
-
def self.pick_best(candidates, tax_names)
|
14
|
-
return candidates[0] if candidates.size == 1
|
15
|
-
return nil if candidates.empty?
|
16
|
-
|
17
|
-
picked = 0
|
18
|
-
picked_score = 0
|
19
|
-
|
20
|
-
candidates.each_with_index do |candidate, i|
|
21
|
-
next unless valid_candidate?(candidate, tax_names)
|
22
|
-
|
23
|
-
sum_fields_score = calculate_score(candidate, i)
|
24
|
-
|
25
|
-
if picked_score < sum_fields_score
|
26
|
-
picked_score = sum_fields_score
|
27
|
-
picked = i
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
candidates[picked]
|
32
|
-
end
|
33
|
-
|
34
|
-
# Checks whether a tax code has been properly read. Shouldn't trigger except in case of very specific regex breaks
|
35
|
-
# due to unsupported diacritics.
|
36
|
-
# @param candidate [Hash] A candidate for the tax.
|
37
|
-
# @param tax_names [Array<String>] list of all possible names the tax can have.
|
38
|
-
# @return [Boolean]
|
39
|
-
def self.valid_candidate?(candidate, tax_names)
|
40
|
-
return false if tax_names.empty? || candidate.nil? || candidate['code'].nil?
|
41
|
-
|
42
|
-
tax_names.each do |tax_name|
|
43
|
-
return true if remove_accents(tax_name.downcase) == remove_accents(candidate['code'].downcase)
|
44
|
-
end
|
45
|
-
false
|
46
|
-
end
|
47
|
-
|
48
|
-
# [Experimental] computes the score of a valid candidate for a tax.
|
49
|
-
# @param candidate [Hash] A candidate for the tax.
|
50
|
-
# @param index [Integer]
|
51
|
-
def self.calculate_score(candidate, index)
|
52
|
-
score = index + 1
|
53
|
-
unless candidate['rate'].nil?
|
54
|
-
score += 1
|
55
|
-
score -= 2 if candidate['rate'] > 100
|
56
|
-
score -= 1 if candidate['rate'] > 30
|
57
|
-
end
|
58
|
-
score += 4 unless candidate['value'].nil?
|
59
|
-
score += 1 unless candidate['base'].nil?
|
60
|
-
score
|
61
|
-
end
|
62
|
-
|
63
|
-
# Curates tax values based on simple rules to avoid improbable data
|
64
|
-
# @param found_hash [Hash] Hash of currently retrieved values
|
65
|
-
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
|
66
|
-
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
|
67
|
-
# @return [Hash]
|
68
|
-
def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
|
69
|
-
reconstructed_hash = { 'code' => nil, 'page_id' => nil, 'rate' => nil, 'base' => nil, 'value' => nil }
|
70
|
-
return reconstructed_hash if found_hash.nil?
|
71
|
-
|
72
|
-
reconstructed_hash['code'] =
|
73
|
-
found_hash['code'].nil? ? found_hash['code'] : found_hash['code'].sub(%r{\s*\.*\s*$}, '')
|
74
|
-
|
75
|
-
if found_hash['rate'] && found_hash['rate'] < 1 && (found_hash['rate']).positive?
|
76
|
-
found_hash['rate'] =
|
77
|
-
found_hash['rate'] * 100
|
78
|
-
end
|
79
|
-
found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
|
80
|
-
found_hash = decimate_rates_if_needed(found_hash)
|
81
|
-
found_hash = fix_rate(found_hash)
|
82
|
-
reconstructed_hash['rate'] = found_hash['rate']
|
83
|
-
set_base_and_value(reconstructed_hash, found_hash)
|
84
|
-
end
|
85
|
-
|
86
|
-
# Swaps the rate with base or value if rate is out of bounds
|
87
|
-
# @param found_hash [Hash] Hash of currently retrieved values
|
88
|
-
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
|
89
|
-
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
|
90
|
-
# @return [Hash]
|
91
|
-
def self.swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
|
92
|
-
if found_hash['rate'] && (found_hash['rate'] > max_rate_percentage || found_hash['rate'] < min_rate_percentage)
|
93
|
-
if valid_percentage?(found_hash['base'], min_rate_percentage, max_rate_percentage)
|
94
|
-
found_hash['rate'], found_hash['base'] = found_hash['base'], found_hash['rate']
|
95
|
-
elsif valid_percentage?(found_hash['value'], min_rate_percentage, max_rate_percentage)
|
96
|
-
found_hash['rate'], found_hash['value'] = found_hash['value'], found_hash['rate']
|
97
|
-
end
|
98
|
-
end
|
99
|
-
found_hash
|
100
|
-
end
|
101
|
-
|
102
|
-
# Rates can't be negative if set.
|
103
|
-
# @param found_hash [Hash] Hash of currently retrieved values
|
104
|
-
def self.fix_rate(found_hash)
|
105
|
-
found_hash['rate'] = found_hash['rate'].abs unless found_hash['rate'].nil?
|
106
|
-
found_hash
|
107
|
-
end
|
108
|
-
|
109
|
-
# Swaps the rate with base or value if rate is out of bounds
|
110
|
-
# @param found_hash [Hash] Hash of currently retrieved values
|
111
|
-
# @return [Hash]
|
112
|
-
def self.decimate_rates_if_needed(found_hash)
|
113
|
-
if found_hash['rate'] && found_hash['rate'] > 100
|
114
|
-
if !found_hash['base'].nil? && found_hash['rate'] > found_hash['base']
|
115
|
-
found_hash['rate'], found_hash['base'] = found_hash['base'], found_hash['rate']
|
116
|
-
elsif !found_hash['value'].nil? && found_hash['rate'] > found_hash['value']
|
117
|
-
found_hash['rate'], found_hash['value'] = found_hash['value'], found_hash['rate']
|
118
|
-
end
|
119
|
-
end
|
120
|
-
found_hash
|
121
|
-
end
|
122
|
-
|
123
|
-
# Sets the base and value in the reconstructed hash based on certain conditions
|
124
|
-
# @param reconstructed_hash [Hash] Hash being reconstructed with new values
|
125
|
-
# @param found_hash [Hash] Hash of currently retrieved values
|
126
|
-
# @return [Hash]
|
127
|
-
def self.set_base_and_value(reconstructed_hash, found_hash)
|
128
|
-
if found_hash['base'].nil?
|
129
|
-
reconstructed_hash['base'] = found_hash['base']
|
130
|
-
reconstructed_hash['value'] = found_hash['value']
|
131
|
-
elsif found_hash['value'].nil? && found_hash['base'] < found_hash['value']
|
132
|
-
reconstructed_hash['base'] = found_hash['value']
|
133
|
-
reconstructed_hash['value'] = found_hash['base']
|
134
|
-
else
|
135
|
-
reconstructed_hash['value'] = found_hash['value']
|
136
|
-
end
|
137
|
-
reconstructed_hash
|
138
|
-
end
|
139
|
-
|
140
|
-
# Extracts a single custom type of tax.
|
141
|
-
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
|
142
|
-
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
|
143
|
-
# @param tax_names [Array<String>] list of all possible names the tax can have.
|
144
|
-
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
|
145
|
-
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
|
146
|
-
# @return [Mindee::Parsing::Standard::TaxField, nil]
|
147
|
-
def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_rate_percentage = 100)
|
148
|
-
return nil if ocr_result.is_a?(Mindee::Parsing::Common::Ocr) || tax_names.empty?
|
149
|
-
|
150
|
-
tax_names.sort!
|
151
|
-
found_hash = pick_best(extract_horizontal_tax(ocr_result, tax_names), tax_names)
|
152
|
-
# a tax is considered found horizontally if it has a value, otherwise it is vertical
|
153
|
-
if found_hash.nil? || found_hash['value'].nil?
|
154
|
-
found_hash = extract_vertical_tax(ocr_result, tax_names,
|
155
|
-
found_hash)
|
156
|
-
end
|
157
|
-
found_hash = curate_values(found_hash, min_rate_percentage, max_rate_percentage)
|
158
|
-
|
159
|
-
return if found_hash.nil? || found_hash.empty?
|
160
|
-
|
161
|
-
create_tax_field(found_hash)
|
162
|
-
end
|
163
|
-
|
164
|
-
# Creates a tax field from a given hash.
|
165
|
-
# @param found_hash [Hash] Hash of currently retrieved values
|
166
|
-
# @return [Mindee::Parsing::Standard::TaxField]
|
167
|
-
def self.create_tax_field(found_hash)
|
168
|
-
Mindee::Parsing::Standard::TaxField.new(
|
169
|
-
found_hash,
|
170
|
-
found_hash.key?('page_id') ? found_hash['page_id'] : nil
|
171
|
-
)
|
172
|
-
end
|
173
|
-
|
174
|
-
# Extracts the rate and code, if found, from matches into the found_hash.
|
175
|
-
# @param matches [MatchData] RegEx matches of the values for taxes
|
176
|
-
# @param found_hash [Hash] Hash of currently retrieved values
|
177
|
-
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
|
178
|
-
# @return [Hash]
|
179
|
-
def self.extract_percentage_from_tax(matches, found_hash, percent_first)
|
180
|
-
if percent_first
|
181
|
-
found_hash['code'] = matches[2].strip unless matches[2].nil?
|
182
|
-
found_hash['rate'] = parse_amount(matches[1].gsub('%', '')) unless matches[1].nil?
|
183
|
-
else
|
184
|
-
found_hash['code'] = matches[1].strip unless matches[1].nil?
|
185
|
-
found_hash['rate'] = parse_amount(matches[2].gsub('%', '')) unless matches[2].nil?
|
186
|
-
end
|
187
|
-
found_hash
|
188
|
-
end
|
189
|
-
|
190
|
-
# rubocop:disable Metrics/CyclomaticComplexity
|
191
|
-
# rubocop:disable Metrics/PerceivedComplexity
|
192
|
-
|
193
|
-
# Extracts the basis and value of a tax from regex matches, independent of the order.
|
194
|
-
# @param matches [MatchData] RegEx matches of the values for taxes
|
195
|
-
# @param found_hash [Hash] Hash of currently retrieved values
|
196
|
-
# @return [Hash]
|
197
|
-
def self.extract_basis_and_value(matches, found_hash)
|
198
|
-
if matches[4].nil? && !matches[3].nil?
|
199
|
-
found_hash['value'] = parse_amount(matches[3]) unless matches[3].nil?
|
200
|
-
elsif matches[3].nil? && !matches[4].nil?
|
201
|
-
found_hash['value'] = parse_amount(matches[4]) unless matches[4].nil?
|
202
|
-
elsif !matches[3].nil? && !matches[4].nil?
|
203
|
-
found_hash['base'] = parse_amount(matches[3]) unless matches[3].nil?
|
204
|
-
found_hash['value'] = parse_amount(matches[4]) unless matches[4].nil?
|
205
|
-
end
|
206
|
-
found_hash
|
207
|
-
end
|
208
|
-
|
209
|
-
# rubocop:enable Metrics/CyclomaticComplexity
|
210
|
-
# rubocop:enable Metrics/PerceivedComplexity
|
211
|
-
|
212
|
-
# Extracts tax information from a horizontal line.
|
213
|
-
# @param line [String] Line to be processed.
|
214
|
-
# @param pattern [Regexp] RegEx pattern to search the line with.
|
215
|
-
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
|
216
|
-
# @return [Hash]
|
217
|
-
def self.extract_tax_from_horizontal_line(line, pattern, page_id, percent_first)
|
218
|
-
found_hash = {}
|
219
|
-
|
220
|
-
matches = line.match(pattern)
|
221
|
-
|
222
|
-
# Edge case for when the tax is split-up between two pages, we'll consider that
|
223
|
-
# the answer belongs to the first one.
|
224
|
-
found_hash['page_id'] = page_id unless found_hash.key?('page_id')
|
225
|
-
return found_hash if matches.nil?
|
226
|
-
|
227
|
-
found_hash = extract_percentage_from_tax(matches, found_hash, percent_first)
|
228
|
-
extract_basis_and_value(matches, found_hash)
|
229
|
-
end
|
230
|
-
|
231
|
-
# rubocop:disable Metrics/CyclomaticComplexity
|
232
|
-
# rubocop:disable Metrics/PerceivedComplexity
|
233
|
-
|
234
|
-
# Processes a horizontal line for tax extraction. Returns a hash of collected values.
|
235
|
-
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] Processed OCR results.
|
236
|
-
# @param tax_names [Array<String>] Possible tax names candidates.
|
237
|
-
# @return [Array<Hash>]
|
238
|
-
def self.extract_horizontal_tax(ocr_result, tax_names)
|
239
|
-
candidates = [{ 'code' => nil, 'value' => nil, 'base' => nil, 'rate' => nil }]
|
240
|
-
linear_pattern_percent_first = %r{
|
241
|
-
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
|
242
|
-
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
|
243
|
-
((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
|
244
|
-
((?:\s*-\s*)?(\d*[.,])*\d{2,})?
|
245
|
-
}x
|
246
|
-
linear_pattern_percent_second = %r{
|
247
|
-
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
|
248
|
-
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
|
249
|
-
((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
|
250
|
-
((?:\s*-\s*)?(\d*[.,])*\d{2,})?
|
251
|
-
}x
|
252
|
-
ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
|
253
|
-
page.all_lines.each do |line|
|
254
|
-
clean_line = remove_currency_symbols(line.to_s.scrub.gsub(%r{[+(\[)\]¿?*_]}, '')).gsub(%r{\.{2,}}, ' ')
|
255
|
-
.gsub(%r{ +}, ' ').strip
|
256
|
-
|
257
|
-
next if match_index(clean_line, tax_names).nil?
|
258
|
-
|
259
|
-
unless clean_line.match(linear_pattern_percent_second).nil?
|
260
|
-
candidates.append(extract_tax_from_horizontal_line(clean_line[match_index(clean_line, tax_names)..],
|
261
|
-
linear_pattern_percent_second, page_id, false))
|
262
|
-
end
|
263
|
-
if clean_line.include?('%') && !clean_line.match(linear_pattern_percent_first).nil?
|
264
|
-
candidates.append(extract_tax_from_horizontal_line(clean_line[clean_line.index(%r{\d*[.,]?\d* ?%})..],
|
265
|
-
linear_pattern_percent_first, page_id, true))
|
266
|
-
elsif !clean_line.match(linear_pattern_percent_first).nil?
|
267
|
-
candidates.append(extract_tax_from_horizontal_line(clean_line,
|
268
|
-
linear_pattern_percent_first, page_id, true))
|
269
|
-
end
|
270
|
-
end
|
271
|
-
end
|
272
|
-
candidates
|
273
|
-
end
|
274
|
-
# rubocop:enable Metrics/CyclomaticComplexity
|
275
|
-
# rubocop:enable Metrics/PerceivedComplexity
|
276
|
-
|
277
|
-
# Processes a vertical reconstructed line for tax extraction. Returns a hash of collected values.
|
278
|
-
# @param line [Mindee::Parsing::Common::Ocr::OcrLine] Processed OCR results.
|
279
|
-
# @param found_hash [Hash] Hash containing previously found values, if any.
|
280
|
-
# @return [Hash]
|
281
|
-
def self.extract_vertical_tax_values(line, found_hash)
|
282
|
-
amounts = []
|
283
|
-
line.each do |reconstructed_word|
|
284
|
-
amounts.push(parse_amount(reconstructed_word.text)) unless parse_amount(reconstructed_word.text).nil?
|
285
|
-
end
|
286
|
-
if amounts.length == 1 && !found_hash.key?('value')
|
287
|
-
found_hash['value'] = amounts[0]
|
288
|
-
else
|
289
|
-
found_hash['rate'] = amounts[0] if found_hash['rate'].nil?
|
290
|
-
found_hash['value'] = amounts[1] if found_hash['value'].nil?
|
291
|
-
end
|
292
|
-
found_hash
|
293
|
-
end
|
294
|
-
|
295
|
-
# Extracts tax data from a vertical reconstructed row.
|
296
|
-
# @param ocr_result [Mindee::Parsing::Common::Ocr] OCR raw results
|
297
|
-
# @param tax_names [Array<String>] Array of possible names a tax can have
|
298
|
-
# @param found_hash [Hash] Hash of currently retrieved values
|
299
|
-
def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
|
300
|
-
found_hash = { 'code' => nil, 'page_id' => nil } if found_hash.nil?
|
301
|
-
|
302
|
-
ocr_result.mvision_v1.pages.each_with_index do |page, page_id|
|
303
|
-
page.all_words.each do |word|
|
304
|
-
next if match_index(word.text, tax_names).nil?
|
305
|
-
|
306
|
-
reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id)
|
307
|
-
found_hash['page_id'] = page_id if found_hash['page_id'].nil?
|
308
|
-
found_hash['code'] = word.text.strip if found_hash['code'].nil?
|
309
|
-
found_hash = extract_vertical_tax_values(reconstructed_line, found_hash)
|
310
|
-
end
|
311
|
-
end
|
312
|
-
found_hash
|
313
|
-
end
|
314
|
-
|
315
|
-
private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
|
316
|
-
:extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
|
317
|
-
:create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
|
318
|
-
:decimate_rates_if_needed, :extract_basis_and_value, :set_base_and_value, :valid_candidate?,
|
319
|
-
:swap_rates_if_needed
|
320
|
-
end
|
321
|
-
end
|
322
|
-
end
|
3
|
+
require_relative 'tax_extractor/tax_extractor'
|
data/lib/mindee/extraction.rb
CHANGED
data/lib/mindee/http/endpoint.rb
CHANGED
@@ -48,12 +48,19 @@ module Mindee
|
|
48
48
|
# Call the prediction API.
|
49
49
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
50
50
|
# @param all_words [Boolean] Whether the full word extraction needs to be performed
|
51
|
+
# @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs
|
51
52
|
# @param close_file [Boolean] Whether the file will be closed after reading
|
52
53
|
# @param cropper [Boolean] Whether a cropping operation will be applied
|
53
54
|
# @return [Array]
|
54
|
-
def predict(input_source, all_words, close_file, cropper)
|
55
|
+
def predict(input_source, all_words, full_text, close_file, cropper)
|
55
56
|
check_api_key
|
56
|
-
response = predict_req_post(
|
57
|
+
response = predict_req_post(
|
58
|
+
input_source,
|
59
|
+
all_words: all_words,
|
60
|
+
full_text: full_text,
|
61
|
+
close_file: close_file,
|
62
|
+
cropper: cropper
|
63
|
+
)
|
57
64
|
hashed_response = JSON.parse(response.body, object_class: Hash)
|
58
65
|
return [hashed_response, response.body] if ResponseValidation.valid_sync_response?(response)
|
59
66
|
|
@@ -65,12 +72,13 @@ module Mindee
|
|
65
72
|
# Call the prediction API.
|
66
73
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
67
74
|
# @param all_words [Boolean] Whether the full word extraction needs to be performed
|
75
|
+
# @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
|
68
76
|
# @param close_file [Boolean] Whether the file will be closed after reading
|
69
77
|
# @param cropper [Boolean] Whether a cropping operation will be applied
|
70
78
|
# @return [Array]
|
71
|
-
def predict_async(input_source, all_words, close_file, cropper)
|
79
|
+
def predict_async(input_source, all_words, full_text, close_file, cropper)
|
72
80
|
check_api_key
|
73
|
-
response = document_queue_req_get(input_source, all_words, close_file, cropper)
|
81
|
+
response = document_queue_req_get(input_source, all_words, full_text, close_file, cropper)
|
74
82
|
hashed_response = JSON.parse(response.body, object_class: Hash)
|
75
83
|
return [hashed_response, response.body] if ResponseValidation.valid_async_response?(response)
|
76
84
|
|
@@ -97,14 +105,16 @@ module Mindee
|
|
97
105
|
|
98
106
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
99
107
|
# @param all_words [Boolean] Whether the full word extraction needs to be performed
|
108
|
+
# @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
|
100
109
|
# @param close_file [Boolean] Whether the file will be closed after reading
|
101
110
|
# @param cropper [Boolean] Whether a cropping operation will be applied
|
102
111
|
# @return [Net::HTTPResponse, nil]
|
103
|
-
def predict_req_post(input_source, all_words: false, close_file: true, cropper: false)
|
112
|
+
def predict_req_post(input_source, all_words: false, full_text: false, close_file: true, cropper: false)
|
104
113
|
uri = URI("#{@url_root}/predict")
|
105
114
|
|
106
115
|
params = {}
|
107
116
|
params[:cropper] = 'true' if cropper
|
117
|
+
params[:full_text_ocr] = 'true' if full_text
|
108
118
|
uri.query = URI.encode_www_form(params)
|
109
119
|
|
110
120
|
headers = {
|
@@ -129,14 +139,16 @@ module Mindee
|
|
129
139
|
|
130
140
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
131
141
|
# @param all_words [Boolean] Whether the full word extraction needs to be performed
|
142
|
+
# @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
|
132
143
|
# @param close_file [Boolean] Whether the file will be closed after reading
|
133
144
|
# @param cropper [Boolean] Whether a cropping operation will be applied
|
134
145
|
# @return [Net::HTTPResponse, nil]
|
135
|
-
def document_queue_req_get(input_source, all_words, close_file, cropper)
|
146
|
+
def document_queue_req_get(input_source, all_words, full_text, close_file, cropper)
|
136
147
|
uri = URI("#{@url_root}/predict_async")
|
137
148
|
|
138
149
|
params = {}
|
139
150
|
params[:cropper] = 'true' if cropper
|
151
|
+
params[:full_text_ocr] = 'true' if full_text
|
140
152
|
uri.query = URI.encode_www_form(params)
|
141
153
|
|
142
154
|
headers = {
|
@@ -103,7 +103,7 @@ module Mindee
|
|
103
103
|
# @return [String]
|
104
104
|
attr_reader :raw_http
|
105
105
|
|
106
|
-
# @param product_class [
|
106
|
+
# @param product_class [Mindee::Inference]
|
107
107
|
# @param http_response [Hash]
|
108
108
|
# @param raw_http [String]
|
109
109
|
def initialize(product_class, http_response, raw_http)
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative 'inference'
|
4
|
+
require_relative 'extras'
|
4
5
|
|
5
6
|
module Mindee
|
6
7
|
module Parsing
|
@@ -13,6 +14,8 @@ module Mindee
|
|
13
14
|
attr_reader :name
|
14
15
|
# @return [String] Mindee ID of the document
|
15
16
|
attr_reader :id
|
17
|
+
# @return [Mindee::Parsing::Common::Extras::Extras] Potential Extras fields sent back along the prediction.
|
18
|
+
attr_reader :extras
|
16
19
|
# @return [Mindee::Parsing::Common::Ocr::Ocr, nil] OCR text results (limited availability)
|
17
20
|
attr_reader :ocr
|
18
21
|
# @return [Integer] Amount of pages of the document
|
@@ -27,13 +30,22 @@ module Mindee
|
|
27
30
|
Ocr::Ocr.new(ocr_prediction)
|
28
31
|
end
|
29
32
|
|
30
|
-
|
33
|
+
def self.load_extras(http_response)
|
34
|
+
extras_prediction = http_response['inference'].fetch('extras', nil)
|
35
|
+
return nil if extras_prediction.nil? || extras_prediction.fetch('mvision-v1', nil).nil?
|
36
|
+
|
37
|
+
Extras::Extras::Extras.new(extras_prediction)
|
38
|
+
end
|
39
|
+
|
40
|
+
# @param product_class [Mindee::Inference]
|
31
41
|
# @param http_response [Hash]
|
32
42
|
def initialize(product_class, http_response)
|
33
43
|
@id = http_response['id']
|
34
44
|
@name = http_response['name']
|
35
45
|
@inference = product_class.new(http_response['inference'])
|
36
46
|
@ocr = self.class.load_ocr(http_response)
|
47
|
+
@extras = self.class.load_extras(http_response)
|
48
|
+
inject_full_text_ocr(http_response)
|
37
49
|
@n_pages = http_response['n_pages']
|
38
50
|
end
|
39
51
|
|
@@ -45,6 +57,24 @@ module Mindee
|
|
45
57
|
out_str << "\n:Filename: #{@name}"
|
46
58
|
out_str << "\n\n#{@inference}"
|
47
59
|
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
def inject_full_text_ocr(raw_prediction)
|
64
|
+
return unless raw_prediction.dig('inference', 'pages') &&
|
65
|
+
raw_prediction['inference']['pages'][0]['extras']['full_text_ocr']
|
66
|
+
|
67
|
+
full_text_ocr = String.new
|
68
|
+
raw_prediction.dig('inference', 'pages').each do |page|
|
69
|
+
full_text_ocr << (page['extras']['full_text_ocr']['content'])
|
70
|
+
end
|
71
|
+
artificial_text_obj = { 'content' => full_text_ocr }
|
72
|
+
if @extras.nil? || @extras.empty?
|
73
|
+
@extras = Extras::Extras.new({ 'full_text_ocr' => artificial_text_obj })
|
74
|
+
else
|
75
|
+
@extras.add_artificial_extra({ 'full_text_ocr' => artificial_text_obj })
|
76
|
+
end
|
77
|
+
end
|
48
78
|
end
|
49
79
|
end
|
50
80
|
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../standard/position_field'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Parsing
|
7
|
+
module Common
|
8
|
+
module Extras
|
9
|
+
# Contains information on the cropping of a prediction.
|
10
|
+
class CropperExtra
  # Positions of the detected croppings.
  # @return [Array<Mindee::Parsing::Standard::PositionField>]
  attr_reader :croppings

  # @param raw_prediction [Hash] Raw cropper extra; its `cropping` entry may be absent.
  # @param page_id [Integer, nil] Page identifier handed to every position field.
  def initialize(raw_prediction, page_id = nil)
    @croppings = []
    raw_croppings = raw_prediction['cropping']
    return if raw_croppings.nil?

    raw_croppings.each do |raw_crop|
      @croppings << Mindee::Parsing::Standard::PositionField.new(raw_crop, page_id)
    end
  end

  # @return [String] One cropping per line; lines after the first are prefixed by a space.
  def to_s
    rendered = @croppings.map(&:to_s)
    rendered.join("\n ")
  end
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../standard/position_field'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Parsing
|
7
|
+
module Common
|
8
|
+
# Extras namespace.
|
9
|
+
module Extras
|
10
|
+
# Extra information added to the prediction.
|
11
|
+
class Extras
  # @return [CropperExtra, nil]
  attr_reader :cropper
  # @return [FullTextOCRExtra, nil]
  attr_reader :full_text_ocr

  # Builds the typed extras and keeps every other entry as a raw instance variable.
  #
  # @param raw_prediction [Hash] Raw `extras` hash, keyed by extra name.
  def initialize(raw_prediction)
    @cropper = CropperExtra.new(raw_prediction['cropper']) if raw_prediction['cropper']
    @full_text_ocr = FullTextOCRExtra.new(raw_prediction['full_text_ocr']) if raw_prediction['full_text_ocr']

    raw_prediction.each do |key, value|
      # The typed extras were built above; storing their raw hash would overwrite them.
      next if %w[cropper full_text_ocr].include?(key)

      # NOTE(review): a key that is not a valid variable name makes this raise NameError;
      # confirm the API never sends such keys.
      instance_variable_set("@#{key}", value)
    end
  end

  # @return [String] `name: value` for every instance variable that is set,
  #   concatenated without a separator.
  def to_s
    instance_variables.each_with_object(String.new) do |var, out_str|
      out_str << "#{var}: #{instance_variable_get(var)}"
    end
  end

  # Adds artificial extra data for reconstructed extras. Currently only used for full_text_ocr.
  #
  # @param [Hash] raw_prediction Raw prediction used by the document; only its
  #   `full_text_ocr` entry is read.
  def add_artificial_extra(raw_prediction)
    return unless raw_prediction['full_text_ocr']

    # Assigned rather than appended: the attribute may still be unset here, and
    # FullTextOCRExtra expects the inner hash (the one holding `content`).
    @full_text_ocr = FullTextOCRExtra.new(raw_prediction['full_text_ocr'])
  end

  # Defined on the class so that instances actually respond to it.
  #
  # @return [Boolean] true when no extra is set, or every stored value is nil.
  def empty?
    instance_variables.all? { |var| instance_variable_get(var).nil? }
  end
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../standard/position_field'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Parsing
|
7
|
+
module Common
|
8
|
+
module Extras
|
9
|
+
# Full Text OCR result.
|
10
|
+
class FullTextOCRExtra
  # Contents of the full text OCR result.
  # @return [String, nil]
  attr_reader :contents
  # Language used on the page.
  # @return [String, nil]
  attr_reader :language

  # Each attribute is assigned only when its key holds a truthy value.
  #
  # @param raw_prediction [Hash] Raw extra; reads the `content` and `language` keys.
  def initialize(raw_prediction)
    raw_content = raw_prediction['content']
    raw_language = raw_prediction['language']

    @contents = raw_content if raw_content
    @language = raw_language if raw_language
  end

  # @return [String] The OCR contents, or an empty string when there are none.
  def to_s
    return '' unless @contents

    @contents
  end
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative 'product'
|
4
|
+
require_relative 'extras'
|
4
5
|
|
5
6
|
module Mindee
|
6
7
|
module Parsing
|
@@ -18,11 +19,15 @@ module Mindee
|
|
18
19
|
# Page prediction
|
19
20
|
# @return [Mindee::Parsing::Common::Prediction]
|
20
21
|
attr_reader :prediction
|
22
|
+
# Additional page-level information.
|
23
|
+
# @return [Mindee::Parsing::Common::Extras::Extras]
|
24
|
+
attr_reader :extras
|
21
25
|
|
22
26
|
# @param raw_prediction [Hash]
|
23
27
|
def initialize(raw_prediction)
  @page_id = raw_prediction['id']
  @orientation = Orientation.new(raw_prediction['orientation'], @page_id)

  # Page-level extras are optional; @extras is left unset when the key is absent or nil.
  raw_extras = raw_prediction['extras']
  @extras = Extras::Extras.new(raw_extras) unless raw_extras.nil?
end
|
27
32
|
|
28
33
|
# @return [String]
|
@@ -98,6 +98,7 @@ module Mindee
|
|
98
98
|
return '' if in_str.nil?
|
99
99
|
return in_str if max_col_size.nil?
|
100
100
|
|
101
|
+
in_str = in_str.gsub(%r{[\n\r\t]}, "\n" => '\\n', "\r" => '\\r', "\t" => '\\t')
|
101
102
|
in_str.length <= max_col_size ? in_str : "#{in_str[0..max_col_size - 4]}..."
|
102
103
|
end
|
103
104
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../parsing'
|
4
|
+
require_relative 'bill_of_lading_v1_document'
|
5
|
+
require_relative 'bill_of_lading_v1_page'
|
6
|
+
|
7
|
+
module Mindee
|
8
|
+
module Product
|
9
|
+
# Bill of Lading module.
|
10
|
+
module BillOfLading
|
11
|
+
# Bill of Lading API version 1 inference prediction.
|
12
|
+
class BillOfLadingV1 < Mindee::Parsing::Common::Inference
  @endpoint_name = 'bill_of_lading'
  @endpoint_version = '1'

  class << self
    # Name of the endpoint for this product.
    # @return [String]
    attr_reader :endpoint_name
    # Version for this product.
    # @return [String]
    attr_reader :endpoint_version
  end

  # Builds the document-level prediction, then one page object for every page
  # that carries a non-empty prediction.
  #
  # @param prediction [Hash]
  def initialize(prediction)
    super
    @prediction = BillOfLadingV1Document.new(prediction['prediction'], nil)
    @pages = []
    prediction['pages'].each do |raw_page|
      next unless raw_page.key?('prediction')

      page_prediction = raw_page['prediction']
      # Pages whose prediction is nil or empty are left out of @pages.
      next if page_prediction.nil? || page_prediction.empty?

      @pages << BillOfLadingV1Page.new(raw_page)
    end
  end
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|