mindee 3.12.0 → 3.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +1 -1
  3. data/CHANGELOG.md +18 -0
  4. data/README.md +23 -23
  5. data/Rakefile +5 -0
  6. data/docs/bank_account_details_v2.md +5 -1
  7. data/docs/bank_check_v1.md +6 -2
  8. data/docs/bank_statement_fr_v1.md +3 -0
  9. data/docs/barcode_reader_v1.md +5 -1
  10. data/docs/bill_of_lading_v1.md +202 -0
  11. data/docs/carte_grise_v1.md +5 -1
  12. data/docs/carte_vitale_v1.md +5 -1
  13. data/docs/code_samples/bill_of_lading_v1_async.txt +19 -0
  14. data/docs/code_samples/energy_bill_fra_v1_async.txt +19 -0
  15. data/docs/code_samples/invoices_v4_async.txt +19 -0
  16. data/docs/code_samples/nutrition_facts_v1_async.txt +19 -0
  17. data/docs/code_samples/payslip_fra_v2_async.txt +19 -0
  18. data/docs/cropper_v1.md +6 -2
  19. data/docs/custom_v1.md +5 -3
  20. data/docs/energy_bill_fra_v1.md +249 -0
  21. data/docs/eu_driver_license_v1.md +6 -2
  22. data/docs/expense_receipts_v5.md +26 -1
  23. data/docs/financial_document_v1.md +29 -1
  24. data/docs/generated_v1.md +3 -0
  25. data/docs/getting_started.md +3 -0
  26. data/docs/idcard_fr_v2.md +15 -2
  27. data/docs/international_id_v2.md +13 -1
  28. data/docs/invoice_splitter_v1.md +16 -13
  29. data/docs/invoices_v4.md +54 -21
  30. data/docs/license_plates_v1.md +5 -1
  31. data/docs/multi_receipts_detector_v1.md +5 -1
  32. data/docs/nutrition_facts_v1.md +295 -0
  33. data/docs/passport_v1.md +5 -1
  34. data/docs/payslip_fra_v2.md +218 -0
  35. data/docs/proof_of_address_v1.md +5 -1
  36. data/docs/resume_v1.md +24 -1
  37. data/docs/us_driver_license_v1.md +6 -2
  38. data/docs/us_healthcare_cards_v1.md +5 -1
  39. data/docs/us_mail_v2.md +5 -1
  40. data/docs/us_w9_v1.md +6 -2
  41. data/examples/auto_invoice_splitter_extraction.rb +43 -0
  42. data/lib/mindee/client.rb +20 -8
  43. data/lib/mindee/{image_extraction → extraction}/common/image_extractor.rb +2 -4
  44. data/lib/mindee/{image_extraction → extraction}/common.rb +1 -0
  45. data/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +55 -0
  46. data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +111 -0
  47. data/lib/mindee/extraction/pdf_extractor.rb +4 -0
  48. data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +322 -0
  49. data/lib/mindee/extraction/tax_extractor.rb +1 -320
  50. data/lib/mindee/extraction.rb +3 -0
  51. data/lib/mindee/http/endpoint.rb +18 -6
  52. data/lib/mindee/parsing/common/api_response.rb +1 -1
  53. data/lib/mindee/parsing/common/document.rb +31 -1
  54. data/lib/mindee/parsing/common/extras/cropper_extra.rb +29 -0
  55. data/lib/mindee/parsing/common/extras/extras.rb +50 -0
  56. data/lib/mindee/parsing/common/extras/full_text_ocr_extra.rb +32 -0
  57. data/lib/mindee/parsing/common/extras.rb +5 -0
  58. data/lib/mindee/parsing/common/page.rb +5 -0
  59. data/lib/mindee/parsing/standard/base_field.rb +1 -0
  60. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1.rb +39 -0
  61. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier.rb +52 -0
  62. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier_item.rb +95 -0
  63. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_consignee.rb +58 -0
  64. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_document.rb +136 -0
  65. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_notify_party.rb +58 -0
  66. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_page.rb +32 -0
  67. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_shipper.rb +58 -0
  68. data/lib/mindee/product/financial_document/financial_document_v1_line_item.rb +15 -1
  69. data/lib/mindee/product/fr/bank_account_details/bank_account_details_v2_bban.rb +4 -15
  70. data/lib/mindee/product/fr/energy_bill/energy_bill_v1.rb +41 -0
  71. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_document.rb +235 -0
  72. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_consumer.rb +48 -0
  73. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_supplier.rb +48 -0
  74. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_usage.rb +97 -0
  75. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_meter_detail.rb +54 -0
  76. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_page.rb +34 -0
  77. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_subscription.rb +97 -0
  78. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_taxes_and_contribution.rb +97 -0
  79. data/lib/mindee/product/fr/payslip/payslip_v2.rb +41 -0
  80. data/lib/mindee/product/fr/payslip/payslip_v2_bank_account_detail.rb +54 -0
  81. data/lib/mindee/product/fr/payslip/payslip_v2_document.rb +128 -0
  82. data/lib/mindee/product/fr/payslip/payslip_v2_employee.rb +78 -0
  83. data/lib/mindee/product/fr/payslip/payslip_v2_employer.rb +78 -0
  84. data/lib/mindee/product/fr/payslip/payslip_v2_employment.rb +72 -0
  85. data/lib/mindee/product/fr/payslip/payslip_v2_page.rb +34 -0
  86. data/lib/mindee/product/fr/payslip/payslip_v2_pay_detail.rb +100 -0
  87. data/lib/mindee/product/fr/payslip/payslip_v2_pay_period.rb +66 -0
  88. data/lib/mindee/product/fr/payslip/payslip_v2_pto.rb +56 -0
  89. data/lib/mindee/product/fr/payslip/payslip_v2_salary_detail.rb +81 -0
  90. data/lib/mindee/product/invoice/invoice_v4_line_item.rb +15 -1
  91. data/lib/mindee/product/invoice_splitter/invoice_splitter_v1_document.rb +1 -1
  92. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1.rb +39 -0
  93. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_added_sugar.rb +52 -0
  94. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_calorie.rb +52 -0
  95. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_cholesterol.rb +52 -0
  96. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_dietary_fiber.rb +52 -0
  97. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_document.rb +173 -0
  98. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_nutrient.rb +87 -0
  99. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_page.rb +32 -0
  100. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_protein.rb +52 -0
  101. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_saturated_fat.rb +52 -0
  102. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_serving_size.rb +46 -0
  103. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_sodium.rb +58 -0
  104. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_carbohydrate.rb +52 -0
  105. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_fat.rb +52 -0
  106. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_sugar.rb +52 -0
  107. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_trans_fat.rb +52 -0
  108. data/lib/mindee/product/receipt/receipt_v5_line_item.rb +11 -1
  109. data/lib/mindee/product/resume/resume_v1_certificate.rb +11 -1
  110. data/lib/mindee/product/resume/resume_v1_education.rb +14 -1
  111. data/lib/mindee/product/resume/resume_v1_language.rb +9 -1
  112. data/lib/mindee/product/resume/resume_v1_professional_experience.rb +15 -1
  113. data/lib/mindee/product/resume/resume_v1_social_networks_url.rb +9 -1
  114. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_copay.rb +9 -1
  115. data/lib/mindee/product/us/us_mail/us_mail_v2_recipient_address.rb +14 -1
  116. data/lib/mindee/product/us/us_mail/us_mail_v2_sender_address.rb +5 -17
  117. data/lib/mindee/product.rb +5 -1
  118. data/lib/mindee/version.rb +1 -1
  119. metadata +70 -9
  120. data/lib/mindee/image_extraction.rb +0 -4
  121. /data/lib/mindee/{image_extraction → extraction}/common/extracted_image.rb +0 -0
  122. /data/lib/mindee/{image_extraction → extraction}/multi_receipts_extractor/multi_receipts_extractor.rb +0 -0
  123. /data/lib/mindee/{image_extraction → extraction}/multi_receipts_extractor.rb +0 -0
  124. /data/lib/mindee/extraction/{ocr_extractor.rb → tax_extractor/ocr_extractor.rb} +0 -0
@@ -1,322 +1,3 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative 'ocr_extractor'
4
-
5
- module Mindee
6
- module Extraction
7
- # Tax extractor class
8
- class TaxExtractor < OcrExtractor
9
- # Extracts the most relevant candidate.
10
- # @param candidates [Array<Hash>] a candidate for the tax.
11
- # @param tax_names [Array<String>] list of all possible names the tax can have.
12
- # @return [Hash, nil]
13
- def self.pick_best(candidates, tax_names)
14
- return candidates[0] if candidates.size == 1
15
- return nil if candidates.empty?
16
-
17
- picked = 0
18
- picked_score = 0
19
-
20
- candidates.each_with_index do |candidate, i|
21
- next unless valid_candidate?(candidate, tax_names)
22
-
23
- sum_fields_score = calculate_score(candidate, i)
24
-
25
- if picked_score < sum_fields_score
26
- picked_score = sum_fields_score
27
- picked = i
28
- end
29
- end
30
-
31
- candidates[picked]
32
- end
33
-
34
- # Checks whether a tax code has been properly read. Shouldn't trigger except in case of very specific regex breaks
35
- # due to unsupported diacritics.
36
- # @param candidate [Hash] A candidate for the tax.
37
- # @param tax_names [Array<String>] list of all possible names the tax can have.
38
- # @return [Boolean]
39
- def self.valid_candidate?(candidate, tax_names)
40
- return false if tax_names.empty? || candidate.nil? || candidate['code'].nil?
41
-
42
- tax_names.each do |tax_name|
43
- return true if remove_accents(tax_name.downcase) == remove_accents(candidate['code'].downcase)
44
- end
45
- false
46
- end
47
-
48
- # [Experimental] computes the score of a valid candidate for a tax.
49
- # @param candidate [Hash] A candidate for the tax.
50
- # @param index [Integer]
51
- def self.calculate_score(candidate, index)
52
- score = index + 1
53
- unless candidate['rate'].nil?
54
- score += 1
55
- score -= 2 if candidate['rate'] > 100
56
- score -= 1 if candidate['rate'] > 30
57
- end
58
- score += 4 unless candidate['value'].nil?
59
- score += 1 unless candidate['base'].nil?
60
- score
61
- end
62
-
63
- # Curates tax values based on simple rules to avoid improbable data
64
- # @param found_hash [Hash] Hash of currently retrieved values
65
- # @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
66
- # @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
67
- # @return [Hash]
68
- def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
69
- reconstructed_hash = { 'code' => nil, 'page_id' => nil, 'rate' => nil, 'base' => nil, 'value' => nil }
70
- return reconstructed_hash if found_hash.nil?
71
-
72
- reconstructed_hash['code'] =
73
- found_hash['code'].nil? ? found_hash['code'] : found_hash['code'].sub(%r{\s*\.*\s*$}, '')
74
-
75
- if found_hash['rate'] && found_hash['rate'] < 1 && (found_hash['rate']).positive?
76
- found_hash['rate'] =
77
- found_hash['rate'] * 100
78
- end
79
- found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
80
- found_hash = decimate_rates_if_needed(found_hash)
81
- found_hash = fix_rate(found_hash)
82
- reconstructed_hash['rate'] = found_hash['rate']
83
- set_base_and_value(reconstructed_hash, found_hash)
84
- end
85
-
86
- # Swaps the rate with base or value if rate is out of bounds
87
- # @param found_hash [Hash] Hash of currently retrieved values
88
- # @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
89
- # @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
90
- # @return [Hash]
91
- def self.swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
92
- if found_hash['rate'] && (found_hash['rate'] > max_rate_percentage || found_hash['rate'] < min_rate_percentage)
93
- if valid_percentage?(found_hash['base'], min_rate_percentage, max_rate_percentage)
94
- found_hash['rate'], found_hash['base'] = found_hash['base'], found_hash['rate']
95
- elsif valid_percentage?(found_hash['value'], min_rate_percentage, max_rate_percentage)
96
- found_hash['rate'], found_hash['value'] = found_hash['value'], found_hash['rate']
97
- end
98
- end
99
- found_hash
100
- end
101
-
102
- # Rates can't be negative if set.
103
- # @param found_hash [Hash] Hash of currently retrieved values
104
- def self.fix_rate(found_hash)
105
- found_hash['rate'] = found_hash['rate'].abs unless found_hash['rate'].nil?
106
- found_hash
107
- end
108
-
109
- # Swaps the rate with base or value if rate is out of bounds
110
- # @param found_hash [Hash] Hash of currently retrieved values
111
- # @return [Hash]
112
- def self.decimate_rates_if_needed(found_hash)
113
- if found_hash['rate'] && found_hash['rate'] > 100
114
- if !found_hash['base'].nil? && found_hash['rate'] > found_hash['base']
115
- found_hash['rate'], found_hash['base'] = found_hash['base'], found_hash['rate']
116
- elsif !found_hash['value'].nil? && found_hash['rate'] > found_hash['value']
117
- found_hash['rate'], found_hash['value'] = found_hash['value'], found_hash['rate']
118
- end
119
- end
120
- found_hash
121
- end
122
-
123
- # Sets the base and value in the reconstructed hash based on certain conditions
124
- # @param reconstructed_hash [Hash] Hash being reconstructed with new values
125
- # @param found_hash [Hash] Hash of currently retrieved values
126
- # @return [Hash]
127
- def self.set_base_and_value(reconstructed_hash, found_hash)
128
- if found_hash['base'].nil?
129
- reconstructed_hash['base'] = found_hash['base']
130
- reconstructed_hash['value'] = found_hash['value']
131
- elsif found_hash['value'].nil? && found_hash['base'] < found_hash['value']
132
- reconstructed_hash['base'] = found_hash['value']
133
- reconstructed_hash['value'] = found_hash['base']
134
- else
135
- reconstructed_hash['value'] = found_hash['value']
136
- end
137
- reconstructed_hash
138
- end
139
-
140
- # Extracts a single custom type of tax.
141
- # For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
142
- # @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
143
- # @param tax_names [Array<String>] list of all possible names the tax can have.
144
- # @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
145
- # @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
146
- # @return [Mindee::Parsing::Standard::TaxField, nil]
147
- def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_rate_percentage = 100)
148
- return nil if ocr_result.is_a?(Mindee::Parsing::Common::Ocr) || tax_names.empty?
149
-
150
- tax_names.sort!
151
- found_hash = pick_best(extract_horizontal_tax(ocr_result, tax_names), tax_names)
152
- # a tax is considered found horizontally if it has a value, otherwise it is vertical
153
- if found_hash.nil? || found_hash['value'].nil?
154
- found_hash = extract_vertical_tax(ocr_result, tax_names,
155
- found_hash)
156
- end
157
- found_hash = curate_values(found_hash, min_rate_percentage, max_rate_percentage)
158
-
159
- return if found_hash.nil? || found_hash.empty?
160
-
161
- create_tax_field(found_hash)
162
- end
163
-
164
- # Creates a tax field from a given hash.
165
- # @param found_hash [Hash] Hash of currently retrieved values
166
- # @return [Mindee::Parsing::Standard::TaxField]
167
- def self.create_tax_field(found_hash)
168
- Mindee::Parsing::Standard::TaxField.new(
169
- found_hash,
170
- found_hash.key?('page_id') ? found_hash['page_id'] : nil
171
- )
172
- end
173
-
174
- # Extracts the rate and code, if found, from matches into the found_hash.
175
- # @param matches [MatchData] RegEx matches of the values for taxes
176
- # @param found_hash [Hash] Hash of currently retrieved values
177
- # @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
178
- # @return [Hash]
179
- def self.extract_percentage_from_tax(matches, found_hash, percent_first)
180
- if percent_first
181
- found_hash['code'] = matches[2].strip unless matches[2].nil?
182
- found_hash['rate'] = parse_amount(matches[1].gsub('%', '')) unless matches[1].nil?
183
- else
184
- found_hash['code'] = matches[1].strip unless matches[1].nil?
185
- found_hash['rate'] = parse_amount(matches[2].gsub('%', '')) unless matches[2].nil?
186
- end
187
- found_hash
188
- end
189
-
190
- # rubocop:disable Metrics/CyclomaticComplexity
191
- # rubocop:disable Metrics/PerceivedComplexity
192
-
193
- # Extracts the basis and value of a tax from regex matches, independent of the order.
194
- # @param matches [MatchData] RegEx matches of the values for taxes
195
- # @param found_hash [Hash] Hash of currently retrieved values
196
- # @return [Hash]
197
- def self.extract_basis_and_value(matches, found_hash)
198
- if matches[4].nil? && !matches[3].nil?
199
- found_hash['value'] = parse_amount(matches[3]) unless matches[3].nil?
200
- elsif matches[3].nil? && !matches[4].nil?
201
- found_hash['value'] = parse_amount(matches[4]) unless matches[4].nil?
202
- elsif !matches[3].nil? && !matches[4].nil?
203
- found_hash['base'] = parse_amount(matches[3]) unless matches[3].nil?
204
- found_hash['value'] = parse_amount(matches[4]) unless matches[4].nil?
205
- end
206
- found_hash
207
- end
208
-
209
- # rubocop:enable Metrics/CyclomaticComplexity
210
- # rubocop:enable Metrics/PerceivedComplexity
211
-
212
- # Extracts tax information from a horizontal line.
213
- # @param line [String] Line to be processed.
214
- # @param pattern [Regexp] RegEx pattern to search the line with.
215
- # @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
216
- # @return [Hash]
217
- def self.extract_tax_from_horizontal_line(line, pattern, page_id, percent_first)
218
- found_hash = {}
219
-
220
- matches = line.match(pattern)
221
-
222
- # Edge case for when the tax is split-up between two pages, we'll consider that
223
- # the answer belongs to the first one.
224
- found_hash['page_id'] = page_id unless found_hash.key?('page_id')
225
- return found_hash if matches.nil?
226
-
227
- found_hash = extract_percentage_from_tax(matches, found_hash, percent_first)
228
- extract_basis_and_value(matches, found_hash)
229
- end
230
-
231
- # rubocop:disable Metrics/CyclomaticComplexity
232
- # rubocop:disable Metrics/PerceivedComplexity
233
-
234
- # Processes a horizontal line for tax extraction. Returns a hash of collected values.
235
- # @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] Processed OCR results.
236
- # @param tax_names [Array<String>] Possible tax names candidates.
237
- # @return [Array<Hash>]
238
- def self.extract_horizontal_tax(ocr_result, tax_names)
239
- candidates = [{ 'code' => nil, 'value' => nil, 'base' => nil, 'rate' => nil }]
240
- linear_pattern_percent_first = %r{
241
- ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
242
- ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
243
- ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
244
- ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
245
- }x
246
- linear_pattern_percent_second = %r{
247
- ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
248
- ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
249
- ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
250
- ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
251
- }x
252
- ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
253
- page.all_lines.each do |line|
254
- clean_line = remove_currency_symbols(line.to_s.scrub.gsub(%r{[+(\[)\]¿?*_]}, '')).gsub(%r{\.{2,}}, ' ')
255
- .gsub(%r{ +}, ' ').strip
256
-
257
- next if match_index(clean_line, tax_names).nil?
258
-
259
- unless clean_line.match(linear_pattern_percent_second).nil?
260
- candidates.append(extract_tax_from_horizontal_line(clean_line[match_index(clean_line, tax_names)..],
261
- linear_pattern_percent_second, page_id, false))
262
- end
263
- if clean_line.include?('%') && !clean_line.match(linear_pattern_percent_first).nil?
264
- candidates.append(extract_tax_from_horizontal_line(clean_line[clean_line.index(%r{\d*[.,]?\d* ?%})..],
265
- linear_pattern_percent_first, page_id, true))
266
- elsif !clean_line.match(linear_pattern_percent_first).nil?
267
- candidates.append(extract_tax_from_horizontal_line(clean_line,
268
- linear_pattern_percent_first, page_id, true))
269
- end
270
- end
271
- end
272
- candidates
273
- end
274
- # rubocop:enable Metrics/CyclomaticComplexity
275
- # rubocop:enable Metrics/PerceivedComplexity
276
-
277
- # Processes a vertical reconstructed line for tax extraction. Returns a hash of collected values.
278
- # @param line [Mindee::Parsing::Common::Ocr::OcrLine] Processed OCR results.
279
- # @param found_hash [Hash] Hash containing previously found values, if any.
280
- # @return [Hash]
281
- def self.extract_vertical_tax_values(line, found_hash)
282
- amounts = []
283
- line.each do |reconstructed_word|
284
- amounts.push(parse_amount(reconstructed_word.text)) unless parse_amount(reconstructed_word.text).nil?
285
- end
286
- if amounts.length == 1 && !found_hash.key?('value')
287
- found_hash['value'] = amounts[0]
288
- else
289
- found_hash['rate'] = amounts[0] if found_hash['rate'].nil?
290
- found_hash['value'] = amounts[1] if found_hash['value'].nil?
291
- end
292
- found_hash
293
- end
294
-
295
- # Extracts tax data from a vertical reconstructed row.
296
- # @param ocr_result [Mindee::Parsing::Common::Ocr] OCR raw results
297
- # @param tax_names [Array<String>] Array of possible names a tax can have
298
- # @param found_hash [Hash] Hash of currently retrieved values
299
- def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
300
- found_hash = { 'code' => nil, 'page_id' => nil } if found_hash.nil?
301
-
302
- ocr_result.mvision_v1.pages.each_with_index do |page, page_id|
303
- page.all_words.each do |word|
304
- next if match_index(word.text, tax_names).nil?
305
-
306
- reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id)
307
- found_hash['page_id'] = page_id if found_hash['page_id'].nil?
308
- found_hash['code'] = word.text.strip if found_hash['code'].nil?
309
- found_hash = extract_vertical_tax_values(reconstructed_line, found_hash)
310
- end
311
- end
312
- found_hash
313
- end
314
-
315
- private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
316
- :extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
317
- :create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
318
- :decimate_rates_if_needed, :extract_basis_and_value, :set_base_and_value, :valid_candidate?,
319
- :swap_rates_if_needed
320
- end
321
- end
322
- end
3
+ require_relative 'tax_extractor/tax_extractor'
@@ -1,3 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'extraction/tax_extractor'
4
+ require_relative 'extraction/multi_receipts_extractor'
5
+ require_relative 'extraction/common'
6
+ require_relative 'extraction/pdf_extractor'
@@ -48,12 +48,19 @@ module Mindee
48
48
  # Call the prediction API.
49
49
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
50
50
  # @param all_words [Boolean] Whether the full word extraction needs to be performed
51
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
51
52
  # @param close_file [Boolean] Whether the file will be closed after reading
52
53
  # @param cropper [Boolean] Whether a cropping operation will be applied
53
54
  # @return [Array]
54
- def predict(input_source, all_words, close_file, cropper)
55
+ def predict(input_source, all_words, full_text, close_file, cropper)
55
56
  check_api_key
56
- response = predict_req_post(input_source, all_words: all_words, close_file: close_file, cropper: cropper)
57
+ response = predict_req_post(
58
+ input_source,
59
+ all_words: all_words,
60
+ full_text: full_text,
61
+ close_file: close_file,
62
+ cropper: cropper
63
+ )
57
64
  hashed_response = JSON.parse(response.body, object_class: Hash)
58
65
  return [hashed_response, response.body] if ResponseValidation.valid_sync_response?(response)
59
66
 
@@ -65,12 +72,13 @@ module Mindee
65
72
  # Call the prediction API.
66
73
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
67
74
  # @param all_words [Boolean] Whether the full word extraction needs to be performed
75
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
68
76
  # @param close_file [Boolean] Whether the file will be closed after reading
69
77
  # @param cropper [Boolean] Whether a cropping operation will be applied
70
78
  # @return [Array]
71
- def predict_async(input_source, all_words, close_file, cropper)
79
+ def predict_async(input_source, all_words, full_text, close_file, cropper)
72
80
  check_api_key
73
- response = document_queue_req_get(input_source, all_words, close_file, cropper)
81
+ response = document_queue_req_get(input_source, all_words, full_text, close_file, cropper)
74
82
  hashed_response = JSON.parse(response.body, object_class: Hash)
75
83
  return [hashed_response, response.body] if ResponseValidation.valid_async_response?(response)
76
84
 
@@ -97,14 +105,16 @@ module Mindee
97
105
 
98
106
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
99
107
  # @param all_words [Boolean] Whether the full word extraction needs to be performed
108
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
100
109
  # @param close_file [Boolean] Whether the file will be closed after reading
101
110
  # @param cropper [Boolean] Whether a cropping operation will be applied
102
111
  # @return [Net::HTTPResponse, nil]
103
- def predict_req_post(input_source, all_words: false, close_file: true, cropper: false)
112
+ def predict_req_post(input_source, all_words: false, full_text: false, close_file: true, cropper: false)
104
113
  uri = URI("#{@url_root}/predict")
105
114
 
106
115
  params = {}
107
116
  params[:cropper] = 'true' if cropper
117
+ params[:full_text_ocr] = 'true' if full_text
108
118
  uri.query = URI.encode_www_form(params)
109
119
 
110
120
  headers = {
@@ -129,14 +139,16 @@ module Mindee
129
139
 
130
140
  # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
131
141
  # @param all_words [Boolean] Whether the full word extraction needs to be performed
142
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
132
143
  # @param close_file [Boolean] Whether the file will be closed after reading
133
144
  # @param cropper [Boolean] Whether a cropping operation will be applied
134
145
  # @return [Net::HTTPResponse, nil]
135
- def document_queue_req_get(input_source, all_words, close_file, cropper)
146
+ def document_queue_req_get(input_source, all_words, full_text, close_file, cropper)
136
147
  uri = URI("#{@url_root}/predict_async")
137
148
 
138
149
  params = {}
139
150
  params[:cropper] = 'true' if cropper
151
+ params[:full_text_ocr] = 'true' if full_text
140
152
  uri.query = URI.encode_www_form(params)
141
153
 
142
154
  headers = {
@@ -103,7 +103,7 @@ module Mindee
103
103
  # @return [String]
104
104
  attr_reader :raw_http
105
105
 
106
- # @param product_class [Class<Mindee::Product>]
106
+ # @param product_class [Mindee::Inference]
107
107
  # @param http_response [Hash]
108
108
  # @param raw_http [String]
109
109
  def initialize(product_class, http_response, raw_http)
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'inference'
4
+ require_relative 'extras'
4
5
 
5
6
  module Mindee
6
7
  module Parsing
@@ -13,6 +14,8 @@ module Mindee
13
14
  attr_reader :name
14
15
  # @return [String] Mindee ID of the document
15
16
  attr_reader :id
17
+ # @return [Mindee::Parsing::Common::Extras::Extras] Potential Extras fields sent back along the prediction.
18
+ attr_reader :extras
16
19
  # @return [Mindee::Parsing::Common::Ocr::Ocr, nil] OCR text results (limited availability)
17
20
  attr_reader :ocr
18
21
  # @return [Integer] Amount of pages of the document
@@ -27,13 +30,22 @@ module Mindee
27
30
  Ocr::Ocr.new(ocr_prediction)
28
31
  end
29
32
 
30
- # @param product_class [Class<Mindee::Product>]
33
# Builds the document-level extras object from the raw API response.
#
# @param http_response [Hash] Raw document hash; its 'inference' entry is read.
# @return [Extras::Extras, nil] nil when the inference has no 'extras' entry or when that
#   entry has no 'mvision-v1' key.
def self.load_extras(http_response)
  extras_prediction = http_response['inference'].fetch('extras', nil)
  # NOTE(review): requiring an 'mvision-v1' key inside the extras looks copied from OCR loading;
  # confirm it is intended, since it gates every document-level extra.
  return nil if extras_prediction.nil? || extras_prediction.fetch('mvision-v1', nil).nil?

  # The class is Extras::Extras (namespace module, then class). A third `::Extras` names a
  # constant that does not exist and raises NameError.
  Extras::Extras.new(extras_prediction)
end
39
+
40
+ # @param product_class [Class<Mindee::Parsing::Common::Inference>]
31
41
  # @param http_response [Hash]
32
42
  def initialize(product_class, http_response)
33
43
  @id = http_response['id']
34
44
  @name = http_response['name']
35
45
  @inference = product_class.new(http_response['inference'])
36
46
  @ocr = self.class.load_ocr(http_response)
47
+ @extras = self.class.load_extras(http_response)
48
+ inject_full_text_ocr(http_response)
37
49
  @n_pages = http_response['n_pages']
38
50
  end
39
51
 
@@ -45,6 +57,24 @@ module Mindee
45
57
  out_str << "\n:Filename: #{@name}"
46
58
  out_str << "\n\n#{@inference}"
47
59
  end
60
+
61
+ private
62
+
63
# Concatenates the per-page full-text OCR contents and stores the result in the document extras.
# Does nothing when the first page carries no 'full_text_ocr' extra.
#
# @param raw_prediction [Hash] Raw document hash; 'inference' => 'pages' => [...] is read.
# @return [void]
def inject_full_text_ocr(raw_prediction)
  # `dig` tolerates a missing 'inference', an empty page list, or a page without 'extras',
  # all of which made the chained `[]` lookups raise NoMethodError.
  return unless raw_prediction.dig('inference', 'pages', 0, 'extras', 'full_text_ocr')

  page_texts = raw_prediction.dig('inference', 'pages').map do |page|
    page.dig('extras', 'full_text_ocr', 'content')
  end
  # Pages without content are skipped; contents are joined without any separator.
  artificial_text_obj = { 'content' => page_texts.compact.join }
  if @extras.nil? || @extras.empty?
    @extras = Extras::Extras.new({ 'full_text_ocr' => artificial_text_obj })
  else
    @extras.add_artificial_extra({ 'full_text_ocr' => artificial_text_obj })
  end
end
48
78
  end
49
79
  end
50
80
  end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../standard/position_field'
4
+
5
module Mindee
  module Parsing
    module Common
      module Extras
        # Holds the cropping positions returned alongside a prediction.
        class CropperExtra
          # Positions built from the raw 'cropping' entries.
          # @return [Array<Mindee::Parsing::Standard::PositionField>]
          attr_reader :croppings

          # @param raw_prediction [Hash] Raw cropper hash; only its 'cropping' entry is read.
          # @param page_id [Integer, nil] Forwarded to every position field.
          def initialize(raw_prediction, page_id = nil)
            raw_crops = raw_prediction['cropping']
            @croppings =
              if raw_crops.nil?
                []
              else
                raw_crops.map { |raw_crop| Mindee::Parsing::Standard::PositionField.new(raw_crop, page_id) }
              end
          end

          # @return [String] One cropping per line.
          def to_s
            lines = @croppings.map { |cropping| cropping.to_s }
            lines.join("\n ")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../standard/position_field'
4
+
5
module Mindee
  module Parsing
    module Common
      # Extras namespace.
      module Extras
        # Extra information added to the prediction.
        class Extras
          # Keys wrapped in dedicated classes instead of being stored as raw values.
          TYPED_KEYS = %w[cropper full_text_ocr].freeze

          # @return [CropperExtra, nil]
          attr_reader :cropper
          # @return [FullTextOCRExtra, nil]
          attr_reader :full_text_ocr

          # @param raw_prediction [Hash] Raw 'extras' hash. 'cropper' and 'full_text_ocr' are wrapped;
          #   every other entry is kept as-is in an instance variable named after its key.
          def initialize(raw_prediction)
            raw_cropper = raw_prediction['cropper']
            raw_full_text = raw_prediction['full_text_ocr']
            @cropper = CropperExtra.new(raw_cropper) if raw_cropper
            @full_text_ocr = FullTextOCRExtra.new(raw_full_text) if raw_full_text

            raw_prediction.each do |key, value|
              name = ivar_name(key)
              instance_variable_set("@#{name}", value) unless TYPED_KEYS.include?(name)
            end
          end

          # @return [String] Every instance variable rendered as "@name: value", with no separator.
          def to_s
            instance_variables.map { |var| "#{var}: #{instance_variable_get(var)}" }.join
          end

          # Adds artificial extra data for reconstructed extras. Currently only used for full_text_ocr.
          #
          # @param [Hash] raw_prediction Hash holding the reconstructed data under 'full_text_ocr'.
          def add_artificial_extra(raw_prediction)
            return unless raw_prediction['full_text_ocr']

            # Replace the value (neither nil nor FullTextOCRExtra responds to `<<`), and hand over the
            # inner hash, which is the one carrying 'content'.
            @full_text_ocr = FullTextOCRExtra.new(raw_prediction['full_text_ocr'])
          end

          # Defined on the class so that instances respond to it.
          # @return [Boolean] true when no extra holds a value.
          def empty?
            instance_variables.all? { |var| instance_variable_get(var).nil? }
          end

          private

          # Turns an arbitrary key into a legal instance variable name. Keys such as 'mvision-v1'
          # would otherwise make instance_variable_set raise NameError.
          # @param key [String, Symbol]
          # @return [String]
          def ivar_name(key)
            name = key.to_s.gsub(%r{[^[:alnum:]_]}, '_')
            name.empty? || name.match?(%r{\A[[:digit:]]}) ? "_#{name}" : name
          end
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../standard/position_field'
4
+
5
module Mindee
  module Parsing
    module Common
      module Extras
        # Result of the full-text OCR extra.
        class FullTextOCRExtra
          # Text read from the 'content' entry.
          # @return [String, nil]
          attr_reader :contents
          # Value read from the 'language' entry.
          # @return [String, nil]
          attr_reader :language

          # @param raw_prediction [Hash] Raw hash; 'content' and 'language' are read when present.
          def initialize(raw_prediction)
            raw_content = raw_prediction['content']
            @contents = raw_content if raw_content

            raw_language = raw_prediction['language']
            @language = raw_language if raw_language
          end

          # @return [String] The contents, or an empty string when there are none.
          def to_s
            return '' unless @contents

            @contents
          end
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'extras/extras'
4
+ require_relative 'extras/cropper_extra'
5
+ require_relative 'extras/full_text_ocr_extra'
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'product'
4
+ require_relative 'extras'
4
5
 
5
6
  module Mindee
6
7
  module Parsing
@@ -18,11 +19,15 @@ module Mindee
18
19
  # Page prediction
19
20
  # @return [Mindee::Parsing::Common::Prediction]
20
21
  attr_reader :prediction
22
+ # Additional page-level information.
23
+ # @return [Mindee::Parsing::Common::Extras::Extras]
24
+ attr_reader :extras
21
25
 
22
26
  # @param raw_prediction [Hash]
23
27
  def initialize(raw_prediction)
24
28
  @page_id = raw_prediction['id']
25
29
  @orientation = Orientation.new(raw_prediction['orientation'], @page_id)
30
+ @extras = Extras::Extras.new(raw_prediction['extras']) unless raw_prediction['extras'].nil?
26
31
  end
27
32
 
28
33
  # @return [String]
@@ -98,6 +98,7 @@ module Mindee
98
98
  return '' if in_str.nil?
99
99
  return in_str if max_col_size.nil?
100
100
 
101
+ in_str = in_str.gsub(%r{[\n\r\t]}, "\n" => '\\n', "\r" => '\\r', "\t" => '\\t')
101
102
  in_str.length <= max_col_size ? in_str : "#{in_str[0..max_col_size - 4]}..."
102
103
  end
103
104
  end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../../parsing'
4
+ require_relative 'bill_of_lading_v1_document'
5
+ require_relative 'bill_of_lading_v1_page'
6
+
7
module Mindee
  module Product
    # Namespace of the Bill of Lading product.
    module BillOfLading
      # Inference result for version 1 of the Bill of Lading API.
      class BillOfLadingV1 < Mindee::Parsing::Common::Inference
        @endpoint_name = 'bill_of_lading'
        @endpoint_version = '1'

        class << self
          # Endpoint name of this product.
          # @return [String]
          attr_reader :endpoint_name
          # Endpoint version of this product.
          # @return [String]
          attr_reader :endpoint_version
        end

        # Builds the document-level prediction, then one page object per page that has a
        # non-empty 'prediction' entry.
        # @param prediction [Hash]
        def initialize(prediction)
          super
          @prediction = BillOfLadingV1Document.new(prediction['prediction'], nil)
          @pages = prediction['pages'].each_with_object([]) do |page, kept_pages|
            next unless page.key?('prediction')

            page_prediction = page['prediction']
            next if page_prediction.nil? || page_prediction.empty?

            kept_pages.push(BillOfLadingV1Page.new(page))
          end
        end
      end
    end
  end
end