mindee 3.12.0 → 3.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +1 -1
  3. data/CHANGELOG.md +18 -0
  4. data/README.md +23 -23
  5. data/Rakefile +5 -0
  6. data/docs/bank_account_details_v2.md +5 -1
  7. data/docs/bank_check_v1.md +6 -2
  8. data/docs/bank_statement_fr_v1.md +3 -0
  9. data/docs/barcode_reader_v1.md +5 -1
  10. data/docs/bill_of_lading_v1.md +202 -0
  11. data/docs/carte_grise_v1.md +5 -1
  12. data/docs/carte_vitale_v1.md +5 -1
  13. data/docs/code_samples/bill_of_lading_v1_async.txt +19 -0
  14. data/docs/code_samples/energy_bill_fra_v1_async.txt +19 -0
  15. data/docs/code_samples/invoices_v4_async.txt +19 -0
  16. data/docs/code_samples/nutrition_facts_v1_async.txt +19 -0
  17. data/docs/code_samples/payslip_fra_v2_async.txt +19 -0
  18. data/docs/cropper_v1.md +6 -2
  19. data/docs/custom_v1.md +5 -3
  20. data/docs/energy_bill_fra_v1.md +249 -0
  21. data/docs/eu_driver_license_v1.md +6 -2
  22. data/docs/expense_receipts_v5.md +26 -1
  23. data/docs/financial_document_v1.md +29 -1
  24. data/docs/generated_v1.md +3 -0
  25. data/docs/getting_started.md +3 -0
  26. data/docs/idcard_fr_v2.md +15 -2
  27. data/docs/international_id_v2.md +13 -1
  28. data/docs/invoice_splitter_v1.md +16 -13
  29. data/docs/invoices_v4.md +54 -21
  30. data/docs/license_plates_v1.md +5 -1
  31. data/docs/multi_receipts_detector_v1.md +5 -1
  32. data/docs/nutrition_facts_v1.md +295 -0
  33. data/docs/passport_v1.md +5 -1
  34. data/docs/payslip_fra_v2.md +218 -0
  35. data/docs/proof_of_address_v1.md +5 -1
  36. data/docs/resume_v1.md +24 -1
  37. data/docs/us_driver_license_v1.md +6 -2
  38. data/docs/us_healthcare_cards_v1.md +5 -1
  39. data/docs/us_mail_v2.md +5 -1
  40. data/docs/us_w9_v1.md +6 -2
  41. data/examples/auto_invoice_splitter_extraction.rb +43 -0
  42. data/lib/mindee/client.rb +20 -8
  43. data/lib/mindee/{image_extraction → extraction}/common/image_extractor.rb +2 -4
  44. data/lib/mindee/{image_extraction → extraction}/common.rb +1 -0
  45. data/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +55 -0
  46. data/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +111 -0
  47. data/lib/mindee/extraction/pdf_extractor.rb +4 -0
  48. data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +322 -0
  49. data/lib/mindee/extraction/tax_extractor.rb +1 -320
  50. data/lib/mindee/extraction.rb +3 -0
  51. data/lib/mindee/http/endpoint.rb +18 -6
  52. data/lib/mindee/parsing/common/api_response.rb +1 -1
  53. data/lib/mindee/parsing/common/document.rb +31 -1
  54. data/lib/mindee/parsing/common/extras/cropper_extra.rb +29 -0
  55. data/lib/mindee/parsing/common/extras/extras.rb +50 -0
  56. data/lib/mindee/parsing/common/extras/full_text_ocr_extra.rb +32 -0
  57. data/lib/mindee/parsing/common/extras.rb +5 -0
  58. data/lib/mindee/parsing/common/page.rb +5 -0
  59. data/lib/mindee/parsing/standard/base_field.rb +1 -0
  60. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1.rb +39 -0
  61. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier.rb +52 -0
  62. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_carrier_item.rb +95 -0
  63. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_consignee.rb +58 -0
  64. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_document.rb +136 -0
  65. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_notify_party.rb +58 -0
  66. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_page.rb +32 -0
  67. data/lib/mindee/product/bill_of_lading/bill_of_lading_v1_shipper.rb +58 -0
  68. data/lib/mindee/product/financial_document/financial_document_v1_line_item.rb +15 -1
  69. data/lib/mindee/product/fr/bank_account_details/bank_account_details_v2_bban.rb +4 -15
  70. data/lib/mindee/product/fr/energy_bill/energy_bill_v1.rb +41 -0
  71. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_document.rb +235 -0
  72. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_consumer.rb +48 -0
  73. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_supplier.rb +48 -0
  74. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_energy_usage.rb +97 -0
  75. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_meter_detail.rb +54 -0
  76. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_page.rb +34 -0
  77. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_subscription.rb +97 -0
  78. data/lib/mindee/product/fr/energy_bill/energy_bill_v1_taxes_and_contribution.rb +97 -0
  79. data/lib/mindee/product/fr/payslip/payslip_v2.rb +41 -0
  80. data/lib/mindee/product/fr/payslip/payslip_v2_bank_account_detail.rb +54 -0
  81. data/lib/mindee/product/fr/payslip/payslip_v2_document.rb +128 -0
  82. data/lib/mindee/product/fr/payslip/payslip_v2_employee.rb +78 -0
  83. data/lib/mindee/product/fr/payslip/payslip_v2_employer.rb +78 -0
  84. data/lib/mindee/product/fr/payslip/payslip_v2_employment.rb +72 -0
  85. data/lib/mindee/product/fr/payslip/payslip_v2_page.rb +34 -0
  86. data/lib/mindee/product/fr/payslip/payslip_v2_pay_detail.rb +100 -0
  87. data/lib/mindee/product/fr/payslip/payslip_v2_pay_period.rb +66 -0
  88. data/lib/mindee/product/fr/payslip/payslip_v2_pto.rb +56 -0
  89. data/lib/mindee/product/fr/payslip/payslip_v2_salary_detail.rb +81 -0
  90. data/lib/mindee/product/invoice/invoice_v4_line_item.rb +15 -1
  91. data/lib/mindee/product/invoice_splitter/invoice_splitter_v1_document.rb +1 -1
  92. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1.rb +39 -0
  93. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_added_sugar.rb +52 -0
  94. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_calorie.rb +52 -0
  95. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_cholesterol.rb +52 -0
  96. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_dietary_fiber.rb +52 -0
  97. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_document.rb +173 -0
  98. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_nutrient.rb +87 -0
  99. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_page.rb +32 -0
  100. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_protein.rb +52 -0
  101. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_saturated_fat.rb +52 -0
  102. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_serving_size.rb +46 -0
  103. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_sodium.rb +58 -0
  104. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_carbohydrate.rb +52 -0
  105. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_fat.rb +52 -0
  106. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_total_sugar.rb +52 -0
  107. data/lib/mindee/product/nutrition_facts_label/nutrition_facts_label_v1_trans_fat.rb +52 -0
  108. data/lib/mindee/product/receipt/receipt_v5_line_item.rb +11 -1
  109. data/lib/mindee/product/resume/resume_v1_certificate.rb +11 -1
  110. data/lib/mindee/product/resume/resume_v1_education.rb +14 -1
  111. data/lib/mindee/product/resume/resume_v1_language.rb +9 -1
  112. data/lib/mindee/product/resume/resume_v1_professional_experience.rb +15 -1
  113. data/lib/mindee/product/resume/resume_v1_social_networks_url.rb +9 -1
  114. data/lib/mindee/product/us/healthcare_card/healthcare_card_v1_copay.rb +9 -1
  115. data/lib/mindee/product/us/us_mail/us_mail_v2_recipient_address.rb +14 -1
  116. data/lib/mindee/product/us/us_mail/us_mail_v2_sender_address.rb +5 -17
  117. data/lib/mindee/product.rb +5 -1
  118. data/lib/mindee/version.rb +1 -1
  119. metadata +70 -9
  120. data/lib/mindee/image_extraction.rb +0 -4
  121. /data/lib/mindee/{image_extraction → extraction}/common/extracted_image.rb +0 -0
  122. /data/lib/mindee/{image_extraction → extraction}/multi_receipts_extractor/multi_receipts_extractor.rb +0 -0
  123. /data/lib/mindee/{image_extraction → extraction}/multi_receipts_extractor.rb +0 -0
  124. /data/lib/mindee/extraction/{ocr_extractor.rb → tax_extractor/ocr_extractor.rb} +0 -0
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Pdf Extraction Module.
5
+ module Extraction
6
+ # Namespace grouping the PDF extraction classes.
7
+ module PdfExtractor
8
+ # Pdf extraction class.
9
+ class PdfExtractor
10
# Builds an extractor around a local input source.
# A PDF input is used as-is. Any other input is passed to
# `ImageExtraction.attach_image_as_new_file` and the result is saved into an in-memory StringIO.
# NOTE(review): confirm that the `ImageExtraction` constant is still reachable from this scope.
# @param local_input [Mindee::Input::Source::LocalInputSource]
def initialize(local_input)
  @filename = local_input.filename
  @source_pdf =
    if local_input.pdf?
      local_input.io_stream
    else
      image_as_pdf = ImageExtraction.attach_image_as_new_file(local_input.io_stream)
      StringIO.new.tap { |buffer| image_as_pdf.save(buffer) }
    end
end
23
+
24
# Counts the pages of the source PDF.
# The document is opened through Mindee::PDF::PdfProcessor on every call.
# @return [Integer]
def page_count
  opened_pdf = Mindee::PDF::PdfProcessor.open_pdf(@source_pdf)
  opened_pdf.pages.size
end
29
+
30
# Builds a new PDF from a selection of pages of the source PDF.
# @param page_indexes [Array<Integer>] Indexes of the pages to keep from the original PDF.
# @return [StringIO] Presumably a buffer holding the new PDF; the value is whatever
#   Mindee::PDF::PdfProcessor.parse returns.
def cut_pages(page_indexes)
  # The explicit braces keep the options a positional Hash rather than keyword arguments.
  Mindee::PDF::PdfProcessor.parse(@source_pdf, { page_indexes: page_indexes })
end
40
+
41
# Extracts sub-documents from the source PDF, one per group of page indexes.
# Each sub-document is named "<basename>_<first page>-<last page><extension>", where the page
# numbers are the first and last index of the group plus one, zero-padded to three digits.
# @param page_indexes [Array<Array<Integer>>] Groups of zero-based page indexes; one PDF is built per group.
# @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] One extracted PDF per group, in order.
# @raise [RuntimeError] If a group is nil or empty, or if an index falls outside the document.
def extract_sub_documents(page_indexes)
  extension = File.extname(@filename)
  basename = File.basename(@filename, extension)
  total_pages = nil

  page_indexes.map do |page_index_list|
    # nil must be tested first: calling empty? on nil raises NoMethodError instead of this error.
    if page_index_list.nil? || page_index_list.empty?
      raise "Empty indexes aren't allowed for extraction #{page_index_list}"
    end

    # The source is opened a single time, and only once there is something to validate.
    total_pages ||= page_count
    page_index_list.each do |page_index|
      # Indexes are zero-based (the filename uses index + 1), so the page count itself is out of range.
      raise "Index #{page_index} is out of range." if page_index >= total_pages || page_index.negative?
    end

    first_page = format('%03d', page_index_list.first + 1)
    last_page = format('%03d', page_index_list.last + 1)
    Mindee::Extraction::PdfExtractor::ExtractedPdf.new(
      cut_pages(page_index_list),
      "#{basename}_#{first_page}-#{last_page}#{extension}"
    )
  end
end
65
+
66
# rubocop:disable Metrics/CyclomaticComplexity
# rubocop:disable Metrics/PerceivedComplexity
# Extracts invoices as complete PDFs from the document.
# Plain lists of indexes are extracted as given. Page groups yield one document per group, unless
# +strict+ is set: groups are then re-arranged according to their confidence (threshold 0.5)
# before extraction.
# NOTE(review): in strict mode a low-confidence last group is concatenated in place onto the pending
# list, which can be the +page_indexes+ array of an earlier group - confirm this mutation is intended.
# @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
# @param strict [Boolean]
# @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
# @raise [RuntimeError] If +page_indexes+ is empty.
def extract_invoices(page_indexes, strict: false)
  raise 'No indexes provided.' if page_indexes.empty?

  given_page_groups = page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
  return extract_sub_documents(page_indexes) unless given_page_groups
  return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict

  final_position = page_indexes.length - 1
  merged_groups = []
  pending = []
  page_indexes.each_with_index do |group, position|
    confidence = group.confidence
    pages = group.page_indexes

    if confidence >= 0.5 && position.zero?
      # First group, confident: it opens the pending document.
      pending = pages
    elsif confidence >= 0.5 && position < final_position
      # Confident group in the middle: close the pending document and open a new one.
      merged_groups << pending
      pending = pages
    elsif confidence < 0.5 && position == final_position
      # Unsure last group: its pages are appended to the pending document.
      pending.concat(pages)
      merged_groups << pending
    else
      merged_groups << pending
      merged_groups << pages
    end
  end
  extract_sub_documents(merged_groups)
end
# rubocop:enable Metrics/CyclomaticComplexity
# rubocop:enable Metrics/PerceivedComplexity
104
+
105
+ private
106
+
107
+ attr_reader :source_pdf, :filename
108
+ end
109
+ end
110
+ end
111
+ end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'pdf_extractor/pdf_extractor'
4
+ require_relative 'pdf_extractor/extracted_pdf'
@@ -0,0 +1,322 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'ocr_extractor'
4
+
5
+ module Mindee
6
+ module Extraction
7
+ # Tax extractor class
8
+ class TaxExtractor < OcrExtractor
9
# Selects the most relevant candidate.
# A lone candidate is returned without validation. Otherwise the valid candidate with the highest
# strictly positive score wins, the earliest one on ties; when none qualifies the first candidate
# is returned.
# @param candidates [Array<Hash>] candidates for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Hash, nil] nil only when there is no candidate at all.
def self.pick_best(candidates, tax_names)
  return nil if candidates.empty?
  return candidates[0] if candidates.size == 1

  best_position = 0
  best_score = 0
  candidates.each.with_index do |entry, position|
    next unless valid_candidate?(entry, tax_names)

    entry_score = calculate_score(entry, position)
    next unless best_score < entry_score

    best_position = position
    best_score = entry_score
  end

  candidates[best_position]
end
33
+
34
# Tells whether the code read for a candidate is one of the expected tax names.
# Both sides are downcased and passed through remove_accents before being compared.
# @param candidate [Hash] A candidate for the tax.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @return [Boolean]
def self.valid_candidate?(candidate, tax_names)
  return false if tax_names.empty? || candidate.nil?

  code = candidate['code']
  return false if code.nil?

  tax_names.any? { |tax_name| remove_accents(tax_name.downcase) == remove_accents(code.downcase) }
end
47
+
48
# [Experimental] Scores a valid candidate for a tax.
# The score starts at index + 1, then: +1 when a rate is present, -2 when that rate exceeds 100
# and a further -1 when it exceeds 30; +4 when a value is present; +1 when a base is present.
# @param candidate [Hash] A candidate for the tax.
# @param index [Integer] Position of the candidate in the list.
# @return [Integer]
def self.calculate_score(candidate, index)
  rate = candidate['rate']
  rate_points =
    if rate.nil?
      0
    else
      1 - (rate > 100 ? 2 : 0) - (rate > 30 ? 1 : 0)
    end
  value_points = candidate['value'].nil? ? 0 : 4
  base_points = candidate['base'].nil? ? 0 : 1

  index + 1 + rate_points + value_points + base_points
end
62
+
63
# Curates tax values based on simple rules to avoid improbable data.
# The code loses its trailing dots and spaces, a rate strictly between 0 and 1 is turned into a
# percentage, and the rate, base and value then go through swap_rates_if_needed,
# decimate_rates_if_needed, fix_rate and set_base_and_value.
# +found_hash+ itself may be modified along the way.
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash] A hash with the keys 'code', 'page_id', 'rate', 'base' and 'value'; all nil when
#   +found_hash+ is nil.
def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
  reconstructed_hash = { 'code' => nil, 'page_id' => nil, 'rate' => nil, 'base' => nil, 'value' => nil }
  return reconstructed_hash if found_hash.nil?

  reconstructed_hash['code'] = found_hash['code']&.sub(%r{\s*\.*\s*$}, '')
  # Keep the page the tax was found on, so that it is still available when the field is created.
  reconstructed_hash['page_id'] = found_hash['page_id']

  rate = found_hash['rate']
  # A rate such as 0.2 is read as a ratio and converted to a percentage.
  found_hash['rate'] = rate * 100 if rate && rate < 1 && rate.positive?

  found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  found_hash = decimate_rates_if_needed(found_hash)
  found_hash = fix_rate(found_hash)
  reconstructed_hash['rate'] = found_hash['rate']
  set_base_and_value(reconstructed_hash, found_hash)
end
85
+
86
# Swaps the rate with the base or the value when the rate is out of bounds.
# The base is tried first; the value is only considered when the base is not a valid percentage.
# Nothing changes when the rate is missing, within bounds, or when neither amount qualifies.
# @param found_hash [Hash] Hash of currently retrieved values (modified in place)
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Hash] The same hash.
def self.swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
  rate = found_hash['rate']
  return found_hash unless rate && (rate > max_rate_percentage || rate < min_rate_percentage)

  swap_key = %w[base value].find do |key|
    valid_percentage?(found_hash[key], min_rate_percentage, max_rate_percentage)
  end
  found_hash['rate'], found_hash[swap_key] = found_hash[swap_key], rate if swap_key
  found_hash
end
101
+
102
# Rates can't be negative if set: a present rate is replaced by its absolute value.
# @param found_hash [Hash] Hash of currently retrieved values (modified in place)
# @return [Hash] The same hash.
def self.fix_rate(found_hash)
  current_rate = found_hash['rate']
  return found_hash if current_rate.nil?

  found_hash['rate'] = current_rate.abs
  found_hash
end
108
+
109
# Moves a rate above 100 out of the rate slot when a smaller amount is available.
# Such a rate is swapped with the base if the base is set and lower than the rate; otherwise with
# the value under the same condition. Nothing changes when neither amount is lower.
# @param found_hash [Hash] Hash of currently retrieved values (modified in place)
# @return [Hash] The same hash.
def self.decimate_rates_if_needed(found_hash)
  rate = found_hash['rate']
  return found_hash unless rate && rate > 100

  %w[base value].each do |key|
    other = found_hash[key]
    next unless !other.nil? && rate > other

    found_hash['rate'] = other
    found_hash[key] = rate
    break
  end
  found_hash
end
122
+
123
# Copies the base and the value into the reconstructed hash.
# When both amounts are present and the base is lower than the value, the two are exchanged, so
# that the larger amount ends up as the base. A missing base or value is copied as nil.
# @param reconstructed_hash [Hash] Hash being reconstructed with new values (modified in place)
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Hash] The reconstructed hash.
def self.set_base_and_value(reconstructed_hash, found_hash)
  base = found_hash['base']
  value = found_hash['value']
  # Only compare when both amounts exist: comparing a number with nil raises ArgumentError.
  base, value = value, base if !base.nil? && !value.nil? && base < value

  reconstructed_hash['base'] = base
  reconstructed_hash['value'] = value
  reconstructed_hash
end
139
+
140
# Extracts a single custom type of tax.
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
# The tax is first looked for on horizontal lines; when no value is found that way, a vertical
# reconstruction is attempted. The caller's +tax_names+ list is left untouched.
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
# @param tax_names [Array<String>] list of all possible names the tax can have.
# @param min_rate_percentage [Integer] Minimum allowed rate on the tax.
# @param max_rate_percentage [Integer] Maximum allowed rate on the tax.
# @return [Mindee::Parsing::Standard::TaxField, nil]
def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_rate_percentage = 100)
  # NOTE(review): this guard gives up when ocr_result IS a Mindee::Parsing::Common::Ocr, while the
  # documented argument type is Mindee::Parsing::Common::Ocr::Ocr - confirm it is not inverted.
  return nil if ocr_result.is_a?(Mindee::Parsing::Common::Ocr) || tax_names.empty?

  # Sort a copy: the caller's list may be frozen or reused and must not be reordered as a side effect.
  sorted_names = tax_names.sort
  found_hash = pick_best(extract_horizontal_tax(ocr_result, sorted_names), sorted_names)
  # a tax is considered found horizontally if it has a value, otherwise it is vertical
  if found_hash.nil? || found_hash['value'].nil?
    found_hash = extract_vertical_tax(ocr_result, sorted_names, found_hash)
  end
  found_hash = curate_values(found_hash, min_rate_percentage, max_rate_percentage)

  return if found_hash.nil? || found_hash.empty?

  create_tax_field(found_hash)
end
163
+
164
# Creates a tax field from a given hash.
# The page passed along is the hash's 'page_id' entry, or nil when that key is absent.
# @param found_hash [Hash] Hash of currently retrieved values
# @return [Mindee::Parsing::Standard::TaxField]
def self.create_tax_field(found_hash)
  page_id = found_hash.fetch('page_id', nil)
  Mindee::Parsing::Standard::TaxField.new(found_hash, page_id)
end
173
+
174
# Extracts the rate and code, if found, from matches into the found_hash.
# With +percent_first+ the rate is read from group 1 and the code from group 2; otherwise the two
# groups are read the other way round. The '%' signs are removed from the rate before parsing.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values (modified in place)
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash] The same hash.
def self.extract_percentage_from_tax(matches, found_hash, percent_first)
  rate_group, code_group = percent_first ? [1, 2] : [2, 1]
  code_text = matches[code_group]
  rate_text = matches[rate_group]

  found_hash['code'] = code_text.strip unless code_text.nil?
  found_hash['rate'] = parse_amount(rate_text.gsub('%', '')) unless rate_text.nil?
  found_hash
end
189
+
190
# Extracts the basis and value of a tax from regex matches, independent of the order.
# Groups 3 and 4 hold the amounts: when both are present they are the base and the value, in that
# order; a single amount, whichever group it comes from, is the value.
# @param matches [MatchData] RegEx matches of the values for taxes
# @param found_hash [Hash] Hash of currently retrieved values (modified in place)
# @return [Hash] The same hash.
def self.extract_basis_and_value(matches, found_hash)
  first_amount = matches[3]
  second_amount = matches[4]

  if first_amount.nil? || second_amount.nil?
    lone_amount = first_amount.nil? ? second_amount : first_amount
    found_hash['value'] = parse_amount(lone_amount) unless lone_amount.nil?
  else
    found_hash['base'] = parse_amount(first_amount)
    found_hash['value'] = parse_amount(second_amount)
  end
  found_hash
end
211
+
212
# Extracts tax information from a horizontal line.
# The page is always recorded in the result, even when the pattern does not match the line.
# @param line [String] Line to be processed.
# @param pattern [Regexp] RegEx pattern to search the line with.
# @param page_id [Integer] Index of the page the line was read from.
# @param percent_first [Boolean] Whether the percentage was found before or after the tax name.
# @return [Hash]
def self.extract_tax_from_horizontal_line(line, pattern, page_id, percent_first)
  matches = line.match(pattern)
  line_hash = { 'page_id' => page_id }
  return line_hash if matches.nil?

  with_rate = extract_percentage_from_tax(matches, line_hash, percent_first)
  extract_basis_and_value(matches, with_rate)
end
230
+
231
# rubocop:disable Metrics/CyclomaticComplexity
# rubocop:disable Metrics/PerceivedComplexity

# Collects tax candidates from the horizontal lines of every page.
# Each line is cleaned first (some punctuation and the currency symbols removed, runs of dots and
# spaces collapsed); lines that do not contain any of the tax names are skipped. A matching line is
# read with the "name first" pattern from the position of the tax name, then with the
# "percentage first" pattern, starting at the percentage when the line contains a '%'.
# The returned list always starts with an empty placeholder candidate.
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] Processed OCR results.
# @param tax_names [Array<String>] Possible tax names candidates.
# @return [Array<Hash>]
def self.extract_horizontal_tax(ocr_result, tax_names)
  percent_first_pattern = %r{
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x
  percent_second_pattern = %r{
    ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
    ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
    ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
    ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
  }x

  candidates = [{ 'code' => nil, 'value' => nil, 'base' => nil, 'rate' => nil }]
  ocr_result.mvision_v1.pages.each_with_index do |page, page_id|
    page.all_lines.each do |line|
      without_noise = line.to_s.scrub.gsub(%r{[+(\[)\]¿?*_]}, '')
      clean_line = remove_currency_symbols(without_noise).gsub(%r{\.{2,}}, ' ').gsub(%r{ +}, ' ').strip

      name_position = match_index(clean_line, tax_names)
      next if name_position.nil?

      unless clean_line.match(percent_second_pattern).nil?
        candidates << extract_tax_from_horizontal_line(clean_line[name_position..],
                                                       percent_second_pattern, page_id, false)
      end
      next if clean_line.match(percent_first_pattern).nil?

      # With a '%' on the line, reading starts at the percentage itself.
      percent_source =
        if clean_line.include?('%')
          clean_line[clean_line.index(%r{\d*[.,]?\d* ?%})..]
        else
          clean_line
        end
      candidates << extract_tax_from_horizontal_line(percent_source, percent_first_pattern, page_id, true)
    end
  end
  candidates
end
# rubocop:enable Metrics/CyclomaticComplexity
# rubocop:enable Metrics/PerceivedComplexity
276
+
277
# Reads the amounts of a vertically reconstructed line into the found_hash.
# A single amount becomes the value when the hash has no 'value' key yet. In every other case the
# first amount fills a missing rate and the second amount a missing value.
# @param line [Mindee::Parsing::Common::Ocr::OcrLine] Reconstructed line, iterated word by word.
# @param found_hash [Hash] Hash containing previously found values, if any (modified in place).
# @return [Hash] The same hash.
def self.extract_vertical_tax_values(line, found_hash)
  amounts = []
  line.each do |word|
    amount = parse_amount(word.text)
    amounts << amount unless amount.nil?
  end

  if amounts.length == 1 && !found_hash.key?('value')
    found_hash['value'] = amounts.first
    return found_hash
  end

  found_hash['rate'] = amounts[0] if found_hash['rate'].nil?
  found_hash['value'] = amounts[1] if found_hash['value'].nil?
  found_hash
end
294
+
295
# Extracts tax data from vertically reconstructed rows.
# Every word matching one of the tax names is reconstructed vertically and read with
# extract_vertical_tax_values. The page and the code are taken from the first matching word,
# unless the given hash already holds them.
# @param ocr_result [Mindee::Parsing::Common::Ocr] OCR raw results
# @param tax_names [Array<String>] Array of possible names a tax can have
# @param found_hash [Hash, nil] Hash of currently retrieved values
# @return [Hash]
def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
  collected = found_hash.nil? ? { 'code' => nil, 'page_id' => nil } : found_hash

  ocr_result.mvision_v1.pages.each_with_index do |page, page_id|
    page.all_words.each do |word|
      next if match_index(word.text, tax_names).nil?

      column = ocr_result.reconstruct_vertically(word.polygon, page_id)
      collected['page_id'] = page_id if collected['page_id'].nil?
      collected['code'] = word.text.strip if collected['code'].nil?
      collected = extract_vertical_tax_values(column, collected)
    end
  end
  collected
end
314
+
315
# Hides the helper class methods; extract_custom_tax is the one method defined here that is left out
# of this list.
private_class_method(
  *%i[
    extract_percentage_from_tax
    extract_basis_and_value
    extract_tax_from_horizontal_line
    extract_horizontal_tax
    extract_vertical_tax_values
    extract_vertical_tax
    create_tax_field
    fix_rate
    pick_best
    calculate_score
    curate_values
    decimate_rates_if_needed
    set_base_and_value
    valid_candidate?
    swap_rates_if_needed
  ]
)
320
+ end
321
+ end
322
+ end