mindee 3.16.0 → 3.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/README.md +4 -4
  4. data/bin/mindee.rb +20 -8
  5. data/docs/code_samples/{international_id_v1_async.txt → driver_license_v1_async.txt} +1 -1
  6. data/docs/code_samples/french_healthcard_v1_async.txt +19 -0
  7. data/docs/code_samples/{carte_vitale_v1.txt → payslip_fra_v3_async.txt} +2 -2
  8. data/docs/code_samples/workflow_execution.txt +29 -0
  9. data/docs/custom_v1.md +1 -1
  10. data/docs/driver_license_v1.md +156 -0
  11. data/docs/{carte_vitale_v1.md → french_healthcard_v1.md} +14 -24
  12. data/docs/getting_started.md +5 -5
  13. data/docs/payslip_fra_v3.md +319 -0
  14. data/lib/mindee/client.rb +40 -0
  15. data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +34 -19
  16. data/lib/mindee/http/workflow_endpoint.rb +90 -0
  17. data/lib/mindee/http.rb +1 -0
  18. data/lib/mindee/input/sources/base64_input_source.rb +31 -0
  19. data/lib/mindee/input/sources/bytes_input_source.rb +21 -0
  20. data/lib/mindee/input/sources/file_input_source.rb +20 -0
  21. data/lib/mindee/input/sources/local_input_source.rb +183 -0
  22. data/lib/mindee/input/sources/path_input_source.rb +20 -0
  23. data/lib/mindee/input/sources/url_input_source.rb +127 -0
  24. data/lib/mindee/input/sources.rb +6 -248
  25. data/lib/mindee/parsing/common/api_response.rb +22 -1
  26. data/lib/mindee/parsing/common/execution.rb +73 -0
  27. data/lib/mindee/parsing/common/execution_file.rb +24 -0
  28. data/lib/mindee/parsing/common/execution_priority.rb +30 -0
  29. data/lib/mindee/parsing/common.rb +3 -0
  30. data/lib/mindee/product/{international_id/international_id_v1.rb → driver_license/driver_license_v1.rb} +9 -9
  31. data/lib/mindee/product/driver_license/driver_license_v1_document.rb +91 -0
  32. data/lib/mindee/product/{international_id/international_id_v1_page.rb → driver_license/driver_license_v1_page.rb} +7 -7
  33. data/lib/mindee/product/fr/{carte_vitale/carte_vitale_v1.rb → health_card/health_card_v1.rb} +9 -9
  34. data/lib/mindee/product/fr/{carte_vitale/carte_vitale_v1_document.rb → health_card/health_card_v1_document.rb} +6 -6
  35. data/lib/mindee/product/fr/{carte_vitale/carte_vitale_v1_page.rb → health_card/health_card_v1_page.rb} +7 -7
  36. data/lib/mindee/product/fr/payslip/payslip_v3.rb +41 -0
  37. data/lib/mindee/product/fr/payslip/payslip_v3_bank_account_detail.rb +54 -0
  38. data/lib/mindee/product/fr/payslip/payslip_v3_document.rb +166 -0
  39. data/lib/mindee/product/fr/payslip/payslip_v3_employee.rb +78 -0
  40. data/lib/mindee/product/fr/payslip/payslip_v3_employer.rb +78 -0
  41. data/lib/mindee/product/fr/payslip/payslip_v3_employment.rb +78 -0
  42. data/lib/mindee/product/fr/payslip/payslip_v3_page.rb +34 -0
  43. data/lib/mindee/product/fr/payslip/payslip_v3_paid_time_off.rb +89 -0
  44. data/lib/mindee/product/fr/payslip/payslip_v3_pay_detail.rb +100 -0
  45. data/lib/mindee/product/fr/payslip/payslip_v3_pay_period.rb +66 -0
  46. data/lib/mindee/product/fr/payslip/payslip_v3_salary_detail.rb +89 -0
  47. data/lib/mindee/product/resume/resume_v1_document.rb +1 -1
  48. data/lib/mindee/product/resume/resume_v1_page.rb +1 -1
  49. data/lib/mindee/product.rb +3 -2
  50. data/lib/mindee/version.rb +1 -1
  51. metadata +36 -14
  52. data/docs/eu_driver_license_v1.md +0 -227
  53. data/docs/proof_of_address_v1.md +0 -211
  54. data/docs/us_driver_license_v1.md +0 -272
  55. data/lib/mindee/product/international_id/international_id_v1_document.rb +0 -109
@@ -0,0 +1,319 @@
1
+ ---
2
+ title: FR Payslip OCR Ruby
3
+ category: 622b805aaec68102ea7fcbc2
4
+ slug: ruby-fr-payslip-ocr
5
+ parentDoc: 6294d97ee723f1008d2ab28e
6
+ ---
7
+ The Ruby OCR SDK supports the [Payslip API](https://platform.mindee.com/mindee/payslip_fra).
8
+
9
+ Using the [sample below](https://github.com/mindee/client-lib-test-data/blob/main/products/payslip_fra/default_sample.jpg), we are going to illustrate how to extract the data that we want using the OCR SDK.
10
+ ![Payslip sample](https://github.com/mindee/client-lib-test-data/blob/main/products/payslip_fra/default_sample.jpg?raw=true)
11
+
12
+ # Quick-Start
13
+ ```rb
14
+ require 'mindee'
15
+
16
+ # Init a new client
17
+ mindee_client = Mindee::Client.new(api_key: 'my-api-key')
18
+
19
+ # Load a file from disk
20
+ input_source = mindee_client.source_from_path('/path/to/the/file.ext')
21
+
22
+ # Parse the file
23
+ result = mindee_client.enqueue_and_parse(
24
+ input_source,
25
+ Mindee::Product::FR::Payslip::PayslipV3
26
+ )
27
+
28
+ # Print a full summary of the parsed data in RST format
29
+ puts result.document
30
+
31
+ # Print the document-level parsed data
32
+ # puts result.document.inference.prediction
33
+
34
+ ```
35
+
36
+ **Output (RST):**
37
+ ```rst
38
+ ########
39
+ Document
40
+ ########
41
+ :Mindee ID: a479e3e7-6838-4e82-9a7d-99289f34ec7f
42
+ :Filename: default_sample.jpg
43
+
44
+ Inference
45
+ #########
46
+ :Product: mindee/payslip_fra v3.0
47
+ :Rotation applied: Yes
48
+
49
+ Prediction
50
+ ==========
51
+ :Pay Period:
52
+ :End Date: 2023-03-31
53
+ :Month: 03
54
+ :Payment Date: 2023-03-29
55
+ :Start Date: 2023-03-01
56
+ :Year: 2023
57
+ :Employee:
58
+ :Address: 52 RUE DES FLEURS 33500 LIBOURNE FRANCE
59
+ :Date of Birth:
60
+ :First Name: Jean Luc
61
+ :Last Name: Picard
62
+ :Phone Number:
63
+ :Registration Number:
64
+ :Social Security Number: 123456789012345
65
+ :Employer:
66
+ :Address: 1 RUE DU TONNOT 25210 DOUBS
67
+ :Company ID: 12345678901234
68
+ :Company Site:
69
+ :NAF Code: 1234A
70
+ :Name: DEMO COMPANY
71
+ :Phone Number:
72
+ :URSSAF Number:
73
+ :Bank Account Details:
74
+ :Bank Name:
75
+ :IBAN:
76
+ :SWIFT:
77
+ :Employment:
78
+ :Category: Cadre
79
+ :Coefficient: 600,000
80
+ :Collective Agreement: Construction -- Promotion
81
+ :Job Title: Directeur Régional du Développement
82
+ :Position Level: Niveau 5 Echelon 3
83
+ :Seniority Date:
84
+ :Start Date: 2022-05-01
85
+ :Salary Details:
86
+ +--------------+-----------+--------------------------------------+--------+-----------+
87
+ | Amount | Base | Description | Number | Rate |
88
+ +==============+===========+======================================+========+===========+
89
+ | 6666.67 | | Salaire de base | | |
90
+ +--------------+-----------+--------------------------------------+--------+-----------+
91
+ | 9.30 | | Part patronale Mutuelle NR | | |
92
+ +--------------+-----------+--------------------------------------+--------+-----------+
93
+ | 508.30 | | Avantages en nature voiture | | |
94
+ +--------------+-----------+--------------------------------------+--------+-----------+
95
+ :Pay Detail:
96
+ :Gross Salary: 7184.27
97
+ :Gross Salary YTD: 18074.81
98
+ :Income Tax Rate: 17.60
99
+ :Income Tax Withheld: 1030.99
100
+ :Net Paid: 3868.32
101
+ :Net Paid Before Tax: 4899.31
102
+ :Net Taxable: 5857.90
103
+ :Net Taxable YTD: 14752.73
104
+ :Total Cost Employer: 10486.94
105
+ :Total Taxes and Deductions: 1650.36
106
+ :Paid Time Off:
107
+ +-----------+--------+-------------+-----------+-----------+
108
+ | Accrued | Period | Type | Remaining | Used |
109
+ +===========+========+=============+===========+===========+
110
+ | | N-1 | VACATION | | |
111
+ +-----------+--------+-------------+-----------+-----------+
112
+ | 6.17 | N | VACATION | 6.17 | |
113
+ +-----------+--------+-------------+-----------+-----------+
114
+ | 2.01 | N | RTT | 2.01 | |
115
+ +-----------+--------+-------------+-----------+-----------+
116
+ ```
117
+
118
+ # Field Types
119
+ ## Standard Fields
120
+ These fields are generic and used in several products.
121
+
122
+ ### Basic Field
123
+ Each prediction object contains a set of fields that inherit from the generic `Field` class.
124
+ A typical `Field` object will have the following attributes:
125
+
126
+ * **value** (`String`, `Float`, `Integer`, `Boolean`): corresponds to the field value. Can be `nil` if no value was extracted.
127
+ * **confidence** (`Float`, `nil`): the confidence score of the field prediction.
128
+ * **bounding_box** (`Mindee::Geometry::Quadrilateral`, `nil`): contains the relative coordinates of exactly 4 vertices (points) of a right rectangle containing the field in the document.
129
+ * **polygon** (`Mindee::Geometry::Polygon`, `nil`): contains the relative coordinates of the vertices (`Point`) of a polygon containing the field in the image.
130
+ * **page_id** (`Integer`, `nil`): the ID of the page, always `nil` when at document-level.
131
+ * **reconstructed** (`Boolean`): indicates whether an object was reconstructed (not extracted as the API gave it).
132
+
133
+
134
+ Aside from the previous attributes, all basic fields have access to a `to_s` method that can be used to print their value as a string.
135
+
136
+ ## Specific Fields
137
+ Fields which are specific to this product; they are not used in any other product.
138
+
139
+ ### Bank Account Details Field
140
+ Information about the employee's bank account.
141
+
142
+ A `PayslipV3BankAccountDetail` implements the following attributes:
143
+
144
+ * `bank_name` (String): The name of the bank.
145
+ * `iban` (String): The IBAN of the bank account.
146
+ * `swift` (String): The SWIFT code of the bank.
147
+ Fields which are specific to this product; they are not used in any other product.
148
+
149
+ ### Employee Field
150
+ Information about the employee.
151
+
152
+ A `PayslipV3Employee` implements the following attributes:
153
+
154
+ * `address` (String): The address of the employee.
155
+ * `date_of_birth` (String): The date of birth of the employee.
156
+ * `first_name` (String): The first name of the employee.
157
+ * `last_name` (String): The last name of the employee.
158
+ * `phone_number` (String): The phone number of the employee.
159
+ * `registration_number` (String): The registration number of the employee.
160
+ * `social_security_number` (String): The social security number of the employee.
161
+ Fields which are specific to this product; they are not used in any other product.
162
+
163
+ ### Employer Field
164
+ Information about the employer.
165
+
166
+ A `PayslipV3Employer` implements the following attributes:
167
+
168
+ * `address` (String): The address of the employer.
169
+ * `company_id` (String): The company ID of the employer.
170
+ * `company_site` (String): The site of the company.
171
+ * `naf_code` (String): The NAF code of the employer.
172
+ * `name` (String): The name of the employer.
173
+ * `phone_number` (String): The phone number of the employer.
174
+ * `urssaf_number` (String): The URSSAF number of the employer.
175
+ Fields which are specific to this product; they are not used in any other product.
176
+
177
+ ### Employment Field
178
+ Information about the employment.
179
+
180
+ A `PayslipV3Employment` implements the following attributes:
181
+
182
+ * `category` (String): The category of the employment.
183
+ * `coefficient` (String): The coefficient of the employment.
184
+ * `collective_agreement` (String): The collective agreement of the employment.
185
+ * `job_title` (String): The job title of the employee.
186
+ * `position_level` (String): The position level of the employment.
187
+ * `seniority_date` (String): The seniority date of the employment.
188
+ * `start_date` (String): The start date of the employment.
189
+ Fields which are specific to this product; they are not used in any other product.
190
+
191
+ ### Paid Time Off Field
192
+ Information about paid time off.
193
+
194
+ A `PayslipV3PaidTimeOff` implements the following attributes:
195
+
196
+ * `accrued` (Float): The amount of paid time off accrued in the period.
197
+ * `period` (String): The paid time off period.
198
+
199
+ #### Possible values include:
200
+ - N
201
+ - N-1
202
+ - N-2
203
+
204
+ * `pto_type` (String): The type of paid time off.
205
+
206
+ #### Possible values include:
207
+ - VACATION
208
+ - RTT
209
+ - COMPENSATORY
210
+
211
+ * `remaining` (Float): The remaining amount of paid time off at the end of the period.
212
+ * `used` (Float): The amount of paid time off used in the period.
213
+ Fields which are specific to this product; they are not used in any other product.
214
+
215
+ ### Pay Detail Field
216
+ Detailed information about the pay.
217
+
218
+ A `PayslipV3PayDetail` implements the following attributes:
219
+
220
+ * `gross_salary` (Float): The gross salary of the employee.
221
+ * `gross_salary_ytd` (Float): The year-to-date gross salary of the employee.
222
+ * `income_tax_rate` (Float): The income tax rate of the employee.
223
+ * `income_tax_withheld` (Float): The income tax withheld from the employee's pay.
224
+ * `net_paid` (Float): The net paid amount of the employee.
225
+ * `net_paid_before_tax` (Float): The net paid amount before tax of the employee.
226
+ * `net_taxable` (Float): The net taxable amount of the employee.
227
+ * `net_taxable_ytd` (Float): The year-to-date net taxable amount of the employee.
228
+ * `total_cost_employer` (Float): The total cost to the employer.
229
+ * `total_taxes_and_deductions` (Float): The total taxes and deductions of the employee.
230
+ Fields which are specific to this product; they are not used in any other product.
231
+
232
+ ### Pay Period Field
233
+ Information about the pay period.
234
+
235
+ A `PayslipV3PayPeriod` implements the following attributes:
236
+
237
+ * `end_date` (String): The end date of the pay period.
238
+ * `month` (String): The month of the pay period.
239
+ * `payment_date` (String): The date of payment for the pay period.
240
+ * `start_date` (String): The start date of the pay period.
241
+ * `year` (String): The year of the pay period.
242
+ Fields which are specific to this product; they are not used in any other product.
243
+
244
+ ### Salary Details Field
245
+ Detailed information about the earnings.
246
+
247
+ A `PayslipV3SalaryDetail` implements the following attributes:
248
+
249
+ * `amount` (Float): The amount of the earning.
250
+ * `base` (Float): The base rate value of the earning.
251
+ * `description` (String): The description of the earnings.
252
+ * `number` (Float): The number of units in the earning.
253
+ * `rate` (Float): The rate of the earning.
254
+
255
+ # Attributes
256
+ The following fields are extracted for Payslip V3:
257
+
258
+ ## Bank Account Details
259
+ **bank_account_details** ([PayslipV3BankAccountDetail](#bank-account-details-field)): Information about the employee's bank account.
260
+
261
+ ```rb
262
+ puts result.document.inference.prediction.bank_account_details.value
263
+ ```
264
+
265
+ ## Employee
266
+ **employee** ([PayslipV3Employee](#employee-field)): Information about the employee.
267
+
268
+ ```rb
269
+ puts result.document.inference.prediction.employee.value
270
+ ```
271
+
272
+ ## Employer
273
+ **employer** ([PayslipV3Employer](#employer-field)): Information about the employer.
274
+
275
+ ```rb
276
+ puts result.document.inference.prediction.employer.value
277
+ ```
278
+
279
+ ## Employment
280
+ **employment** ([PayslipV3Employment](#employment-field)): Information about the employment.
281
+
282
+ ```rb
283
+ puts result.document.inference.prediction.employment.value
284
+ ```
285
+
286
+ ## Paid Time Off
287
+ **paid_time_off** (Array<[PayslipV3PaidTimeOff](#paid-time-off-field)>): Information about paid time off.
288
+
289
+ ```rb
290
+ for paid_time_off_elem in result.document.inference.prediction.paid_time_off do
291
+ puts paid_time_off_elem.value
292
+ end
293
+ ```
294
+
295
+ ## Pay Detail
296
+ **pay_detail** ([PayslipV3PayDetail](#pay-detail-field)): Detailed information about the pay.
297
+
298
+ ```rb
299
+ puts result.document.inference.prediction.pay_detail.value
300
+ ```
301
+
302
+ ## Pay Period
303
+ **pay_period** ([PayslipV3PayPeriod](#pay-period-field)): Information about the pay period.
304
+
305
+ ```rb
306
+ puts result.document.inference.prediction.pay_period.value
307
+ ```
308
+
309
+ ## Salary Details
310
+ **salary_details** (Array<[PayslipV3SalaryDetail](#salary-details-field)>): Detailed information about the earnings.
311
+
312
+ ```rb
313
+ for salary_details_elem in result.document.inference.prediction.salary_details do
314
+ puts salary_details_elem.value
315
+ end
316
+ ```
317
+
318
+ # Questions?
319
+ [Join our Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-2d0ds7dtz-DPAF81ZqTy20chsYpQBW5g)
data/lib/mindee/client.rb CHANGED
@@ -195,6 +195,46 @@ module Mindee
195
195
 
196
196
  # rubocop:enable Metrics/ParameterLists
197
197
 
198
+ # Sends a document to a workflow.
199
+ #
200
+ # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
201
+ # @param document_alias [String, nil] Alias to give to the document.
202
+ # @param priority [Symbol, nil] Priority to give to the document.
203
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
204
+ # This performs a full OCR operation on the server and may increase response time.
205
+ #
206
+ # @param public_url [String, nil] A unique, encrypted URL for accessing the document validation interface without
207
+ # requiring authentication.
208
+ # @param page_options [Hash, nil] Page cutting/merge options:
209
+ #
210
+ # * `:page_indexes` Zero-based list of page indexes.
211
+ # * `:operation` Operation to apply on the document, given the `page_indexes` specified:
212
+ # * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
213
+ # * `:REMOVE` - remove the specified pages, and keep all others.
214
+ # * `:on_min_pages` Apply the operation only if document has at least this many pages.
215
+ #
216
+ #
217
+ # @return [Mindee::Parsing::Common::WorkflowResponse]
218
+ def execute_workflow(
219
+ input_source,
220
+ workflow_id,
221
+ document_alias: nil,
222
+ priority: nil,
223
+ full_text: false,
224
+ public_url: nil,
225
+ page_options: nil
226
+ )
227
+ if input_source.is_a?(Mindee::Input::Source::LocalInputSource) && !page_options.nil? && input_source.pdf?
228
+ input_source.process_pdf(page_options)
229
+ end
230
+
231
+ workflow_endpoint = Mindee::HTTP::WorkflowEndpoint.new(workflow_id, api_key: @api_key)
232
+ prediction, raw_http = workflow_endpoint.execute_workflow(input_source, full_text, document_alias, priority,
233
+ public_url)
234
+ Mindee::Parsing::Common::WorkflowResponse.new(Product::Generated::GeneratedV1,
235
+ prediction, raw_http)
236
+ end
237
+
198
238
  # Load a prediction.
199
239
  #
200
240
  # @param product_class [Mindee::Inference] class of the product
@@ -2,6 +2,8 @@
2
2
 
3
3
  require_relative 'ocr_extractor'
4
4
 
5
+ # rubocop:disable Metrics/ClassLength
6
+
5
7
  module Mindee
6
8
  module Extraction
7
9
  # Tax extractor class
@@ -72,9 +74,12 @@ module Mindee
72
74
  reconstructed_hash['code'] =
73
75
  found_hash['code'].nil? ? found_hash['code'] : found_hash['code'].sub(%r{\s*\.*\s*$}, '')
74
76
 
75
- if found_hash['rate'] && found_hash['rate'] < 1 && (found_hash['rate']).positive?
76
- found_hash['rate'] =
77
- found_hash['rate'] * 100
77
+ if found_hash['rate']
78
+ if found_hash['rate'].abs < 1
79
+ found_hash['rate'] *= 10
80
+ elsif found_hash['rate'].abs > 100
81
+ found_hash['rate'] /= 10
82
+ end
78
83
  end
79
84
  found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
80
85
  found_hash = decimate_rates_if_needed(found_hash)
@@ -125,18 +130,28 @@ module Mindee
125
130
  # @param found_hash [Hash] Hash of currently retrieved values
126
131
  # @return [Hash]
127
132
  def self.set_base_and_value(reconstructed_hash, found_hash)
128
- if found_hash['base'].nil?
129
- reconstructed_hash['base'] = found_hash['base']
130
- reconstructed_hash['value'] = found_hash['value']
131
- elsif found_hash['value'].nil? && found_hash['base'] < found_hash['value']
132
- reconstructed_hash['base'] = found_hash['value']
133
- reconstructed_hash['value'] = found_hash['base']
134
- else
135
- reconstructed_hash['value'] = found_hash['value']
133
+ base = found_hash['base']
134
+ value = found_hash['value']
135
+
136
+ if base && value
137
+ reconstructed_hash['base'], reconstructed_hash['value'] = [base, value].minmax
138
+ elsif base
139
+ reconstructed_hash['base'] = base
140
+ elsif value
141
+ reconstructed_hash['value'] = value
142
+ calculate_base(reconstructed_hash)
136
143
  end
144
+
137
145
  reconstructed_hash
138
146
  end
139
147
 
148
+ def self.calculate_base(hash)
149
+ rate = hash['rate']
150
+ return unless rate&.positive?
151
+
152
+ hash['base'] = hash['value'] / (rate / 100.0)
153
+ end
154
+
140
155
  # Extracts a single custom type of tax.
141
156
  # For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
142
157
  # @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
@@ -149,7 +164,6 @@ module Mindee
149
164
 
150
165
  tax_names.sort!
151
166
  found_hash = pick_best(extract_horizontal_tax(ocr_result, tax_names), tax_names)
152
- # a tax is considered found horizontally if it has a value, otherwise it is vertical
153
167
  if found_hash.nil? || found_hash['value'].nil?
154
168
  found_hash = extract_vertical_tax(ocr_result, tax_names,
155
169
  found_hash)
@@ -240,14 +254,14 @@ module Mindee
240
254
  linear_pattern_percent_first = %r{
241
255
  ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
242
256
  ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
243
- ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
244
- ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
257
+ ((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
258
+ ((?:\s*-\s*)?(\d*[.,])*\d+)?
245
259
  }x
246
260
  linear_pattern_percent_second = %r{
247
261
  ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
248
262
  ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
249
- ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
250
- ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
263
+ ((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
264
+ ((?:\s*-\s*)?(\d*[.,])*\d+)?
251
265
  }x
252
266
  ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
253
267
  page.all_lines.each do |line|
@@ -304,7 +318,7 @@ module Mindee
304
318
  page.all_words.each do |word|
305
319
  next if match_index(word.text, tax_names).nil?
306
320
 
307
- reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id)
321
+ reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id, 0.25)
308
322
  found_hash['page_id'] = page_id if found_hash['page_id'].nil?
309
323
  found_hash['code'] = word.text.strip if found_hash['code'].nil?
310
324
  found_hash = extract_vertical_tax_values(reconstructed_line, found_hash)
@@ -316,8 +330,9 @@ module Mindee
316
330
  private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
317
331
  :extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
318
332
  :create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
319
- :decimate_rates_if_needed, :extract_basis_and_value, :set_base_and_value, :valid_candidate?,
320
- :swap_rates_if_needed
333
+ :decimate_rates_if_needed, :set_base_and_value, :valid_candidate?,
334
+ :swap_rates_if_needed, :calculate_base
321
335
  end
322
336
  end
323
337
  end
338
+ # rubocop:enable Metrics/ClassLength
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'net/http'
5
+ require_relative 'error'
6
+
7
+ module Mindee
8
+ module HTTP
9
+ # Handles the routing for workflow calls.
10
+ class WorkflowEndpoint
11
+ # @return [String]
12
+ attr_reader :api_key
13
+ # @return [Integer]
14
+ attr_reader :request_timeout
15
+ # @return [String]
16
+ attr_reader :url
17
+
18
+ def initialize(workflow_id, api_key: '')
19
+ @request_timeout = ENV.fetch(REQUEST_TIMEOUT_ENV_NAME, TIMEOUT_DEFAULT).to_i
20
+ @api_key = api_key.nil? || api_key.empty? ? ENV.fetch(API_KEY_ENV_NAME, API_KEY_DEFAULT) : api_key
21
+ base_url = ENV.fetch(BASE_URL_ENV_NAME, BASE_URL_DEFAULT)
22
+ @url = "#{base_url.chomp('/')}/workflows/#{workflow_id}/executions"
23
+ end
24
+
25
+ # Sends a document to the workflow.
26
+ # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
27
+ # @param document_alias [String, nil] Alias to give to the document.
28
+ # @param priority [Symbol, nil] Priority to give to the document.
29
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
30
+ # @param public_url [String, nil] A unique, encrypted URL for accessing the document validation interface without
31
+ # requiring authentication.
32
+ # @return [Array]
33
+ def execute_workflow(input_source, full_text, document_alias, priority, public_url)
34
+ check_api_key
35
+ response = workflow_execution_req_post(input_source, document_alias, priority, full_text, public_url)
36
+ hashed_response = JSON.parse(response.body, object_class: Hash)
37
+ return [hashed_response, response.body] if ResponseValidation.valid_async_response?(response)
38
+
39
+ ResponseValidation.clean_request!(response)
40
+ error = Error.handle_error(@url_name, response)
41
+ raise error
42
+ end
43
+
44
+ # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
45
+ # @param document_alias [String, nil] Alias to give to the document.
46
+ # @param priority [Symbol, nil] Priority to give to the document.
47
+ # @param full_text [Boolean] Whether to include the full OCR text response in compatible APIs.
48
+ # @param public_url [String, nil] A unique, encrypted URL for accessing the document validation interface without
49
+ # requiring authentication.
50
+ # @return [Net::HTTPResponse, nil]
51
+ def workflow_execution_req_post(input_source, document_alias, priority, full_text, public_url)
52
+ uri = URI(@url)
53
+ params = {}
54
+ params[:full_text_ocr] = 'true' if full_text
55
+ uri.query = URI.encode_www_form(params)
56
+
57
+ headers = {
58
+ 'Authorization' => "Token #{@api_key}",
59
+ 'User-Agent' => USER_AGENT,
60
+ }
61
+ req = Net::HTTP::Post.new(uri, headers)
62
+ form_data = if input_source.is_a?(Mindee::Input::Source::UrlInputSource)
63
+ [['document', input_source.url]]
64
+ else
65
+ [input_source.read_document]
66
+ end
67
+ form_data.push ['alias', document_alias] if document_alias
68
+ form_data.push ['public_url', public_url] if public_url
69
+ form_data.push ['priority', priority.to_s] if priority
70
+
71
+ req.set_form(form_data, 'multipart/form-data')
72
+
73
+ response = nil
74
+ Net::HTTP.start(uri.hostname, uri.port, use_ssl: true, read_timeout: @request_timeout) do |http|
75
+ response = http.request(req)
76
+ end
77
+ response
78
+ end
79
+
80
+ # Checks API key
81
+ def check_api_key
82
+ return unless @api_key.nil? || @api_key.empty?
83
+
84
+ raise "Missing API key. Check your Client Configuration.\n" \
85
+ 'You can set this using the ' \
86
+ "'#{HTTP::API_KEY_ENV_NAME}' environment variable."
87
+ end
88
+ end
89
+ end
90
+ end
data/lib/mindee/http.rb CHANGED
@@ -2,3 +2,4 @@
2
2
 
3
3
  require_relative 'http/endpoint'
4
4
  require_relative 'http/error'
5
+ require_relative 'http/workflow_endpoint'
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ module Mindee
6
+ module Input
7
+ module Source
8
+ # Load a document from a base64 string.
9
+ class Base64InputSource < LocalInputSource
10
+ # @param base64_string [String]
11
+ # @param filename [String]
12
+ # @param fix_pdf [Boolean]
13
+ def initialize(base64_string, filename, fix_pdf: false)
14
+ io_stream = StringIO.new(base64_string.unpack1('m*'))
15
+ io_stream.set_encoding Encoding::BINARY
16
+ super(io_stream, filename, fix_pdf: fix_pdf)
17
+ end
18
+
19
+ # Override of the parent method to prevent a base64 from being re-encoded.
20
+ # @param close [Boolean]
21
+ # @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
22
+ def read_document(close: true)
23
+ @io_stream.seek(0)
24
+ data = @io_stream.read
25
+ @io_stream.close if close
26
+ ['document', [data].pack('m'), { filename: Source.convert_to_unicode_escape(@filename) }]
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ module Mindee
6
+ module Input
7
+ module Source
8
+ # Load a document from raw bytes.
9
+ class BytesInputSource < LocalInputSource
10
+ # @param raw_bytes [String]
11
+ # @param filename [String]
12
+ # @param fix_pdf [Boolean]
13
+ def initialize(raw_bytes, filename, fix_pdf: false)
14
+ io_stream = StringIO.new(raw_bytes)
15
+ io_stream.set_encoding Encoding::BINARY
16
+ super(io_stream, filename, fix_pdf: fix_pdf)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ module Mindee
6
+ module Input
7
+ module Source
8
+ # Load a document from a file handle.
9
+ class FileInputSource < LocalInputSource
10
+ # @param input_file [File]
11
+ # @param filename [String]
12
+ # @param fix_pdf [Boolean]
13
+ def initialize(input_file, filename, fix_pdf: false)
14
+ io_stream = input_file
15
+ super(io_stream, filename, fix_pdf: fix_pdf)
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end