mindee 3.1.1 → 3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/README.md +57 -7
- data/bin/mindee.rb +160 -83
- data/docs/bank_account_details_v2.md +137 -0
- data/docs/bank_check_v1.md +179 -0
- data/docs/barcode_reader_v1.md +104 -0
- data/docs/carte_vitale_v1.md +123 -0
- data/docs/code_samples/barcode_reader_v1.txt +19 -0
- data/docs/code_samples/cropper_v1.txt +16 -0
- data/docs/code_samples/idcard_fr_v2.txt +19 -0
- data/docs/code_samples/invoice_splitter_v1_async.txt +6 -54
- data/docs/code_samples/multi_receipts_detector_v1.txt +19 -0
- data/docs/code_samples/us_w9_v1.txt +16 -0
- data/docs/cropper_v1.md +97 -0
- data/docs/custom_v1.md +101 -0
- data/docs/expense_receipts_v5.md +306 -0
- data/docs/financial_document_v1.md +384 -0
- data/docs/{ruby-getting-started.md → getting_started.md} +22 -6
- data/docs/idcard_fr_v2.md +253 -0
- data/docs/invoice_splitter_v1.md +85 -0
- data/docs/invoices_v4.md +369 -0
- data/docs/license_plates_v1.md +91 -0
- data/docs/multi_receipts_detector_v1.md +105 -0
- data/docs/passport_v1.md +186 -0
- data/docs/proof_of_address_v1.md +207 -0
- data/docs/us_driver_license_v1.md +268 -0
- data/docs/us_w9_v1.md +207 -0
- data/lib/mindee/client.rb +95 -16
- data/lib/mindee/geometry/quadrilateral.rb +5 -0
- data/lib/mindee/http/.rubocop.yml +8 -0
- data/lib/mindee/http/endpoint.rb +14 -6
- data/lib/mindee/http/error.rb +104 -0
- data/lib/mindee/http.rb +1 -0
- data/lib/mindee/input/sources.rb +83 -14
- data/lib/mindee/parsing/common/api_response.rb +11 -1
- data/lib/mindee/parsing/common/inference.rb +2 -2
- data/lib/mindee/parsing/common/ocr/ocr.rb +1 -0
- data/lib/mindee/parsing/common.rb +0 -1
- data/lib/mindee/parsing/standard/company_registration_field.rb +1 -1
- data/lib/mindee/parsing/standard/locale_field.rb +1 -1
- data/lib/mindee/parsing/standard/payment_details_field.rb +1 -1
- data/lib/mindee/parsing/standard/position_field.rb +10 -3
- data/lib/mindee/parsing/standard/{text_field.rb → string_field.rb} +1 -1
- data/lib/mindee/parsing/standard.rb +1 -1
- data/lib/mindee/pdf/pdf_processing.rb +2 -1
- data/lib/mindee/product/barcode_reader/barcode_reader_v1.rb +37 -0
- data/lib/mindee/product/barcode_reader/barcode_reader_v1_document.rb +44 -0
- data/lib/mindee/product/barcode_reader/barcode_reader_v1_page.rb +32 -0
- data/lib/mindee/product/cropper/cropper_v1.rb +37 -0
- data/lib/mindee/product/cropper/cropper_v1_document.rb +13 -0
- data/lib/mindee/product/cropper/cropper_v1_page.rb +49 -0
- data/lib/mindee/product/custom/custom_v1.rb +1 -0
- data/lib/mindee/product/eu/license_plate/license_plate_v1.rb +1 -0
- data/lib/mindee/product/eu/license_plate/license_plate_v1_document.rb +2 -2
- data/lib/mindee/product/financial_document/financial_document_v1.rb +1 -0
- data/lib/mindee/product/financial_document/financial_document_v1_document.rb +24 -24
- data/lib/mindee/product/fr/bank_account_details/bank_account_details_v1.rb +1 -0
- data/lib/mindee/product/fr/bank_account_details/bank_account_details_v1_document.rb +6 -6
- data/lib/mindee/product/fr/bank_account_details/bank_account_details_v2.rb +1 -0
- data/lib/mindee/product/fr/bank_account_details/bank_account_details_v2_document.rb +6 -6
- data/lib/mindee/product/fr/carte_vitale/carte_vitale_v1.rb +1 -0
- data/lib/mindee/product/fr/carte_vitale/carte_vitale_v1_document.rb +6 -6
- data/lib/mindee/product/fr/id_card/id_card_v1.rb +1 -0
- data/lib/mindee/product/fr/id_card/id_card_v1_document.rb +16 -16
- data/lib/mindee/product/fr/id_card/id_card_v2.rb +39 -0
- data/lib/mindee/product/fr/id_card/id_card_v2_document.rb +107 -0
- data/lib/mindee/product/fr/id_card/id_card_v2_page.rb +53 -0
- data/lib/mindee/product/invoice/invoice_v4.rb +1 -0
- data/lib/mindee/product/invoice/invoice_v4_document.rb +24 -24
- data/lib/mindee/product/invoice_splitter/invoice_splitter_v1.rb +1 -0
- data/lib/mindee/product/invoice_splitter/invoice_splitter_v1_document.rb +5 -3
- data/lib/mindee/product/multi_receipts_detector/multi_receipts_detector_v1.rb +37 -0
- data/lib/mindee/product/multi_receipts_detector/multi_receipts_detector_v1_document.rb +35 -0
- data/lib/mindee/product/multi_receipts_detector/multi_receipts_detector_v1_page.rb +32 -0
- data/lib/mindee/product/passport/passport_v1.rb +1 -0
- data/lib/mindee/product/passport/passport_v1_document.rb +16 -16
- data/lib/mindee/product/proof_of_address/proof_of_address_v1.rb +1 -0
- data/lib/mindee/product/proof_of_address/proof_of_address_v1_document.rb +14 -14
- data/lib/mindee/product/receipt/receipt_v4_document.rb +6 -6
- data/lib/mindee/product/receipt/receipt_v5.rb +1 -0
- data/lib/mindee/product/receipt/receipt_v5_document.rb +12 -12
- data/lib/mindee/product/us/bank_check/bank_check_v1.rb +1 -0
- data/lib/mindee/product/us/bank_check/bank_check_v1_document.rb +8 -8
- data/lib/mindee/product/us/driver_license/driver_license_v1.rb +1 -0
- data/lib/mindee/product/us/driver_license/driver_license_v1_document.rb +28 -28
- data/lib/mindee/product/us/w9/w9_v1.rb +39 -0
- data/lib/mindee/product/us/w9/w9_v1_document.rb +15 -0
- data/lib/mindee/product/us/w9/w9_v1_page.rb +102 -0
- data/lib/mindee/product.rb +5 -0
- data/lib/mindee/version.rb +5 -1
- data/lib/mindee.rb +47 -0
- metadata +43 -9
- data/docs/ruby-api-builder.md +0 -123
- data/docs/ruby-invoice-ocr.md +0 -271
- data/docs/ruby-passport-ocr.md +0 -165
- data/docs/ruby-receipt-ocr.md +0 -196
- data/lib/mindee/parsing/common/error.rb +0 -24
data/docs/us_w9_v1.md
ADDED
@@ -0,0 +1,207 @@
|
|
1
|
+
---
|
2
|
+
title: US W9 OCR Ruby
|
3
|
+
---
|
4
|
+
The Ruby OCR SDK supports the [US W9 API](https://platform.mindee.com/mindee/us_w9).
|
5
|
+
|
6
|
+
Using the [sample below](https://github.com/mindee/client-lib-test-data/blob/main/products/us_w9/default_sample.jpg), we are going to illustrate how to extract the data that we want using the OCR SDK.
|
7
|
+
![US W9 sample](https://github.com/mindee/client-lib-test-data/blob/main/products/us_w9/default_sample.jpg?raw=true)
|
8
|
+
|
9
|
+
# Quick-Start
|
10
|
+
```rb
|
11
|
+
require 'mindee'
|
12
|
+
|
13
|
+
# Init a new client
|
14
|
+
mindee_client = Mindee::Client.new(api_key: 'my-api-key')
|
15
|
+
|
16
|
+
# Load a file from disk
|
17
|
+
input_source = mindee_client.source_from_path('/path/to/the/file.ext')
|
18
|
+
|
19
|
+
# Parse the file
|
20
|
+
result = mindee_client.parse(
|
21
|
+
input_source,
|
22
|
+
Mindee::Product::US::W9::W9V1
|
23
|
+
)
|
24
|
+
|
25
|
+
# Print a full summary of the parsed data in RST format
|
26
|
+
puts result.document
|
27
|
+
```
|
28
|
+
|
29
|
+
**Output (RST):**
|
30
|
+
```rst
|
31
|
+
########
|
32
|
+
Document
|
33
|
+
########
|
34
|
+
:Mindee ID: d7c5b25f-e0d3-4491-af54-6183afa1aaab
|
35
|
+
:Filename: default_sample.jpg
|
36
|
+
|
37
|
+
Inference
|
38
|
+
#########
|
39
|
+
:Product: mindee/us_w9 v1.0
|
40
|
+
:Rotation applied: Yes
|
41
|
+
|
42
|
+
Prediction
|
43
|
+
==========
|
44
|
+
|
45
|
+
Page Predictions
|
46
|
+
================
|
47
|
+
|
48
|
+
Page 0
|
49
|
+
------
|
50
|
+
:Name: Stephen W Hawking
|
51
|
+
:SSN: 560758145
|
52
|
+
:Address: Somewhere In Milky Way
|
53
|
+
:City State Zip: Probably Still At Cambridge P O Box CB1
|
54
|
+
:Business Name:
|
55
|
+
:EIN: 942203664
|
56
|
+
:Tax Classification: individual
|
57
|
+
:Tax Classification Other Details:
|
58
|
+
:W9 Revision Date: august 2013
|
59
|
+
:Signature Position: Polygon with 4 points.
|
60
|
+
:Signature Date Position:
|
61
|
+
:Tax Classification LLC:
|
62
|
+
```
|
63
|
+
|
64
|
+
# Field Types
|
65
|
+
## Standard Fields
|
66
|
+
These fields are generic and used in several products.
|
67
|
+
|
68
|
+
### Basic Field
|
69
|
+
Each prediction object contains a set of fields that inherit from the generic `Field` class.
|
70
|
+
A typical `Field` object will have the following attributes:
|
71
|
+
|
72
|
+
* **value** (`String`, `Float`, `Integer`, `Boolean`): corresponds to the field value. Can be `nil` if no value was extracted.
|
73
|
+
* **confidence** (`Float`, `nil`): the confidence score of the field prediction.
|
74
|
+
* **bounding_box** (`Mindee::Geometry::Quadrilateral`, `nil`): contains exactly 4 relative vertices (points) coordinates of a right rectangle containing the field in the document.
|
75
|
+
* **polygon** (`Mindee::Geometry::Polygon`, `nil`): contains the relative vertices coordinates (`Point`) of a polygon containing the field in the image.
|
76
|
+
* **page_id** (`Integer`, `nil`): the ID of the page, is `nil` when at document-level.
|
77
|
+
* **reconstructed** (`Boolean`): indicates whether or not an object was reconstructed (not extracted as the API gave it).
|
78
|
+
|
79
|
+
|
80
|
+
Aside from the previous attributes, all basic fields have access to a `to_s` method that can be used to print their value as a string.
|
81
|
+
|
82
|
+
|
83
|
+
### Position Field
|
84
|
+
The position field `PositionField` does not implement all the basic `Field` attributes, only **bounding_box**, **polygon** and **page_id**. On top of these, it has access to:
|
85
|
+
|
86
|
+
* **rectangle** (`Mindee::Geometry::Quadrilateral`): a Polygon with four points that may be oriented (even beyond canvas).
|
87
|
+
* **quadrangle** (`Mindee::Geometry::Quadrilateral`): a free polygon made up of four points.
|
88
|
+
|
89
|
+
### String Field
|
90
|
+
The string field `StringField` only has one constraint: its **value** is a `String` (or `nil`).
|
91
|
+
|
92
|
+
## Page-Level Fields
|
93
|
+
Some fields are constrained to the page level, and so will not be retrievable through the document.
|
94
|
+
|
95
|
+
# Attributes
|
96
|
+
The following fields are extracted for US W9 V1:
|
97
|
+
|
98
|
+
## Address
|
99
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**address** ([StringField](#string-field)): The street address (number, street, and apt. or suite no.) of the applicant.
|
100
|
+
|
101
|
+
```rb
|
102
|
+
for address_elem in result.document.address do
|
103
|
+
puts address_elem.value
|
104
|
+
end
|
105
|
+
```
|
106
|
+
|
107
|
+
## Business Name
|
108
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**business_name** ([StringField](#string-field)): The business name or disregarded entity name, if different from Name.
|
109
|
+
|
110
|
+
```rb
|
111
|
+
for business_name_elem in result.document.business_name do
|
112
|
+
puts business_name_elem.value
|
113
|
+
end
|
114
|
+
```
|
115
|
+
|
116
|
+
## City State Zip
|
117
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**city_state_zip** ([StringField](#string-field)): The city, state, and ZIP code of the applicant.
|
118
|
+
|
119
|
+
```rb
|
120
|
+
for city_state_zip_elem in result.document.city_state_zip do
|
121
|
+
puts city_state_zip_elem.value
|
122
|
+
end
|
123
|
+
```
|
124
|
+
|
125
|
+
## EIN
|
126
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**ein** ([StringField](#string-field)): The employer identification number.
|
127
|
+
|
128
|
+
```rb
|
129
|
+
for ein_elem in result.document.ein do
|
130
|
+
puts ein_elem.value
|
131
|
+
end
|
132
|
+
```
|
133
|
+
|
134
|
+
## Name
|
135
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**name** ([StringField](#string-field)): Name as shown on the applicant's income tax return.
|
136
|
+
|
137
|
+
```rb
|
138
|
+
for name_elem in result.document.name do
|
139
|
+
puts name_elem.value
|
140
|
+
end
|
141
|
+
```
|
142
|
+
|
143
|
+
## Signature Date Position
|
144
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**signature_date_position** ([PositionField](#position-field)): Position of the signature date on the document.
|
145
|
+
|
146
|
+
```rb
|
147
|
+
for signature_date_position_elem in result.document.signature_date_position do
|
148
|
+
puts signature_date_position_elem.polygon
|
149
|
+
end
|
150
|
+
```
|
151
|
+
|
152
|
+
## Signature Position
|
153
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**signature_position** ([PositionField](#position-field)): Position of the signature on the document.
|
154
|
+
|
155
|
+
```rb
|
156
|
+
for signature_position_elem in result.document.signature_position do
|
157
|
+
puts signature_position_elem.polygon
|
158
|
+
end
|
159
|
+
```
|
160
|
+
|
161
|
+
## SSN
|
162
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**ssn** ([StringField](#string-field)): The applicant's social security number.
|
163
|
+
|
164
|
+
```rb
|
165
|
+
for ssn_elem in result.document.ssn do
|
166
|
+
puts ssn_elem.value
|
167
|
+
end
|
168
|
+
```
|
169
|
+
|
170
|
+
## Tax Classification
|
171
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**tax_classification** ([StringField](#string-field)): The federal tax classification, which can vary depending on the revision date.
|
172
|
+
|
173
|
+
```rb
|
174
|
+
for tax_classification_elem in result.document.tax_classification do
|
175
|
+
puts tax_classification_elem.value
|
176
|
+
end
|
177
|
+
```
|
178
|
+
|
179
|
+
## Tax Classification LLC
|
180
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**tax_classification_llc** ([StringField](#string-field)): Depending on revision year, among S, C, P or D for Limited Liability Company Classification.
|
181
|
+
|
182
|
+
```rb
|
183
|
+
for tax_classification_llc_elem in result.document.tax_classification_llc do
|
184
|
+
puts tax_classification_llc_elem.value
|
185
|
+
end
|
186
|
+
```
|
187
|
+
|
188
|
+
## Tax Classification Other Details
|
189
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**tax_classification_other_details** ([StringField](#string-field)): Tax Classification Other Details.
|
190
|
+
|
191
|
+
```rb
|
192
|
+
for tax_classification_other_details_elem in result.document.tax_classification_other_details do
|
193
|
+
puts tax_classification_other_details_elem.value
|
194
|
+
end
|
195
|
+
```
|
196
|
+
|
197
|
+
## W9 Revision Date
|
198
|
+
[📄](#page-level-fields "This field is only present on individual pages.")**w9_revision_date** ([StringField](#string-field)): The Revision month and year of the W9 form.
|
199
|
+
|
200
|
+
```rb
|
201
|
+
for w9_revision_date_elem in result.document.w9_revision_date do
|
202
|
+
puts w9_revision_date_elem.value
|
203
|
+
end
|
204
|
+
```
|
205
|
+
|
206
|
+
# Questions?
|
207
|
+
[Join our Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-1jv6nawjq-FDgFcF2T5CmMmRpl9LLptw)
|
data/lib/mindee/client.rb
CHANGED
@@ -17,7 +17,7 @@ module Mindee
|
|
17
17
|
# Call prediction API on a document and parse the results.
|
18
18
|
#
|
19
19
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
20
|
-
#
|
20
|
+
# @param product_class [Mindee::Product] class of the product
|
21
21
|
# @param endpoint [HTTP::Endpoint] Endpoint of the API
|
22
22
|
# Doesn't need to be set in the case of OTS APIs.
|
23
23
|
#
|
@@ -52,14 +52,14 @@ module Mindee
|
|
52
52
|
input_source.process_pdf(page_options)
|
53
53
|
end
|
54
54
|
endpoint = initialize_endpoint(product_class) if endpoint.nil?
|
55
|
-
prediction = endpoint.predict(input_source, all_words, close_file, cropper)
|
56
|
-
Mindee::Parsing::Common::ApiResponse.new(product_class, prediction)
|
55
|
+
prediction, raw_http = endpoint.predict(input_source, all_words, close_file, cropper)
|
56
|
+
Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http)
|
57
57
|
end
|
58
58
|
|
59
59
|
# Enqueue a document for async parsing
|
60
60
|
#
|
61
61
|
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
62
|
-
#
|
62
|
+
# @param product_class [Mindee::Product] class of the product
|
63
63
|
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
|
64
64
|
# Doesn't need to be set in the case of OTS APIs.
|
65
65
|
#
|
@@ -94,17 +94,18 @@ module Mindee
|
|
94
94
|
input_source.process_pdf(page_options)
|
95
95
|
end
|
96
96
|
endpoint = initialize_endpoint(product_class) if endpoint.nil?
|
97
|
+
prediction, raw_http = endpoint.predict_async(input_source, all_words, close_file, cropper)
|
97
98
|
Mindee::Parsing::Common::ApiResponse.new(product_class,
|
98
|
-
|
99
|
+
prediction, raw_http)
|
99
100
|
end
|
100
101
|
|
101
102
|
# Parses a queued document
|
102
103
|
#
|
104
|
+
# @param job_id [String] Id of the job (queue) to poll from
|
105
|
+
# @param product_class [Mindee::Product] class of the product
|
103
106
|
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API
|
104
107
|
# Doesn't need to be set in the case of OTS APIs.
|
105
108
|
#
|
106
|
-
# @param job_id [String] Id of the job (queue) to poll from
|
107
|
-
#
|
108
109
|
# @return [Mindee::Parsing::Common::ApiResponse]
|
109
110
|
def parse_queued(
|
110
111
|
job_id,
|
@@ -112,38 +113,106 @@ module Mindee
|
|
112
113
|
endpoint: nil
|
113
114
|
)
|
114
115
|
endpoint = initialize_endpoint(product_class) if endpoint.nil?
|
115
|
-
|
116
|
+
prediction, raw_http = endpoint.parse_async(job_id)
|
117
|
+
Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http)
|
116
118
|
end
|
117
119
|
|
120
|
+
# rubocop:disable Metrics/ParameterLists
|
121
|
+
|
122
|
+
# Enqueue a document for async parsing and automatically try to retrieve it
|
123
|
+
#
|
124
|
+
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
|
125
|
+
# @param product_class [Mindee::Product] class of the product
|
126
|
+
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
|
127
|
+
# Doesn't need to be set in the case of OTS APIs.
|
128
|
+
# @param all_words [Boolean] Whether to extract all the words on each page.
|
129
|
+
# This performs a full OCR operation on the server and will increase response time.
|
130
|
+
# @param close_file [Boolean] Whether to `close()` the file after parsing it.
|
131
|
+
# Set to false if you need to access the file after this operation.
|
132
|
+
# @param page_options [Hash, nil] Page cutting/merge options:
|
133
|
+
# * `:page_indexes` Zero-based list of page indexes.
|
134
|
+
# * `:operation` Operation to apply on the document, given the `page_indexes` specified:
|
135
|
+
# * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
|
136
|
+
# * `:REMOVE` - remove the specified pages, and keep all others.
|
137
|
+
# * `:on_min_pages` Apply the operation only if document has at least this many pages.
|
138
|
+
# @param cropper [Boolean, nil] Whether to include cropper results for each page.
|
139
|
+
# This performs a cropping operation on the server and will increase response time.
|
140
|
+
# @param initial_delay_sec [Integer, Float, nil] initial delay before polling. Defaults to 6.
|
141
|
+
# @param delay_sec [Integer, Float, nil] delay between polling attempts. Defaults to 3.
|
142
|
+
# @param max_retries [Integer, nil] maximum amount of retries. Defaults to 10.
|
143
|
+
# @return [Mindee::Parsing::Common::ApiResponse]
|
144
|
+
def enqueue_and_parse(
|
145
|
+
input_source,
|
146
|
+
product_class,
|
147
|
+
endpoint: nil,
|
148
|
+
all_words: false,
|
149
|
+
close_file: true,
|
150
|
+
page_options: nil,
|
151
|
+
cropper: false,
|
152
|
+
initial_delay_sec: 6,
|
153
|
+
delay_sec: 3,
|
154
|
+
max_retries: 10
|
155
|
+
)
|
156
|
+
enqueue_res = enqueue(
|
157
|
+
input_source,
|
158
|
+
product_class,
|
159
|
+
endpoint: endpoint,
|
160
|
+
all_words: all_words,
|
161
|
+
close_file: close_file,
|
162
|
+
page_options: page_options,
|
163
|
+
cropper: cropper
|
164
|
+
)
|
165
|
+
sleep(initial_delay_sec)
|
166
|
+
polling_attempts = 1
|
167
|
+
job_id = enqueue_res.job.id
|
168
|
+
queue_res = parse_queued(job_id, product_class, endpoint: endpoint)
|
169
|
+
while (queue_res.job.status != Mindee::Parsing::Common::JobStatus::COMPLETED) && (polling_attempts < max_retries)
|
170
|
+
sleep(delay_sec)
|
171
|
+
queue_res = parse_queued(job_id, product_class, endpoint: endpoint)
|
172
|
+
polling_attempts += 1
|
173
|
+
end
|
174
|
+
if queue_res.job.status != Mindee::Parsing::Common::JobStatus::COMPLETED
|
175
|
+
elapsed = initial_delay_sec + (polling_attempts * delay_sec)
|
176
|
+
raise "Asynchronous parsing request timed out after #{elapsed} seconds (#{polling_attempts} tries)"
|
177
|
+
end
|
178
|
+
|
179
|
+
queue_res
|
180
|
+
end
|
181
|
+
# rubocop:enable Metrics/ParameterLists
|
182
|
+
|
118
183
|
# Load a document from an absolute path, as a string.
|
119
184
|
# @param input_path [String] Path of file to open
|
185
|
+
# @param fix_pdf [Boolean] Attempts to fix broken pdf if true
|
120
186
|
# @return [Mindee::Input::Source::PathInputSource]
|
121
|
-
def source_from_path(input_path)
|
122
|
-
Input::Source::PathInputSource.new(input_path)
|
187
|
+
def source_from_path(input_path, fix_pdf: false)
|
188
|
+
Input::Source::PathInputSource.new(input_path, fix_pdf: fix_pdf)
|
123
189
|
end
|
124
190
|
|
125
191
|
# Load a document from raw bytes.
|
126
192
|
# @param input_bytes [String] Encoding::BINARY byte input
|
127
193
|
# @param filename [String] The name of the file (without the path)
|
194
|
+
# @param fix_pdf [Boolean] Attempts to fix broken pdf if true
|
128
195
|
# @return [Mindee::Input::Source::BytesInputSource]
|
129
|
-
def source_from_bytes(input_bytes, filename)
|
130
|
-
Input::Source::BytesInputSource.new(input_bytes, filename)
|
196
|
+
def source_from_bytes(input_bytes, filename, fix_pdf: false)
|
197
|
+
Input::Source::BytesInputSource.new(input_bytes, filename, fix_pdf: fix_pdf)
|
131
198
|
end
|
132
199
|
|
133
200
|
# Load a document from a base64 encoded string.
|
134
201
|
# @param base64_string [String] Input to parse as base64 string
|
135
202
|
# @param filename [String] The name of the file (without the path)
|
203
|
+
# @param fix_pdf [Boolean] Attempts to fix broken pdf if true
|
136
204
|
# @return [Mindee::Input::Source::Base64InputSource]
|
137
|
-
def source_from_b64string(base64_string, filename)
|
138
|
-
Input::Source::Base64InputSource.new(base64_string, filename)
|
205
|
+
def source_from_b64string(base64_string, filename, fix_pdf: false)
|
206
|
+
Input::Source::Base64InputSource.new(base64_string, filename, fix_pdf: fix_pdf)
|
139
207
|
end
|
140
208
|
|
141
209
|
# Load a document from a normal Ruby `File`.
|
142
210
|
# @param input_file [File] Input file handle
|
143
211
|
# @param filename [String] The name of the file (without the path)
|
212
|
+
# @param fix_pdf [Boolean] Attempts to fix broken pdf if true
|
144
213
|
# @return [Mindee::Input::Source::FileInputSource]
|
145
|
-
def source_from_file(input_file, filename)
|
146
|
-
Input::Source::FileInputSource.new(input_file, filename)
|
214
|
+
def source_from_file(input_file, filename, fix_pdf: false)
|
215
|
+
Input::Source::FileInputSource.new(input_file, filename, fix_pdf: fix_pdf)
|
147
216
|
end
|
148
217
|
|
149
218
|
# Load a document from a secure remote source (HTTPS).
|
@@ -171,6 +240,16 @@ module Mindee
|
|
171
240
|
|
172
241
|
private
|
173
242
|
|
243
|
+
# Validates the parameters for async auto-polling
|
244
|
+
# @param initial_delay_sec [Integer, Float] initial delay before polling
|
245
|
+
# @param delay_sec [Integer, Float] delay between polling attempts
|
246
|
+
# @param max_retries [Integer] maximum number of retries; must be an Integer.
|
247
|
+
def validate_async_params(initial_delay_sec, delay_sec, max_retries)
|
248
|
+
raise 'Cannot set auto-poll delay to less than 2 seconds' if delay_sec < 2
|
249
|
+
raise 'Cannot set initial parsing delay to less than 4 seconds' if initial_delay_sec < 4
|
250
|
+
raise 'Cannot set auto-poll delay to less than 2 seconds' unless max_retries.is_a? Integer
|
251
|
+
end
|
252
|
+
|
174
253
|
# Creates an endpoint with the given values. Raises an error if the endpoint is invalid.
|
175
254
|
# @param product_class [Mindee::Product] class of the product
|
176
255
|
#
|
data/lib/mindee/http/endpoint.rb
CHANGED
@@ -2,19 +2,27 @@
|
|
2
2
|
|
3
3
|
require 'json'
|
4
4
|
require 'net/http'
|
5
|
+
require_relative 'error'
|
5
6
|
require_relative '../version'
|
6
7
|
|
7
8
|
module Mindee
|
8
9
|
module HTTP
|
10
|
+
# API key's default environment key name.
|
9
11
|
API_KEY_ENV_NAME = 'MINDEE_API_KEY'
|
12
|
+
# API key's default value.
|
10
13
|
API_KEY_DEFAULT = nil
|
11
14
|
|
15
|
+
# Base URL default environment key name.
|
12
16
|
BASE_URL_ENV_NAME = 'MINDEE_BASE_URL'
|
17
|
+
# Base URL's default value.
|
13
18
|
BASE_URL_DEFAULT = 'https://api.mindee.net/v1'
|
14
19
|
|
20
|
+
# HTTP request timeout default environment key name.
|
15
21
|
REQUEST_TIMEOUT_ENV_NAME = 'MINDEE_REQUEST_TIMEOUT'
|
22
|
+
# HTTP request timeout default value.
|
16
23
|
TIMEOUT_DEFAULT = 120
|
17
24
|
|
25
|
+
# Default value for the user agent.
|
18
26
|
USER_AGENT = "mindee-api-ruby@v#{Mindee::VERSION} ruby-v#{RUBY_VERSION} #{Mindee::PLATFORM}"
|
19
27
|
|
20
28
|
# Generic API endpoint for a product.
|
@@ -43,9 +51,9 @@ module Mindee
|
|
43
51
|
check_api_key
|
44
52
|
response = predict_req_post(input_source, all_words: all_words, close_file: close_file, cropper: cropper)
|
45
53
|
hashed_response = JSON.parse(response.body, object_class: Hash)
|
46
|
-
return hashed_response if (200..299).include?(response.code.to_i)
|
54
|
+
return [hashed_response, response.body] if (200..299).include?(response.code.to_i)
|
47
55
|
|
48
|
-
error =
|
56
|
+
error = Error.handle_error!(@url_name, hashed_response, response.code.to_i)
|
49
57
|
raise error
|
50
58
|
end
|
51
59
|
|
@@ -58,9 +66,9 @@ module Mindee
|
|
58
66
|
check_api_key
|
59
67
|
response = document_queue_req_get(input_source, all_words, close_file, cropper)
|
60
68
|
hashed_response = JSON.parse(response.body, object_class: Hash)
|
61
|
-
return hashed_response if (200..299).include?(response.code.to_i)
|
69
|
+
return [hashed_response, response.body] if (200..299).include?(response.code.to_i)
|
62
70
|
|
63
|
-
error =
|
71
|
+
error = Error.handle_error!(@url_name, hashed_response, response.code.to_i)
|
64
72
|
raise error
|
65
73
|
end
|
66
74
|
|
@@ -71,9 +79,9 @@ module Mindee
|
|
71
79
|
check_api_key
|
72
80
|
response = document_queue_req(job_id)
|
73
81
|
hashed_response = JSON.parse(response.body, object_class: Hash)
|
74
|
-
return hashed_response if (200..299).include?(response.code.to_i)
|
82
|
+
return [hashed_response, response.body] if (200..299).include?(response.code.to_i)
|
75
83
|
|
76
|
-
error =
|
84
|
+
error = Error.handle_error!(@url_name, hashed_response, response.code.to_i)
|
77
85
|
raise error
|
78
86
|
end
|
79
87
|
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
module HTTP
|
5
|
+
# Mindee HTTP error module.
|
6
|
+
module Error
|
7
|
+
module_function
|
8
|
+
|
9
|
+
# Creates an error object based on what's retrieved from a request.
|
10
|
+
# @param response [Hash] dictionary response retrieved by the server
|
11
|
+
def create_error_obj(response)
|
12
|
+
error_obj = response.respond_to?(:each_pair) ? response.dig('api_request', 'error') : nil
|
13
|
+
if error_obj.nil?
|
14
|
+
error_obj = if response.include?('Maximum pdf pages')
|
15
|
+
{
|
16
|
+
'code' => 'TooManyPages',
|
17
|
+
'message' => 'Maximum amound of pdf pages reached.',
|
18
|
+
'details' => response,
|
19
|
+
}
|
20
|
+
elsif response.include?('Max file size is')
|
21
|
+
{
|
22
|
+
'code' => 'FileTooLarge',
|
23
|
+
'message' => 'Maximum file size reached.',
|
24
|
+
'details' => response,
|
25
|
+
}
|
26
|
+
elsif response.include?('Invalid file type')
|
27
|
+
{
|
28
|
+
'code' => 'InvalidFiletype',
|
29
|
+
'message' => 'Invalid file type.',
|
30
|
+
'details' => response,
|
31
|
+
}
|
32
|
+
elsif response.include?('Gateway timeout')
|
33
|
+
{
|
34
|
+
'code' => 'RequestTimeout',
|
35
|
+
'message' => 'Request timed out.',
|
36
|
+
'details' => response,
|
37
|
+
}
|
38
|
+
elsif response.include?('Too Many Requests')
|
39
|
+
{
|
40
|
+
'code' => 'TooManyRequests',
|
41
|
+
'message' => 'Too Many Requests.',
|
42
|
+
'details' => response,
|
43
|
+
}
|
44
|
+
else
|
45
|
+
{
|
46
|
+
'code' => 'UnknownError',
|
47
|
+
'message' => 'Server sent back an unexpected reply.',
|
48
|
+
'details' => response,
|
49
|
+
}
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
error_obj
|
54
|
+
end
|
55
|
+
|
56
|
+
# Creates an appropriate HTTP error exception, based on retrieved http error code
|
57
|
+
# @param url [String] the url of the product
|
58
|
+
# @param response [Hash] dictionary response retrieved by the server
|
59
|
+
# @param code [Integer] http error code of the response
|
60
|
+
def handle_error!(url, response, code)
|
61
|
+
error_obj = create_error_obj(response)
|
62
|
+
case code
|
63
|
+
when 400..499
|
64
|
+
MindeeHttpClientError.new(error_obj, url, code)
|
65
|
+
when 500..599
|
66
|
+
MindeeHttpServerError.new(error_obj, url, code)
|
67
|
+
else
|
68
|
+
MindeeHttpError.new(error_obj, url, code)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# API HttpError
|
73
|
+
class MindeeHttpError < StandardError
|
74
|
+
# @return [Integer]
|
75
|
+
attr_reader :status_code
|
76
|
+
# @return [String]
|
77
|
+
attr_reader :api_code
|
78
|
+
# @return [String]
|
79
|
+
attr_reader :api_details
|
80
|
+
# @return [String]
|
81
|
+
attr_reader :api_message
|
82
|
+
|
83
|
+
# @param http_error [Hash]
|
84
|
+
# @param url [String]
|
85
|
+
# @param code [Integer]
|
86
|
+
def initialize(http_error, url, code)
|
87
|
+
@status_code = code
|
88
|
+
@api_code = http_error['code']
|
89
|
+
@api_details = http_error['details']
|
90
|
+
@api_message = http_error['message']
|
91
|
+
super("#{url} #{@status_code} HTTP error: #{@api_details} - #{@api_message}")
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
# API client HttpError
|
96
|
+
class MindeeHttpClientError < MindeeHttpError
|
97
|
+
end
|
98
|
+
|
99
|
+
# API server HttpError
|
100
|
+
class MindeeHttpServerError < MindeeHttpError
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
data/lib/mindee/http.rb
CHANGED