mindee 3.17.0 → 3.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +4 -4
- data/bin/mindee.rb +0 -6
- data/docs/code_samples/{carte_vitale_v1.txt → us_mail_v3_async.txt} +2 -2
- data/docs/custom_v1.md +1 -1
- data/docs/getting_started.md +5 -5
- data/docs/{us_mail_v2.md → us_mail_v3.md} +34 -12
- data/lib/mindee/client.rb +2 -2
- data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +34 -19
- data/lib/mindee/input/sources/base64_input_source.rb +31 -0
- data/lib/mindee/input/sources/bytes_input_source.rb +21 -0
- data/lib/mindee/input/sources/file_input_source.rb +20 -0
- data/lib/mindee/input/sources/local_input_source.rb +183 -0
- data/lib/mindee/input/sources/path_input_source.rb +20 -0
- data/lib/mindee/input/sources/url_input_source.rb +127 -0
- data/lib/mindee/input/sources.rb +6 -248
- data/lib/mindee/parsing/standard/boolean_field.rb +6 -0
- data/lib/mindee/product/ind/indian_passport/indian_passport_v1_document.rb +1 -1
- data/lib/mindee/product/ind/indian_passport/indian_passport_v1_page.rb +1 -1
- data/lib/mindee/product/{fr/carte_vitale/carte_vitale_v1.rb → us/us_mail/us_mail_v3.rb} +11 -11
- data/lib/mindee/product/us/us_mail/us_mail_v3_document.rb +107 -0
- data/lib/mindee/product/{fr/carte_vitale/carte_vitale_v1_page.rb → us/us_mail/us_mail_v3_page.rb} +8 -8
- data/lib/mindee/product/us/us_mail/us_mail_v3_recipient_address.rb +113 -0
- data/lib/mindee/product/us/us_mail/us_mail_v3_sender_address.rb +66 -0
- data/lib/mindee/product.rb +1 -1
- data/lib/mindee/version.rb +1 -1
- metadata +18 -10
- data/lib/mindee/product/fr/carte_vitale/carte_vitale_v1_document.rb +0 -52
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef56483dc0576b931b3d88b57e4512abd44ea4e216ed29dc1097ada3fa5b00fb
|
4
|
+
data.tar.gz: 8d4f6c2b99c113439b2332244cd89619189704bc75655121c26ba0122118d539
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a0461aea7ba2804e36ddc833e1ec24888227484d1ad1f1471b8b8859cfaffc14a94f623672b749fe0f73584c6c215f26d87f2e11cbeb701fc8cc9cf86088ec6f
|
7
|
+
data.tar.gz: aed38431cddde8abc620cf4701ecf7299929a0216b0a9e717d7ef56732211fb38ae12d093fca8839cbd981c43047f0a87088587c44fc7f4c9ff9f4ae56438f7d
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,19 @@
|
|
1
1
|
# Mindee Ruby API Library Changelog
|
2
2
|
|
3
|
+
## v3.19.0 - 2025-01-14
|
4
|
+
### Changes
|
5
|
+
* :sparkles: add support for US Mail V3
|
6
|
+
* :recycle: increase async retry timers
|
7
|
+
|
8
|
+
|
9
|
+
## v3.18.0 - 2024-12-13
|
10
|
+
### Changes
|
11
|
+
* :sparkles: allow local downloading of remote sources
|
12
|
+
* :coffin: remove support for (FR) Carte Vitale V1 in favor of French Health Card V1
|
13
|
+
### Fixes
|
14
|
+
* :bug: fix tax-extraction script
|
15
|
+
|
16
|
+
|
3
17
|
## v3.17.0 - 2024-11-28
|
4
18
|
### Changes
|
5
19
|
* :sparkles: add support for workflows
|
data/README.md
CHANGED
@@ -116,7 +116,7 @@ result = mindee_client.parse(
|
|
116
116
|
puts result.document
|
117
117
|
```
|
118
118
|
|
119
|
-
### Custom
|
119
|
+
### Custom Documents (docTI & Custom APIs)
|
120
120
|
|
121
121
|
```ruby
|
122
122
|
require 'mindee'
|
@@ -131,9 +131,9 @@ endpoint = mindee_client.create_endpoint(
|
|
131
131
|
# Load a file from disk
|
132
132
|
input_source = mindee_client.source_from_path('/path/to/the/file.ext')
|
133
133
|
|
134
|
-
result = mindee_client.
|
134
|
+
result = mindee_client.enqueue_and_parse(
|
135
135
|
input_source,
|
136
|
-
Mindee::Product::
|
136
|
+
Mindee::Product::Generated::GeneratedV1,
|
137
137
|
endpoint: endpoint
|
138
138
|
)
|
139
139
|
|
@@ -181,7 +181,7 @@ customize the experience.
|
|
181
181
|
* [EU Driver License OCR Ruby](https://developers.mindee.com/docs/ruby-eu-driver-license-ocr)
|
182
182
|
* [FR Bank Account Details OCR Ruby](https://developers.mindee.com/docs/ruby-fr-bank-account-details-ocr)
|
183
183
|
* [FR Bank Statement OCR Ruby](https://developers.mindee.com/docs/ruby-fr-bank-statement-ocr)
|
184
|
-
* [FR
|
184
|
+
* [FR Health Card OCR Ruby](https://developers.mindee.com/docs/ruby-fr-health-card-ocr)
|
185
185
|
* [FR ID Card OCR Ruby](https://developers.mindee.com/docs/ruby-fr-carte-nationale-didentite-ocr)
|
186
186
|
* [US Bank Check OCR Ruby](https://developers.mindee.com/docs/ruby-us-bank-check-ocr)
|
187
187
|
* [US Driver License OCR Ruby](https://developers.mindee.com/docs/ruby-us-driver-license-ocr)
|
data/bin/mindee.rb
CHANGED
@@ -97,12 +97,6 @@ DOCUMENTS = {
|
|
97
97
|
sync: false,
|
98
98
|
async: true,
|
99
99
|
},
|
100
|
-
"fr-carte-vitale" => {
|
101
|
-
description: "FR Carte Vitale",
|
102
|
-
doc_class: Mindee::Product::FR::CarteVitale::CarteVitaleV1,
|
103
|
-
sync: true,
|
104
|
-
async: false,
|
105
|
-
},
|
106
100
|
"fr-id-card" => {
|
107
101
|
description: "FR ID Card",
|
108
102
|
doc_class: Mindee::Product::FR::IdCard::IdCardV2,
|
@@ -7,9 +7,9 @@ mindee_client = Mindee::Client.new(api_key: 'my-api-key')
|
|
7
7
|
input_source = mindee_client.source_from_path('/path/to/the/file.ext')
|
8
8
|
|
9
9
|
# Parse the file
|
10
|
-
result = mindee_client.
|
10
|
+
result = mindee_client.enqueue_and_parse(
|
11
11
|
input_source,
|
12
|
-
Mindee::Product::
|
12
|
+
Mindee::Product::US::UsMail::UsMailV3
|
13
13
|
)
|
14
14
|
|
15
15
|
# Print a full summary of the parsed data in RST format
|
data/docs/custom_v1.md
CHANGED
@@ -4,7 +4,7 @@ category: 622b805aaec68102ea7fcbc2
|
|
4
4
|
slug: ruby-api-builder-ocr
|
5
5
|
parentDoc: 6294d97ee723f1008d2ab28e
|
6
6
|
---
|
7
|
-
> 🚧 This product is still supported, but is considered to be deprecated. If you are looking for the
|
7
|
+
> 🚧 This product is still supported, but is considered to be deprecated. If you are looking for the docTI API documentation, you can find it [here](https://developers.mindee.com/docs/ruby-generated-ocr).
|
8
8
|
|
9
9
|
# Quick-Start
|
10
10
|
|
data/docs/getting_started.md
CHANGED
@@ -226,20 +226,20 @@ result = mindee_client.parse(
|
|
226
226
|
)
|
227
227
|
```
|
228
228
|
|
229
|
-
### Custom Documents
|
229
|
+
### Custom Documents (docTI)
|
230
230
|
For custom documents, the endpoint to use must also be set, and it must take in an `endpoint_name`:
|
231
231
|
|
232
232
|
```ruby
|
233
|
-
endpoint = mindee_client.create_endpoint(endpoint_name: 'wnine')
|
233
|
+
endpoint = mindee_client.create_endpoint(endpoint_name: 'wnine', account_name: 'my-account')
|
234
234
|
|
235
|
-
result = mindee_client.
|
235
|
+
result = mindee_client.enqueue_and_parse(
|
236
236
|
input_source,
|
237
|
-
Mindee::Product::
|
237
|
+
Mindee::Product::Generated::GeneratedV1,
|
238
238
|
endpoint: endpoint
|
239
239
|
)
|
240
240
|
```
|
241
241
|
|
242
|
-
This is because the `
|
242
|
+
This is because the `GeneratedV1` class is enough to handle the return processing, but the actual endpoint needs to be specified.
|
243
243
|
|
244
244
|
## Process the Result
|
245
245
|
The response object is common to all documents, including custom documents. The main properties are:
|
@@ -22,7 +22,7 @@ input_source = mindee_client.source_from_path('/path/to/the/file.ext')
|
|
22
22
|
# Parse the file
|
23
23
|
result = mindee_client.enqueue_and_parse(
|
24
24
|
input_source,
|
25
|
-
Mindee::Product::US::UsMail::UsMailV2
|
25
|
+
Mindee::Product::US::UsMail::UsMailV3
|
26
26
|
)
|
27
27
|
|
28
28
|
# Print a full summary of the parsed data in RST format
|
@@ -35,7 +35,20 @@ puts result.document
|
|
35
35
|
|
36
36
|
**Output (RST):**
|
37
37
|
```rst
|
38
|
-
|
38
|
+
########
|
39
|
+
Document
|
40
|
+
########
|
41
|
+
:Mindee ID: f9c36f59-977d-4ddc-9f2d-31c294c456ac
|
42
|
+
:Filename: default_sample.jpg
|
43
|
+
|
44
|
+
Inference
|
45
|
+
#########
|
46
|
+
:Product: mindee/us_mail v3.0
|
47
|
+
:Rotation applied: Yes
|
48
|
+
|
49
|
+
Prediction
|
50
|
+
==========
|
51
|
+
:Sender Name: company zed
|
39
52
|
:Sender Address:
|
40
53
|
:City: Dallas
|
41
54
|
:Complete Address: 54321 Elm Street, Dallas, Texas 54321
|
@@ -44,11 +57,12 @@ puts result.document
|
|
44
57
|
:Street: 54321 Elm Street
|
45
58
|
:Recipient Names: Jane Doe
|
46
59
|
:Recipient Addresses:
|
47
|
-
|
48
|
-
| City | Complete Address | Is Address Change | Postal Code | Private Mailbox Number | State | Street |
|
49
|
-
|
50
|
-
| Detroit | 1234 Market Street PMB 4321, Det... |
|
51
|
-
|
60
|
+
+-----------------+-------------------------------------+-------------------+-------------+------------------------+-------+---------------------------+-----------------+
|
61
|
+
| City | Complete Address | Is Address Change | Postal Code | Private Mailbox Number | State | Street | Unit |
|
62
|
+
+=================+=====================================+===================+=============+========================+=======+===========================+=================+
|
63
|
+
| Detroit | 1234 Market Street PMB 4321, Det... | False | 12345 | 4321 | MI | 1234 Market Street | |
|
64
|
+
+-----------------+-------------------------------------+-------------------+-------------+------------------------+-------+---------------------------+-----------------+
|
65
|
+
:Return to Sender: False
|
52
66
|
```
|
53
67
|
|
54
68
|
# Field Types
|
@@ -78,7 +92,7 @@ Fields which are specific to this product; they are not used in any other produc
|
|
78
92
|
### Recipient Addresses Field
|
79
93
|
The addresses of the recipients.
|
80
94
|
|
81
|
-
A `UsMailV2RecipientAddress` implements the following attributes:
|
95
|
+
A `UsMailV3RecipientAddress` implements the following attributes:
|
82
96
|
|
83
97
|
* `city` (String): The city of the recipient's address.
|
84
98
|
* `complete` (String): The complete address of the recipient.
|
@@ -87,12 +101,13 @@ A `UsMailV2RecipientAddress` implements the following attributes:
|
|
87
101
|
* `private_mailbox_number` (String): The private mailbox number of the recipient's address.
|
88
102
|
* `state` (String): Second part of the ISO 3166-2 code, consisting of two letters indicating the US State.
|
89
103
|
* `street` (String): The street of the recipient's address.
|
104
|
+
* `unit` (String): The unit number of the recipient's address.
|
90
105
|
Fields which are specific to this product; they are not used in any other product.
|
91
106
|
|
92
107
|
### Sender Address Field
|
93
108
|
The address of the sender.
|
94
109
|
|
95
|
-
A `UsMailV2SenderAddress` implements the following attributes:
|
110
|
+
A `UsMailV3SenderAddress` implements the following attributes:
|
96
111
|
|
97
112
|
* `city` (String): The city of the sender's address.
|
98
113
|
* `complete` (String): The complete address of the sender.
|
@@ -101,10 +116,17 @@ A `UsMailV2SenderAddress` implements the following attributes:
|
|
101
116
|
* `street` (String): The street of the sender's address.
|
102
117
|
|
103
118
|
# Attributes
|
104
|
-
The following fields are extracted for US Mail V2:
|
119
|
+
The following fields are extracted for US Mail V3:
|
120
|
+
|
121
|
+
## Return to Sender
|
122
|
+
**is_return_to_sender** ([BooleanField](#boolean-field)): Whether the mailing is marked as return to sender.
|
123
|
+
|
124
|
+
```rb
|
125
|
+
puts result.document.inference.prediction.is_return_to_sender.value
|
126
|
+
```
|
105
127
|
|
106
128
|
## Recipient Addresses
|
107
|
-
**recipient_addresses** (Array<[UsMailV2RecipientAddress](#recipient-addresses-field)>): The addresses of the recipients.
|
129
|
+
**recipient_addresses** (Array<[UsMailV3RecipientAddress](#recipient-addresses-field)>): The addresses of the recipients.
|
108
130
|
|
109
131
|
```rb
|
110
132
|
for recipient_addresses_elem in result.document.inference.prediction.recipient_addresses do
|
@@ -122,7 +144,7 @@ end
|
|
122
144
|
```
|
123
145
|
|
124
146
|
## Sender Address
|
125
|
-
**sender_address** ([UsMailV2SenderAddress](#sender-address-field)): The address of the sender.
|
147
|
+
**sender_address** ([UsMailV3SenderAddress](#sender-address-field)): The address of the sender.
|
126
148
|
|
127
149
|
```rb
|
128
150
|
puts result.document.inference.prediction.sender_address.value
|
data/lib/mindee/client.rb
CHANGED
@@ -151,7 +151,7 @@ module Mindee
|
|
151
151
|
# This performs a cropping operation on the server and will increase response time.
|
152
152
|
# @param initial_delay_sec [Integer, Float] initial delay before polling. Defaults to 2.
|
153
153
|
# @param delay_sec [Integer, Float] delay between polling attempts. Defaults to 1.5.
|
154
|
-
# @param max_retries [Integer] maximum amount of retries. Defaults to
|
154
|
+
# @param max_retries [Integer] maximum amount of retries. Defaults to 80.
|
155
155
|
# @return [Mindee::Parsing::Common::ApiResponse]
|
156
156
|
def enqueue_and_parse(
|
157
157
|
input_source,
|
@@ -164,7 +164,7 @@ module Mindee
|
|
164
164
|
cropper: false,
|
165
165
|
initial_delay_sec: 2,
|
166
166
|
delay_sec: 1.5,
|
167
|
-
max_retries:
|
167
|
+
max_retries: 80
|
168
168
|
)
|
169
169
|
enqueue_res = enqueue(
|
170
170
|
input_source,
|
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
require_relative 'ocr_extractor'
|
4
4
|
|
5
|
+
# rubocop:disable Metrics/ClassLength
|
6
|
+
|
5
7
|
module Mindee
|
6
8
|
module Extraction
|
7
9
|
# Tax extractor class
|
@@ -72,9 +74,12 @@ module Mindee
|
|
72
74
|
reconstructed_hash['code'] =
|
73
75
|
found_hash['code'].nil? ? found_hash['code'] : found_hash['code'].sub(%r{\s*\.*\s*$}, '')
|
74
76
|
|
75
|
-
if found_hash['rate']
|
76
|
-
found_hash['rate']
|
77
|
-
found_hash['rate']
|
77
|
+
if found_hash['rate']
|
78
|
+
if found_hash['rate'].abs < 1
|
79
|
+
found_hash['rate'] *= 10
|
80
|
+
elsif found_hash['rate'].abs > 100
|
81
|
+
found_hash['rate'] /= 10
|
82
|
+
end
|
78
83
|
end
|
79
84
|
found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
|
80
85
|
found_hash = decimate_rates_if_needed(found_hash)
|
@@ -125,18 +130,28 @@ module Mindee
|
|
125
130
|
# @param found_hash [Hash] Hash of currently retrieved values
|
126
131
|
# @return [Hash]
|
127
132
|
def self.set_base_and_value(reconstructed_hash, found_hash)
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
reconstructed_hash['base']
|
133
|
-
|
134
|
-
|
135
|
-
|
133
|
+
base = found_hash['base']
|
134
|
+
value = found_hash['value']
|
135
|
+
|
136
|
+
if base && value
|
137
|
+
reconstructed_hash['base'], reconstructed_hash['value'] = [base, value].minmax
|
138
|
+
elsif base
|
139
|
+
reconstructed_hash['base'] = base
|
140
|
+
elsif value
|
141
|
+
reconstructed_hash['value'] = value
|
142
|
+
calculate_base(reconstructed_hash)
|
136
143
|
end
|
144
|
+
|
137
145
|
reconstructed_hash
|
138
146
|
end
|
139
147
|
|
148
|
+
def self.calculate_base(hash)
|
149
|
+
rate = hash['rate']
|
150
|
+
return unless rate&.positive?
|
151
|
+
|
152
|
+
hash['base'] = hash['value'] / (rate / 100.0)
|
153
|
+
end
|
154
|
+
|
140
155
|
# Extracts a single custom type of tax.
|
141
156
|
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
|
142
157
|
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
|
@@ -149,7 +164,6 @@ module Mindee
|
|
149
164
|
|
150
165
|
tax_names.sort!
|
151
166
|
found_hash = pick_best(extract_horizontal_tax(ocr_result, tax_names), tax_names)
|
152
|
-
# a tax is considered found horizontally if it has a value, otherwise it is vertical
|
153
167
|
if found_hash.nil? || found_hash['value'].nil?
|
154
168
|
found_hash = extract_vertical_tax(ocr_result, tax_names,
|
155
169
|
found_hash)
|
@@ -240,14 +254,14 @@ module Mindee
|
|
240
254
|
linear_pattern_percent_first = %r{
|
241
255
|
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
|
242
256
|
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
|
243
|
-
((?:\s*-\s*)?(?:\d*[.,])+\d
|
244
|
-
((?:\s*-\s*)?(\d*[.,])*\d
|
257
|
+
((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
|
258
|
+
((?:\s*-\s*)?(\d*[.,])*\d+)?
|
245
259
|
}x
|
246
260
|
linear_pattern_percent_second = %r{
|
247
261
|
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
|
248
262
|
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
|
249
|
-
((?:\s*-\s*)?(?:\d*[.,])+\d
|
250
|
-
((?:\s*-\s*)?(\d*[.,])*\d
|
263
|
+
((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
|
264
|
+
((?:\s*-\s*)?(\d*[.,])*\d+)?
|
251
265
|
}x
|
252
266
|
ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
|
253
267
|
page.all_lines.each do |line|
|
@@ -304,7 +318,7 @@ module Mindee
|
|
304
318
|
page.all_words.each do |word|
|
305
319
|
next if match_index(word.text, tax_names).nil?
|
306
320
|
|
307
|
-
reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id)
|
321
|
+
reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id, 0.25)
|
308
322
|
found_hash['page_id'] = page_id if found_hash['page_id'].nil?
|
309
323
|
found_hash['code'] = word.text.strip if found_hash['code'].nil?
|
310
324
|
found_hash = extract_vertical_tax_values(reconstructed_line, found_hash)
|
@@ -316,8 +330,9 @@ module Mindee
|
|
316
330
|
private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
|
317
331
|
:extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
|
318
332
|
:create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
|
319
|
-
:decimate_rates_if_needed, :
|
320
|
-
:swap_rates_if_needed
|
333
|
+
:decimate_rates_if_needed, :set_base_and_value, :valid_candidate?,
|
334
|
+
:swap_rates_if_needed, :calculate_base
|
321
335
|
end
|
322
336
|
end
|
323
337
|
end
|
338
|
+
# rubocop:enable Metrics/ClassLength
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Input
|
7
|
+
module Source
|
8
|
+
# Load a document from a base64 string.
|
9
|
+
class Base64InputSource < LocalInputSource
|
10
|
+
# @param base64_string [String]
|
11
|
+
# @param filename [String]
|
12
|
+
# @param fix_pdf [Boolean]
|
13
|
+
def initialize(base64_string, filename, fix_pdf: false)
|
14
|
+
io_stream = StringIO.new(base64_string.unpack1('m*'))
|
15
|
+
io_stream.set_encoding Encoding::BINARY
|
16
|
+
super(io_stream, filename, fix_pdf: fix_pdf)
|
17
|
+
end
|
18
|
+
|
19
|
+
# Overload of the same function to prevent a base64 from being re-encoded.
|
20
|
+
# @param close [Boolean]
|
21
|
+
# @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
|
22
|
+
def read_document(close: true)
|
23
|
+
@io_stream.seek(0)
|
24
|
+
data = @io_stream.read
|
25
|
+
@io_stream.close if close
|
26
|
+
['document', [data].pack('m'), { filename: Source.convert_to_unicode_escape(@filename) }]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Input
|
7
|
+
module Source
|
8
|
+
# Load a document from raw bytes.
|
9
|
+
class BytesInputSource < LocalInputSource
|
10
|
+
# @param raw_bytes [String]
|
11
|
+
# @param filename [String]
|
12
|
+
# @param fix_pdf [Boolean]
|
13
|
+
def initialize(raw_bytes, filename, fix_pdf: false)
|
14
|
+
io_stream = StringIO.new(raw_bytes)
|
15
|
+
io_stream.set_encoding Encoding::BINARY
|
16
|
+
super(io_stream, filename, fix_pdf: fix_pdf)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Input
|
7
|
+
module Source
|
8
|
+
# Load a document from a file handle.
|
9
|
+
class FileInputSource < LocalInputSource
|
10
|
+
# @param input_file [File]
|
11
|
+
# @param filename [String]
|
12
|
+
# @param fix_pdf [Boolean]
|
13
|
+
def initialize(input_file, filename, fix_pdf: false)
|
14
|
+
io_stream = input_file
|
15
|
+
super(io_stream, filename, fix_pdf: fix_pdf)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,183 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
require 'marcel'
|
5
|
+
|
6
|
+
require_relative '../../pdf'
|
7
|
+
require_relative '../../image'
|
8
|
+
|
9
|
+
module Mindee
|
10
|
+
module Input
|
11
|
+
# Document source handling.
|
12
|
+
module Source
|
13
|
+
# Mime types accepted by the server.
|
14
|
+
ALLOWED_MIME_TYPES = [
|
15
|
+
'application/pdf',
|
16
|
+
'image/heic',
|
17
|
+
'image/png',
|
18
|
+
'image/jpeg',
|
19
|
+
'image/tiff',
|
20
|
+
'image/webp',
|
21
|
+
].freeze
|
22
|
+
|
23
|
+
# Standard error for invalid mime types
|
24
|
+
class MimeTypeError < StandardError
|
25
|
+
end
|
26
|
+
|
27
|
+
# Error sent if the file's mimetype isn't allowed
|
28
|
+
class InvalidMimeTypeError < MimeTypeError
|
29
|
+
# @return [String]
|
30
|
+
attr_reader :invalid_mimetype
|
31
|
+
|
32
|
+
# @param mime_type [String]
|
33
|
+
def initialize(mime_type)
|
34
|
+
@invalid_mimetype = mime_type
|
35
|
+
super("'#{@invalid_mimetype}' mime type not allowed, must be one of #{ALLOWED_MIME_TYPES.join(', ')}")
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# Error sent if a pdf file couldn't be fixed
|
40
|
+
class UnfixablePDFError < MimeTypeError
|
41
|
+
def initialize
|
42
|
+
super("Corrupted PDF couldn't be repaired.")
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
# Base class for loading documents.
|
47
|
+
class LocalInputSource
|
48
|
+
# @return [String]
|
49
|
+
attr_reader :filename
|
50
|
+
# @return [String]
|
51
|
+
attr_reader :file_mimetype
|
52
|
+
# @return [StringIO]
|
53
|
+
attr_reader :io_stream
|
54
|
+
|
55
|
+
# @param io_stream [StringIO]
|
56
|
+
# @param filename [String]
|
57
|
+
# @param fix_pdf [Boolean]
|
58
|
+
def initialize(io_stream, filename, fix_pdf: false)
|
59
|
+
@io_stream = io_stream
|
60
|
+
@filename = filename
|
61
|
+
@file_mimetype = if fix_pdf
|
62
|
+
Marcel::MimeType.for @io_stream
|
63
|
+
else
|
64
|
+
Marcel::MimeType.for @io_stream, name: @filename
|
65
|
+
end
|
66
|
+
return if ALLOWED_MIME_TYPES.include? @file_mimetype
|
67
|
+
|
68
|
+
if filename.end_with?('.pdf') && fix_pdf
|
69
|
+
rescue_broken_pdf(@io_stream)
|
70
|
+
@file_mimetype = Marcel::MimeType.for @io_stream
|
71
|
+
|
72
|
+
return if ALLOWED_MIME_TYPES.include? @file_mimetype
|
73
|
+
end
|
74
|
+
|
75
|
+
raise InvalidMimeTypeError, @file_mimetype.to_s
|
76
|
+
end
|
77
|
+
|
78
|
+
# Attempts to fix pdf files if mimetype is rejected.
|
79
|
+
# "Broken PDFs" are often a result of third-party injecting invalid headers.
|
80
|
+
# This attempts to remove them and send the file
|
81
|
+
# @param stream [StringIO]
|
82
|
+
def rescue_broken_pdf(stream)
|
83
|
+
stream.gets('%PDF-')
|
84
|
+
raise UnfixablePDFError if stream.eof? || stream.pos > 500
|
85
|
+
|
86
|
+
stream.pos = stream.pos - 5
|
87
|
+
data = stream.read
|
88
|
+
@io_stream.close
|
89
|
+
|
90
|
+
@io_stream = StringIO.new
|
91
|
+
@io_stream << data
|
92
|
+
end
|
93
|
+
|
94
|
+
# Shorthand for pdf mimetype validation.
|
95
|
+
def pdf?
|
96
|
+
@file_mimetype.to_s == 'application/pdf'
|
97
|
+
end
|
98
|
+
|
99
|
+
# Parses a PDF file according to provided options.
|
100
|
+
# @param options [Hash, nil] Page cutting/merge options:
|
101
|
+
#
|
102
|
+
# * `:page_indexes` Zero-based list of page indexes.
|
103
|
+
# * `:operation` Operation to apply on the document, given the `page_indexes specified:
|
104
|
+
# * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
|
105
|
+
# * `:REMOVE` - remove the specified pages, and keep all others.
|
106
|
+
# * `:on_min_pages` Apply the operation only if document has at least this many pages.
|
107
|
+
def process_pdf(options)
|
108
|
+
@io_stream.seek(0)
|
109
|
+
@io_stream = PdfProcessor.parse(@io_stream, options)
|
110
|
+
end
|
111
|
+
|
112
|
+
# Reads a document.
|
113
|
+
# @param close [Boolean]
|
114
|
+
# @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
|
115
|
+
def read_document(close: true)
|
116
|
+
@io_stream.seek(0)
|
117
|
+
# Avoids needlessly re-packing some files
|
118
|
+
data = @io_stream.read
|
119
|
+
@io_stream.close if close
|
120
|
+
['document', data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
|
121
|
+
end
|
122
|
+
|
123
|
+
def count_pdf_pages
|
124
|
+
return 1 unless pdf?
|
125
|
+
|
126
|
+
@io_stream.seek(0)
|
127
|
+
pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
|
128
|
+
pdf_processor.pages.size
|
129
|
+
end
|
130
|
+
|
131
|
+
# Compresses the file, according to the provided info.
|
132
|
+
# @param [Integer] quality Quality of the output file.
|
133
|
+
# @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
|
134
|
+
# @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
|
135
|
+
# @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
|
136
|
+
# This will attempt to re-render PDF text over the rasterized original. If disabled, ignored the operation.
|
137
|
+
# WARNING: this operation is strongly discouraged.
|
138
|
+
# @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
|
139
|
+
# not. Needs force_source_text to work.
|
140
|
+
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
|
141
|
+
buffer = if pdf?
|
142
|
+
Mindee::PDF::PDFCompressor.compress_pdf(
|
143
|
+
@io_stream,
|
144
|
+
quality: quality,
|
145
|
+
force_source_text_compression: force_source_text,
|
146
|
+
disable_source_text: disable_source_text
|
147
|
+
)
|
148
|
+
else
|
149
|
+
Mindee::Image::ImageCompressor.compress_image(
|
150
|
+
@io_stream,
|
151
|
+
quality: quality,
|
152
|
+
max_width: max_width,
|
153
|
+
max_height: max_height
|
154
|
+
)
|
155
|
+
end
|
156
|
+
@io_stream = buffer
|
157
|
+
@io_stream.rewind
|
158
|
+
end
|
159
|
+
|
160
|
+
# Checks whether the file has source text if it is a pdf. False otherwise
|
161
|
+
# @return [Boolean] True if the file is a PDF and has source text.
|
162
|
+
def source_text?
|
163
|
+
Mindee::PDF::PDFTools.source_text?(@io_stream)
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
# Replaces non-ASCII characters by their UNICODE escape sequence.
|
168
|
+
# Keeps other characters as is.
|
169
|
+
# @return A clean String.
|
170
|
+
def self.convert_to_unicode_escape(string)
|
171
|
+
unicode_escape_string = ''.dup
|
172
|
+
string.each_char do |char|
|
173
|
+
unicode_escape_string << if char.bytesize > 1
|
174
|
+
"\\u#{char.unpack1('U').to_s(16).rjust(4, '0')}"
|
175
|
+
else
|
176
|
+
char
|
177
|
+
end
|
178
|
+
end
|
179
|
+
unicode_escape_string
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
183
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Input
|
7
|
+
# Document source handling.
|
8
|
+
module Source
|
9
|
+
# Load a document from a path.
|
10
|
+
class PathInputSource < LocalInputSource
|
11
|
+
# @param filepath [String]
|
12
|
+
# @param fix_pdf [Boolean]
|
13
|
+
def initialize(filepath, fix_pdf: false)
|
14
|
+
io_stream = File.open(filepath, 'rb')
|
15
|
+
super(io_stream, File.basename(filepath), fix_pdf: fix_pdf)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|