mindee 3.17.0 → 3.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -0
  3. data/README.md +4 -4
  4. data/bin/mindee.rb +0 -6
  5. data/docs/code_samples/{carte_vitale_v1.txt → us_mail_v3_async.txt} +2 -2
  6. data/docs/custom_v1.md +1 -1
  7. data/docs/getting_started.md +5 -5
  8. data/docs/{us_mail_v2.md → us_mail_v3.md} +34 -12
  9. data/lib/mindee/client.rb +2 -2
  10. data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +34 -19
  11. data/lib/mindee/input/sources/base64_input_source.rb +31 -0
  12. data/lib/mindee/input/sources/bytes_input_source.rb +21 -0
  13. data/lib/mindee/input/sources/file_input_source.rb +20 -0
  14. data/lib/mindee/input/sources/local_input_source.rb +183 -0
  15. data/lib/mindee/input/sources/path_input_source.rb +20 -0
  16. data/lib/mindee/input/sources/url_input_source.rb +127 -0
  17. data/lib/mindee/input/sources.rb +6 -248
  18. data/lib/mindee/parsing/standard/boolean_field.rb +6 -0
  19. data/lib/mindee/product/ind/indian_passport/indian_passport_v1_document.rb +1 -1
  20. data/lib/mindee/product/ind/indian_passport/indian_passport_v1_page.rb +1 -1
  21. data/lib/mindee/product/{fr/carte_vitale/carte_vitale_v1.rb → us/us_mail/us_mail_v3.rb} +11 -11
  22. data/lib/mindee/product/us/us_mail/us_mail_v3_document.rb +107 -0
  23. data/lib/mindee/product/{fr/carte_vitale/carte_vitale_v1_page.rb → us/us_mail/us_mail_v3_page.rb} +8 -8
  24. data/lib/mindee/product/us/us_mail/us_mail_v3_recipient_address.rb +113 -0
  25. data/lib/mindee/product/us/us_mail/us_mail_v3_sender_address.rb +66 -0
  26. data/lib/mindee/product.rb +1 -1
  27. data/lib/mindee/version.rb +1 -1
  28. metadata +18 -10
  29. data/lib/mindee/product/fr/carte_vitale/carte_vitale_v1_document.rb +0 -52
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 12e0bc8339961d2e6be4c5dfdc9537d34ab63f449c435275324b95c031299764
4
- data.tar.gz: 24021ef9d0175075365efc9e056d67f3d553b1ad7b95a5a2876d29fcff15031c
3
+ metadata.gz: ef56483dc0576b931b3d88b57e4512abd44ea4e216ed29dc1097ada3fa5b00fb
4
+ data.tar.gz: 8d4f6c2b99c113439b2332244cd89619189704bc75655121c26ba0122118d539
5
5
  SHA512:
6
- metadata.gz: df5dc07d28d67b0e1fe1c64c3123aeb3f9c8926264aed396fdd64858f8b06d6d2485745c61d15c35e0ae3f92ee6773eaacbdc86ba3256cefd7349fb0ff90533a
7
- data.tar.gz: e07e7b776f21398ce3a9197fb54afef57361ffda29c198564da9ffa1fa0eea85830c7d1601177c241219bea60c61cbb9cbcec66ad0c80d55ffcc7aab0e8cfe4f
6
+ metadata.gz: a0461aea7ba2804e36ddc833e1ec24888227484d1ad1f1471b8b8859cfaffc14a94f623672b749fe0f73584c6c215f26d87f2e11cbeb701fc8cc9cf86088ec6f
7
+ data.tar.gz: aed38431cddde8abc620cf4701ecf7299929a0216b0a9e717d7ef56732211fb38ae12d093fca8839cbd981c43047f0a87088587c44fc7f4c9ff9f4ae56438f7d
data/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # Mindee Ruby API Library Changelog
2
2
 
3
+ ## v3.19.0 - 2025-01-14
4
+ ### Changes
5
+ * :sparkles: add support for US Mail V3
6
+ * :recycle: increase async retry timers
7
+
8
+
9
+ ## v3.18.0 - 2024-12-13
10
+ ### Changes
11
+ * :sparkles: allow local downloading of remote sources
12
+ * :coffin: remove support for (FR) Carte Vitale V1 in favor of French Health Card V1
13
+ ### Fixes
14
+ * :bug: fix tax-extraction script
15
+
16
+
3
17
  ## v3.17.0 - 2024-11-28
4
18
  ### Changes
5
19
  * :sparkles: add support for workflows
data/README.md CHANGED
@@ -116,7 +116,7 @@ result = mindee_client.parse(
116
116
  puts result.document
117
117
  ```
118
118
 
119
- ### Custom Document (API Builder)
119
+ ### Custom Documents (docTI & Custom APIs)
120
120
 
121
121
  ```ruby
122
122
  require 'mindee'
@@ -131,9 +131,9 @@ endpoint = mindee_client.create_endpoint(
131
131
  # Load a file from disk
132
132
  input_source = mindee_client.source_from_path('/path/to/the/file.ext')
133
133
 
134
- result = mindee_client.parse(
134
+ result = mindee_client.enqueue_and_parse(
135
135
  input_source,
136
- Mindee::Product::Custom::CustomV1,
136
+ Mindee::Product::Generated::GeneratedV1,
137
137
  endpoint: endpoint
138
138
  )
139
139
 
@@ -181,7 +181,7 @@ customize the experience.
181
181
  * [EU Driver License OCR Ruby](https://developers.mindee.com/docs/ruby-eu-driver-license-ocr)
182
182
  * [FR Bank Account Details OCR Ruby](https://developers.mindee.com/docs/ruby-fr-bank-account-details-ocr)
183
183
  * [FR Bank Statement OCR Ruby](https://developers.mindee.com/docs/ruby-fr-bank-statement-ocr)
184
- * [FR Carte Vitale OCR Ruby](https://developers.mindee.com/docs/ruby-fr-carte-vitale-ocr)
184
+ * [FR Health Card OCR Ruby](https://developers.mindee.com/docs/ruby-fr-health-card-ocr)
185
185
  * [FR ID Card OCR Ruby](https://developers.mindee.com/docs/ruby-fr-carte-nationale-didentite-ocr)
186
186
  * [US Bank Check OCR Ruby](https://developers.mindee.com/docs/ruby-us-bank-check-ocr)
187
187
  * [US Driver License OCR Ruby](https://developers.mindee.com/docs/ruby-us-driver-license-ocr)
data/bin/mindee.rb CHANGED
@@ -97,12 +97,6 @@ DOCUMENTS = {
97
97
  sync: false,
98
98
  async: true,
99
99
  },
100
- "fr-carte-vitale" => {
101
- description: "FR Carte Vitale",
102
- doc_class: Mindee::Product::FR::CarteVitale::CarteVitaleV1,
103
- sync: true,
104
- async: false,
105
- },
106
100
  "fr-id-card" => {
107
101
  description: "FR ID Card",
108
102
  doc_class: Mindee::Product::FR::IdCard::IdCardV2,
@@ -7,9 +7,9 @@ mindee_client = Mindee::Client.new(api_key: 'my-api-key')
7
7
  input_source = mindee_client.source_from_path('/path/to/the/file.ext')
8
8
 
9
9
  # Parse the file
10
- result = mindee_client.parse(
10
+ result = mindee_client.enqueue_and_parse(
11
11
  input_source,
12
- Mindee::Product::FR::CarteVitale::CarteVitaleV1
12
+ Mindee::Product::US::UsMail::UsMailV3
13
13
  )
14
14
 
15
15
  # Print a full summary of the parsed data in RST format
data/docs/custom_v1.md CHANGED
@@ -4,7 +4,7 @@ category: 622b805aaec68102ea7fcbc2
4
4
  slug: ruby-api-builder-ocr
5
5
  parentDoc: 6294d97ee723f1008d2ab28e
6
6
  ---
7
- > 🚧 This product is still supported, but is considered to be deprecated. If you are looking for the DocTI API documentation, you can find it [here](https://developers.mindee.com/docs/ruby-generated-ocr).
7
+ > 🚧 This product is still supported, but is considered to be deprecated. If you are looking for the docTI API documentation, you can find it [here](https://developers.mindee.com/docs/ruby-generated-ocr).
8
8
 
9
9
  # Quick-Start
10
10
 
@@ -226,20 +226,20 @@ result = mindee_client.parse(
226
226
  )
227
227
  ```
228
228
 
229
- ### Custom Documents
229
+ ### Custom Documents (docTI)
230
230
  For custom documents, the endpoint to use must also be set, and it must take in an `endpoint_name`:
231
231
 
232
232
  ```ruby
233
- endpoint = mindee_client.create_endpoint(endpoint_name: 'wnine')
233
+ endpoint = mindee_client.create_endpoint(endpoint_name: 'wnine', account_name: 'my-account')
234
234
 
235
- result = mindee_client.parse(
235
+ result = mindee_client.enqueue_and_parse(
236
236
  input_source,
237
- Mindee::Product::Custom::CustomV1,
237
+ Mindee::Product::Generated::GeneratedV1,
238
238
  endpoint: endpoint
239
239
  )
240
240
  ```
241
241
 
242
- This is because the `CustomV1` class is enough to handle the return processing, but the actual endpoint needs to be specified.
242
+ This is because the `GeneratedV1` class is enough to handle the return processing, but the actual endpoint needs to be specified.
243
243
 
244
244
  ## Process the Result
245
245
  The response object is common to all documents, including custom documents. The main properties are:
@@ -22,7 +22,7 @@ input_source = mindee_client.source_from_path('/path/to/the/file.ext')
22
22
  # Parse the file
23
23
  result = mindee_client.enqueue_and_parse(
24
24
  input_source,
25
- Mindee::Product::US::UsMail::UsMailV2
25
+ Mindee::Product::US::UsMail::UsMailV3
26
26
  )
27
27
 
28
28
  # Print a full summary of the parsed data in RST format
@@ -35,7 +35,20 @@ puts result.document
35
35
 
36
36
  **Output (RST):**
37
37
  ```rst
38
- :Sender Name: zed
38
+ ########
39
+ Document
40
+ ########
41
+ :Mindee ID: f9c36f59-977d-4ddc-9f2d-31c294c456ac
42
+ :Filename: default_sample.jpg
43
+
44
+ Inference
45
+ #########
46
+ :Product: mindee/us_mail v3.0
47
+ :Rotation applied: Yes
48
+
49
+ Prediction
50
+ ==========
51
+ :Sender Name: company zed
39
52
  :Sender Address:
40
53
  :City: Dallas
41
54
  :Complete Address: 54321 Elm Street, Dallas, Texas 54321
@@ -44,11 +57,12 @@ puts result.document
44
57
  :Street: 54321 Elm Street
45
58
  :Recipient Names: Jane Doe
46
59
  :Recipient Addresses:
47
- +-----------------+-------------------------------------+-------------------+-------------+------------------------+-------+---------------------------+
48
- | City | Complete Address | Is Address Change | Postal Code | Private Mailbox Number | State | Street |
49
- +=================+=====================================+===================+=============+========================+=======+===========================+
50
- | Detroit | 1234 Market Street PMB 4321, Det... | | 12345 | 4321 | MI | 1234 Market Street |
51
- +-----------------+-------------------------------------+-------------------+-------------+------------------------+-------+---------------------------+
60
+ +-----------------+-------------------------------------+-------------------+-------------+------------------------+-------+---------------------------+-----------------+
61
+ | City | Complete Address | Is Address Change | Postal Code | Private Mailbox Number | State | Street | Unit |
62
+ +=================+=====================================+===================+=============+========================+=======+===========================+=================+
63
+ | Detroit | 1234 Market Street PMB 4321, Det... | False | 12345 | 4321 | MI | 1234 Market Street | |
64
+ +-----------------+-------------------------------------+-------------------+-------------+------------------------+-------+---------------------------+-----------------+
65
+ :Return to Sender: False
52
66
  ```
53
67
 
54
68
  # Field Types
@@ -78,7 +92,7 @@ Fields which are specific to this product; they are not used in any other produc
78
92
  ### Recipient Addresses Field
79
93
  The addresses of the recipients.
80
94
 
81
- A `UsMailV2RecipientAddress` implements the following attributes:
95
+ A `UsMailV3RecipientAddress` implements the following attributes:
82
96
 
83
97
  * `city` (String): The city of the recipient's address.
84
98
  * `complete` (String): The complete address of the recipient.
@@ -87,12 +101,13 @@ A `UsMailV2RecipientAddress` implements the following attributes:
87
101
  * `private_mailbox_number` (String): The private mailbox number of the recipient's address.
88
102
  * `state` (String): Second part of the ISO 3166-2 code, consisting of two letters indicating the US State.
89
103
  * `street` (String): The street of the recipient's address.
104
+ * `unit` (String): The unit number of the recipient's address.
90
105
  Fields which are specific to this product; they are not used in any other product.
91
106
 
92
107
  ### Sender Address Field
93
108
  The address of the sender.
94
109
 
95
- A `UsMailV2SenderAddress` implements the following attributes:
110
+ A `UsMailV3SenderAddress` implements the following attributes:
96
111
 
97
112
  * `city` (String): The city of the sender's address.
98
113
  * `complete` (String): The complete address of the sender.
@@ -101,10 +116,17 @@ A `UsMailV2SenderAddress` implements the following attributes:
101
116
  * `street` (String): The street of the sender's address.
102
117
 
103
118
  # Attributes
104
- The following fields are extracted for US Mail V2:
119
+ The following fields are extracted for US Mail V3:
120
+
121
+ ## Return to Sender
122
+ **is_return_to_sender** ([BooleanField](#boolean-field)): Whether the mailing is marked as return to sender.
123
+
124
+ ```rb
125
+ puts result.document.inference.prediction.is_return_to_sender.value
126
+ ```
105
127
 
106
128
  ## Recipient Addresses
107
- **recipient_addresses** (Array<[UsMailV2RecipientAddress](#recipient-addresses-field)>): The addresses of the recipients.
129
+ **recipient_addresses** (Array<[UsMailV3RecipientAddress](#recipient-addresses-field)>): The addresses of the recipients.
108
130
 
109
131
  ```rb
110
132
  for recipient_addresses_elem in result.document.inference.prediction.recipient_addresses do
@@ -122,7 +144,7 @@ end
122
144
  ```
123
145
 
124
146
  ## Sender Address
125
- **sender_address** ([UsMailV2SenderAddress](#sender-address-field)): The address of the sender.
147
+ **sender_address** ([UsMailV3SenderAddress](#sender-address-field)): The address of the sender.
126
148
 
127
149
  ```rb
128
150
  puts result.document.inference.prediction.sender_address.value
data/lib/mindee/client.rb CHANGED
@@ -151,7 +151,7 @@ module Mindee
151
151
  # This performs a cropping operation on the server and will increase response time.
152
152
  # @param initial_delay_sec [Integer, Float] initial delay before polling. Defaults to 2.
153
153
  # @param delay_sec [Integer, Float] delay between polling attempts. Defaults to 1.5.
154
- # @param max_retries [Integer] maximum amount of retries. Defaults to 60.
154
+ # @param max_retries [Integer] maximum amount of retries. Defaults to 80.
155
155
  # @return [Mindee::Parsing::Common::ApiResponse]
156
156
  def enqueue_and_parse(
157
157
  input_source,
@@ -164,7 +164,7 @@ module Mindee
164
164
  cropper: false,
165
165
  initial_delay_sec: 2,
166
166
  delay_sec: 1.5,
167
- max_retries: 60
167
+ max_retries: 80
168
168
  )
169
169
  enqueue_res = enqueue(
170
170
  input_source,
@@ -2,6 +2,8 @@
2
2
 
3
3
  require_relative 'ocr_extractor'
4
4
 
5
+ # rubocop:disable Metrics/ClassLength
6
+
5
7
  module Mindee
6
8
  module Extraction
7
9
  # Tax extractor class
@@ -72,9 +74,12 @@ module Mindee
72
74
  reconstructed_hash['code'] =
73
75
  found_hash['code'].nil? ? found_hash['code'] : found_hash['code'].sub(%r{\s*\.*\s*$}, '')
74
76
 
75
- if found_hash['rate'] && found_hash['rate'] < 1 && (found_hash['rate']).positive?
76
- found_hash['rate'] =
77
- found_hash['rate'] * 100
77
+ if found_hash['rate']
78
+ if found_hash['rate'].abs < 1
79
+ found_hash['rate'] *= 10
80
+ elsif found_hash['rate'].abs > 100
81
+ found_hash['rate'] /= 10
82
+ end
78
83
  end
79
84
  found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
80
85
  found_hash = decimate_rates_if_needed(found_hash)
@@ -125,18 +130,28 @@ module Mindee
125
130
  # @param found_hash [Hash] Hash of currently retrieved values
126
131
  # @return [Hash]
127
132
  def self.set_base_and_value(reconstructed_hash, found_hash)
128
- if found_hash['base'].nil?
129
- reconstructed_hash['base'] = found_hash['base']
130
- reconstructed_hash['value'] = found_hash['value']
131
- elsif found_hash['value'].nil? && found_hash['base'] < found_hash['value']
132
- reconstructed_hash['base'] = found_hash['value']
133
- reconstructed_hash['value'] = found_hash['base']
134
- else
135
- reconstructed_hash['value'] = found_hash['value']
133
+ base = found_hash['base']
134
+ value = found_hash['value']
135
+
136
+ if base && value
137
+ reconstructed_hash['base'], reconstructed_hash['value'] = [base, value].minmax
138
+ elsif base
139
+ reconstructed_hash['base'] = base
140
+ elsif value
141
+ reconstructed_hash['value'] = value
142
+ calculate_base(reconstructed_hash)
136
143
  end
144
+
137
145
  reconstructed_hash
138
146
  end
139
147
 
148
+ def self.calculate_base(hash)
149
+ rate = hash['rate']
150
+ return unless rate&.positive?
151
+
152
+ hash['base'] = hash['value'] / (rate / 100.0)
153
+ end
154
+
140
155
  # Extracts a single custom type of tax.
141
156
  # For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
142
157
  # @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
@@ -149,7 +164,6 @@ module Mindee
149
164
 
150
165
  tax_names.sort!
151
166
  found_hash = pick_best(extract_horizontal_tax(ocr_result, tax_names), tax_names)
152
- # a tax is considered found horizontally if it has a value, otherwise it is vertical
153
167
  if found_hash.nil? || found_hash['value'].nil?
154
168
  found_hash = extract_vertical_tax(ocr_result, tax_names,
155
169
  found_hash)
@@ -240,14 +254,14 @@ module Mindee
240
254
  linear_pattern_percent_first = %r{
241
255
  ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
242
256
  ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
243
- ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
244
- ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
257
+ ((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
258
+ ((?:\s*-\s*)?(\d*[.,])*\d+)?
245
259
  }x
246
260
  linear_pattern_percent_second = %r{
247
261
  ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
248
262
  ((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
249
- ((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
250
- ((?:\s*-\s*)?(\d*[.,])*\d{2,})?
263
+ ((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
264
+ ((?:\s*-\s*)?(\d*[.,])*\d+)?
251
265
  }x
252
266
  ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
253
267
  page.all_lines.each do |line|
@@ -304,7 +318,7 @@ module Mindee
304
318
  page.all_words.each do |word|
305
319
  next if match_index(word.text, tax_names).nil?
306
320
 
307
- reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id)
321
+ reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id, 0.25)
308
322
  found_hash['page_id'] = page_id if found_hash['page_id'].nil?
309
323
  found_hash['code'] = word.text.strip if found_hash['code'].nil?
310
324
  found_hash = extract_vertical_tax_values(reconstructed_line, found_hash)
@@ -316,8 +330,9 @@ module Mindee
316
330
  private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
317
331
  :extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
318
332
  :create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
319
- :decimate_rates_if_needed, :extract_basis_and_value, :set_base_and_value, :valid_candidate?,
320
- :swap_rates_if_needed
333
+ :decimate_rates_if_needed, :set_base_and_value, :valid_candidate?,
334
+ :swap_rates_if_needed, :calculate_base
321
335
  end
322
336
  end
323
337
  end
338
+ # rubocop:enable Metrics/ClassLength
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ module Mindee
6
+ module Input
7
+ module Source
8
+ # Load a document from a base64 string.
9
+ class Base64InputSource < LocalInputSource
10
+ # @param base64_string [String]
11
+ # @param filename [String]
12
+ # @param fix_pdf [Boolean]
13
+ def initialize(base64_string, filename, fix_pdf: false)
14
+ io_stream = StringIO.new(base64_string.unpack1('m*'))
15
+ io_stream.set_encoding Encoding::BINARY
16
+ super(io_stream, filename, fix_pdf: fix_pdf)
17
+ end
18
+
19
+ # Overload of the same function to prevent a base64 from being re-encoded.
20
+ # @param close [Boolean]
21
+ # @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
22
+ def read_document(close: true)
23
+ @io_stream.seek(0)
24
+ data = @io_stream.read
25
+ @io_stream.close if close
26
+ ['document', [data].pack('m'), { filename: Source.convert_to_unicode_escape(@filename) }]
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ module Mindee
6
+ module Input
7
+ module Source
8
+ # Load a document from raw bytes.
9
+ class BytesInputSource < LocalInputSource
10
+ # @param raw_bytes [String]
11
+ # @param filename [String]
12
+ # @param fix_pdf [Boolean]
13
+ def initialize(raw_bytes, filename, fix_pdf: false)
14
+ io_stream = StringIO.new(raw_bytes)
15
+ io_stream.set_encoding Encoding::BINARY
16
+ super(io_stream, filename, fix_pdf: fix_pdf)
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ module Mindee
6
+ module Input
7
+ module Source
8
+ # Load a document from a file handle.
9
+ class FileInputSource < LocalInputSource
10
+ # @param input_file [File]
11
+ # @param filename [String]
12
+ # @param fix_pdf [Boolean]
13
+ def initialize(input_file, filename, fix_pdf: false)
14
+ io_stream = input_file
15
+ super(io_stream, filename, fix_pdf: fix_pdf)
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,183 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+ require 'marcel'
5
+
6
+ require_relative '../../pdf'
7
+ require_relative '../../image'
8
+
9
+ module Mindee
10
+ module Input
11
+ # Document source handling.
12
+ module Source
13
+ # Mime types accepted by the server.
14
+ ALLOWED_MIME_TYPES = [
15
+ 'application/pdf',
16
+ 'image/heic',
17
+ 'image/png',
18
+ 'image/jpeg',
19
+ 'image/tiff',
20
+ 'image/webp',
21
+ ].freeze
22
+
23
+ # Standard error for invalid mime types
24
+ class MimeTypeError < StandardError
25
+ end
26
+
27
+ # Error sent if the file's mimetype isn't allowed
28
+ class InvalidMimeTypeError < MimeTypeError
29
+ # @return [String]
30
+ attr_reader :invalid_mimetype
31
+
32
+ # @param mime_type [String]
33
+ def initialize(mime_type)
34
+ @invalid_mimetype = mime_type
35
+ super("'#{@invalid_mimetype}' mime type not allowed, must be one of #{ALLOWED_MIME_TYPES.join(', ')}")
36
+ end
37
+ end
38
+
39
+ # Error sent if a pdf file couldn't be fixed
40
+ class UnfixablePDFError < MimeTypeError
41
+ def initialize
42
+ super("Corrupted PDF couldn't be repaired.")
43
+ end
44
+ end
45
+
46
+ # Base class for loading documents.
47
+ class LocalInputSource
48
+ # @return [String]
49
+ attr_reader :filename
50
+ # @return [String]
51
+ attr_reader :file_mimetype
52
+ # @return [StringIO]
53
+ attr_reader :io_stream
54
+
55
+ # @param io_stream [StringIO]
56
+ # @param filename [String]
57
+ # @param fix_pdf [Boolean]
58
+ def initialize(io_stream, filename, fix_pdf: false)
59
+ @io_stream = io_stream
60
+ @filename = filename
61
+ @file_mimetype = if fix_pdf
62
+ Marcel::MimeType.for @io_stream
63
+ else
64
+ Marcel::MimeType.for @io_stream, name: @filename
65
+ end
66
+ return if ALLOWED_MIME_TYPES.include? @file_mimetype
67
+
68
+ if filename.end_with?('.pdf') && fix_pdf
69
+ rescue_broken_pdf(@io_stream)
70
+ @file_mimetype = Marcel::MimeType.for @io_stream
71
+
72
+ return if ALLOWED_MIME_TYPES.include? @file_mimetype
73
+ end
74
+
75
+ raise InvalidMimeTypeError, @file_mimetype.to_s
76
+ end
77
+
78
+ # Attempts to fix pdf files if mimetype is rejected.
79
+ # "Broken PDFs" are often the result of a third party injecting invalid headers.
80
+ # This attempts to remove them and send the file
81
+ # @param stream [StringIO]
82
+ def rescue_broken_pdf(stream)
83
+ stream.gets('%PDF-')
84
+ raise UnfixablePDFError if stream.eof? || stream.pos > 500
85
+
86
+ stream.pos = stream.pos - 5
87
+ data = stream.read
88
+ @io_stream.close
89
+
90
+ @io_stream = StringIO.new
91
+ @io_stream << data
92
+ end
93
+
94
+ # Shorthand for pdf mimetype validation.
95
+ def pdf?
96
+ @file_mimetype.to_s == 'application/pdf'
97
+ end
98
+
99
+ # Parses a PDF file according to provided options.
100
+ # @param options [Hash, nil] Page cutting/merge options:
101
+ #
102
+ # * `:page_indexes` Zero-based list of page indexes.
103
+ # * `:operation` Operation to apply on the document, given the `page_indexes specified:
104
+ # * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
105
+ # * `:REMOVE` - remove the specified pages, and keep all others.
106
+ # * `:on_min_pages` Apply the operation only if document has at least this many pages.
107
+ def process_pdf(options)
108
+ @io_stream.seek(0)
109
+ @io_stream = PdfProcessor.parse(@io_stream, options)
110
+ end
111
+
112
+ # Reads a document.
113
+ # @param close [Boolean]
114
+ # @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
115
+ def read_document(close: true)
116
+ @io_stream.seek(0)
117
+ # Avoids needlessly re-packing some files
118
+ data = @io_stream.read
119
+ @io_stream.close if close
120
+ ['document', data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
121
+ end
122
+
123
+ def count_pdf_pages
124
+ return 1 unless pdf?
125
+
126
+ @io_stream.seek(0)
127
+ pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
128
+ pdf_processor.pages.size
129
+ end
130
+
131
+ # Compresses the file, according to the provided info.
132
+ # @param [Integer] quality Quality of the output file.
133
+ # @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
134
+ # @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
135
+ # @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
136
+ # This will attempt to re-render PDF text over the rasterized original. If disabled, the operation is ignored.
137
+ # WARNING: this operation is strongly discouraged.
138
+ # @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
139
+ # not. Needs force_source_text to work.
140
+ def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
141
+ buffer = if pdf?
142
+ Mindee::PDF::PDFCompressor.compress_pdf(
143
+ @io_stream,
144
+ quality: quality,
145
+ force_source_text_compression: force_source_text,
146
+ disable_source_text: disable_source_text
147
+ )
148
+ else
149
+ Mindee::Image::ImageCompressor.compress_image(
150
+ @io_stream,
151
+ quality: quality,
152
+ max_width: max_width,
153
+ max_height: max_height
154
+ )
155
+ end
156
+ @io_stream = buffer
157
+ @io_stream.rewind
158
+ end
159
+
160
+ # Checks whether the file has source text if it is a pdf. False otherwise
161
+ # @return [Boolean] True if the file is a PDF and has source text.
162
+ def source_text?
163
+ Mindee::PDF::PDFTools.source_text?(@io_stream)
164
+ end
165
+ end
166
+
167
+ # Replaces non-ASCII characters with their Unicode escape sequence.
168
+ # Keeps other characters as is.
169
+ # @return A clean String.
170
+ def self.convert_to_unicode_escape(string)
171
+ unicode_escape_string = ''.dup
172
+ string.each_char do |char|
173
+ unicode_escape_string << if char.bytesize > 1
174
+ "\\u#{char.unpack1('U').to_s(16).rjust(4, '0')}"
175
+ else
176
+ char
177
+ end
178
+ end
179
+ unicode_escape_string
180
+ end
181
+ end
182
+ end
183
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ module Mindee
6
+ module Input
7
+ # Document source handling.
8
+ module Source
9
+ # Load a document from a path.
10
+ class PathInputSource < LocalInputSource
11
+ # @param filepath [String]
12
+ # @param fix_pdf [Boolean]
13
+ def initialize(filepath, fix_pdf: false)
14
+ io_stream = File.open(filepath, 'rb')
15
+ super(io_stream, File.basename(filepath), fix_pdf: fix_pdf)
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end