mindee 3.16.0 → 3.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/README.md +4 -4
- data/bin/mindee.rb +20 -8
- data/docs/code_samples/{international_id_v1_async.txt → driver_license_v1_async.txt} +1 -1
- data/docs/code_samples/french_healthcard_v1_async.txt +19 -0
- data/docs/code_samples/{carte_vitale_v1.txt → payslip_fra_v3_async.txt} +2 -2
- data/docs/code_samples/workflow_execution.txt +29 -0
- data/docs/custom_v1.md +1 -1
- data/docs/driver_license_v1.md +156 -0
- data/docs/{carte_vitale_v1.md → french_healthcard_v1.md} +14 -24
- data/docs/getting_started.md +5 -5
- data/docs/payslip_fra_v3.md +319 -0
- data/lib/mindee/client.rb +40 -0
- data/lib/mindee/extraction/tax_extractor/tax_extractor.rb +34 -19
- data/lib/mindee/http/workflow_endpoint.rb +90 -0
- data/lib/mindee/http.rb +1 -0
- data/lib/mindee/input/sources/base64_input_source.rb +31 -0
- data/lib/mindee/input/sources/bytes_input_source.rb +21 -0
- data/lib/mindee/input/sources/file_input_source.rb +20 -0
- data/lib/mindee/input/sources/local_input_source.rb +183 -0
- data/lib/mindee/input/sources/path_input_source.rb +20 -0
- data/lib/mindee/input/sources/url_input_source.rb +127 -0
- data/lib/mindee/input/sources.rb +6 -248
- data/lib/mindee/parsing/common/api_response.rb +22 -1
- data/lib/mindee/parsing/common/execution.rb +73 -0
- data/lib/mindee/parsing/common/execution_file.rb +24 -0
- data/lib/mindee/parsing/common/execution_priority.rb +30 -0
- data/lib/mindee/parsing/common.rb +3 -0
- data/lib/mindee/product/{international_id/international_id_v1.rb → driver_license/driver_license_v1.rb} +9 -9
- data/lib/mindee/product/driver_license/driver_license_v1_document.rb +91 -0
- data/lib/mindee/product/{international_id/international_id_v1_page.rb → driver_license/driver_license_v1_page.rb} +7 -7
- data/lib/mindee/product/fr/{carte_vitale/carte_vitale_v1.rb → health_card/health_card_v1.rb} +9 -9
- data/lib/mindee/product/fr/{carte_vitale/carte_vitale_v1_document.rb → health_card/health_card_v1_document.rb} +6 -6
- data/lib/mindee/product/fr/{carte_vitale/carte_vitale_v1_page.rb → health_card/health_card_v1_page.rb} +7 -7
- data/lib/mindee/product/fr/payslip/payslip_v3.rb +41 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_bank_account_detail.rb +54 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_document.rb +166 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_employee.rb +78 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_employer.rb +78 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_employment.rb +78 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_page.rb +34 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_paid_time_off.rb +89 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_pay_detail.rb +100 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_pay_period.rb +66 -0
- data/lib/mindee/product/fr/payslip/payslip_v3_salary_detail.rb +89 -0
- data/lib/mindee/product/resume/resume_v1_document.rb +1 -1
- data/lib/mindee/product/resume/resume_v1_page.rb +1 -1
- data/lib/mindee/product.rb +3 -2
- data/lib/mindee/version.rb +1 -1
- metadata +36 -14
- data/docs/eu_driver_license_v1.md +0 -227
- data/docs/proof_of_address_v1.md +0 -211
- data/docs/us_driver_license_v1.md +0 -272
- data/lib/mindee/product/international_id/international_id_v1_document.rb +0 -109
@@ -0,0 +1,183 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
require 'marcel'
|
5
|
+
|
6
|
+
require_relative '../../pdf'
|
7
|
+
require_relative '../../image'
|
8
|
+
|
9
|
+
module Mindee
|
10
|
+
module Input
|
11
|
+
# Document source handling.
|
12
|
+
module Source
|
13
|
+
# MIME types the server accepts for uploaded documents.
ALLOWED_MIME_TYPES = %w[
  application/pdf
  image/heic
  image/png
  image/jpeg
  image/tiff
  image/webp
].freeze

# Common ancestor of the MIME-type related errors.
class MimeTypeError < StandardError; end

# Raised when a file's MIME type is not listed in ALLOWED_MIME_TYPES.
class InvalidMimeTypeError < MimeTypeError
  # @return [String] The MIME type that was rejected.
  attr_reader :invalid_mimetype

  # @param mime_type [String] The MIME type that was rejected.
  def initialize(mime_type)
    @invalid_mimetype = mime_type
    accepted = ALLOWED_MIME_TYPES.join(', ')
    super("'#{mime_type}' mime type not allowed, must be one of #{accepted}")
  end
end

# Raised when a corrupted PDF file could not be repaired.
class UnfixablePDFError < MimeTypeError
  def initialize
    super("Corrupted PDF couldn't be repaired.")
  end
end
|
45
|
+
|
46
|
+
# Base class for loading documents.
# Keeps the document's stream, file name and detected MIME type, and offers helpers to
# repair, cut, read, count the pages of and compress the document.
class LocalInputSource
  # @return [String] Name of the file.
  attr_reader :filename
  # @return [String] MIME type detected for the stream.
  attr_reader :file_mimetype
  # @return [StringIO] Stream holding the document's content.
  attr_reader :io_stream

  # Detects the MIME type of the stream and rejects it unless it is listed in ALLOWED_MIME_TYPES.
  #
  # @param io_stream [StringIO]
  # @param filename [String]
  # @param fix_pdf [Boolean] When true and the file name ends with '.pdf', a rejected file goes
  #   through `rescue_broken_pdf` and is checked a second time.
  # @raise [InvalidMimeTypeError] If the MIME type is still not allowed.
  # @raise [UnfixablePDFError] If the PDF repair was attempted and failed.
  def initialize(io_stream, filename, fix_pdf: false)
    @io_stream = io_stream
    @filename = filename
    # With fix_pdf, detection relies on the content alone (the file name is not given to Marcel).
    @file_mimetype = if fix_pdf
                       Marcel::MimeType.for @io_stream
                     else
                       Marcel::MimeType.for @io_stream, name: @filename
                     end
    return if ALLOWED_MIME_TYPES.include? @file_mimetype

    if filename.end_with?('.pdf') && fix_pdf
      rescue_broken_pdf(@io_stream)
      @file_mimetype = Marcel::MimeType.for @io_stream

      return if ALLOWED_MIME_TYPES.include? @file_mimetype
    end

    raise InvalidMimeTypeError, @file_mimetype.to_s
  end

  # Attempts to fix pdf files if mimetype is rejected.
  # "Broken PDFs" are often a result of third-party injecting invalid headers.
  # This drops everything located before the first '%PDF-' marker and stores the remainder in a
  # fresh stream, replacing `@io_stream`.
  # @param stream [StringIO]
  # @raise [UnfixablePDFError] If no marker is found before the end of the stream, or if the
  #   position right after the marker is beyond 500 bytes.
  def rescue_broken_pdf(stream)
    stream.gets('%PDF-')
    raise UnfixablePDFError if stream.eof? || stream.pos > 500

    # Step back over the 5-byte '%PDF-' marker so that it is part of the repaired content.
    stream.pos = stream.pos - 5
    data = stream.read
    @io_stream.close

    # Building the stream from the data (rather than appending to an empty one) leaves it
    # positioned at the start, so readers that do not rewind still see the whole document.
    @io_stream = StringIO.new(data)
    @io_stream.set_encoding Encoding::BINARY
  end

  # Shorthand for pdf mimetype validation.
  # @return [Boolean]
  def pdf?
    @file_mimetype.to_s == 'application/pdf'
  end

  # Parses a PDF file according to provided options, and replaces the stream with the result.
  # @param options [Hash, nil] Page cutting/merge options:
  #
  #  * `:page_indexes` Zero-based list of page indexes.
  #  * `:operation` Operation to apply on the document, given the `page_indexes` specified:
  #      * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
  #      * `:REMOVE` - remove the specified pages, and keep all others.
  #  * `:on_min_pages` Apply the operation only if document has at least this many pages.
  def process_pdf(options)
    @io_stream.seek(0)
    # Fully-qualified constant, as in `count_pdf_pages`.
    @io_stream = Mindee::PDF::PdfProcessor.parse(@io_stream, options)
  end

  # Reads a document.
  # @param close [Boolean] Whether to close the stream once its content has been read.
  # @return [Array(String, String, Hash)] The 'document' label, the raw content, and a hash
  #   holding the unicode-escaped file name.
  def read_document(close: true)
    @io_stream.seek(0)
    # Avoids needlessly re-packing some files
    data = @io_stream.read
    @io_stream.close if close
    ['document', data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
  end

  # Counts the pages of the document.
  # @return [Integer] Number of pages of a PDF, 1 for any other file type.
  def count_pdf_pages
    return 1 unless pdf?

    @io_stream.seek(0)
    pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
    pdf_processor.pages.size
  end

  # Compresses the file, according to the provided info, and replaces the stream with the result.
  # @param [Integer] quality Quality of the output file.
  # @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
  # @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
  # @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
  #   This will attempt to re-render PDF text over the rasterized original. If disabled, ignores the operation.
  #   WARNING: this operation is strongly discouraged.
  # @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
  #   not. Needs force_source_text to work.
  def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
    buffer = if pdf?
               Mindee::PDF::PDFCompressor.compress_pdf(
                 @io_stream,
                 quality: quality,
                 force_source_text_compression: force_source_text,
                 disable_source_text: disable_source_text
               )
             else
               Mindee::Image::ImageCompressor.compress_image(
                 @io_stream,
                 quality: quality,
                 max_width: max_width,
                 max_height: max_height
               )
             end
    @io_stream = buffer
    @io_stream.rewind
  end

  # Checks whether the file has source text if it is a pdf. False otherwise
  # @return [Boolean] True if the file is a PDF and has source text.
  def source_text?
    Mindee::PDF::PDFTools.source_text?(@io_stream)
  end
end
|
166
|
+
|
167
|
+
# Escapes every multi-byte character as a backslash-u sequence holding its code point in
# lowercase hexadecimal, left-padded with zeros to at least 4 digits.
# Single-byte characters are copied untouched.
# @param string [String]
# @return [String] The escaped copy.
def self.convert_to_unicode_escape(string)
  string.each_char.with_object(+'') do |character, escaped|
    next escaped << character unless character.bytesize > 1

    escaped << format('\\u%04x', character.unpack1('U'))
  end
end
|
181
|
+
end
|
182
|
+
end
|
183
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
module Mindee
|
6
|
+
module Input
|
7
|
+
# Document source handling.
|
8
|
+
module Source
|
9
|
+
# Load a document from a path.
class PathInputSource < LocalInputSource
  # Opens the file in binary mode and hands it to the parent constructor, named after the
  # last component of the path.
  # @param filepath [String]
  # @param fix_pdf [Boolean]
  def initialize(filepath, fix_pdf: false)
    io_stream = File.open(filepath, 'rb')
    super(io_stream, File.basename(filepath), fix_pdf: fix_pdf)
  rescue StandardError
    # Do not leak the file handle when the parent constructor rejects the document.
    io_stream.close if io_stream && !io_stream.closed?
    raise
  end
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'net/http'
|
4
|
+
require 'uri'
|
5
|
+
require 'fileutils'
|
6
|
+
|
7
|
+
module Mindee
|
8
|
+
module Input
|
9
|
+
module Source
|
10
|
+
# Load a remote document from a file url.
class UrlInputSource
  # @return [String]
  attr_reader :url

  # @param url [String] URL of the remote file, which must start with 'https://'.
  # @raise [RuntimeError] If the URL does not start with 'https://'.
  def initialize(url)
    raise 'URL must be HTTPS' unless url.start_with? 'https://'

    @url = url
  end

  # Downloads the file from the URL and saves it to the specified path.
  #
  # @param path [String] Path to save the file to.
  # @param filename [String, nil] Optional name to give to the file.
  # @param username [String, nil] Optional username for authentication.
  # @param password [String, nil] Optional password for authentication.
  # @param token [String, nil] Optional token for JWT-based authentication.
  # @param max_redirects [Integer] Maximum amount of redirects to follow.
  # @return [String] The full path of the saved file.
  def save_to_file(path, filename: nil, username: nil, password: nil, token: nil, max_redirects: 3)
    response_body = fetch_file_content(username: username, password: password, token: token,
                                       max_redirects: max_redirects)

    filename = fill_filename(filename)

    full_path = File.join(path.chomp('/'), filename)
    # Binary write: the downloaded bytes are stored as-is, without newline or encoding translation.
    File.binwrite(full_path, response_body)

    full_path
  end

  # Downloads the file from the url, and returns a BytesInputSource wrapper object for it.
  #
  # @param filename [String, nil] Optional name to give to the file.
  # @param username [String, nil] Optional username for authentication.
  # @param password [String, nil] Optional password for authentication.
  # @param token [String, nil] Optional token for JWT-based authentication.
  # @param max_redirects [Integer] Maximum amount of redirects to follow.
  # @return [BytesInputSource] Input source wrapping the downloaded content.
  def as_local_input_source(filename: nil, username: nil, password: nil, token: nil, max_redirects: 3)
    filename = fill_filename(filename)
    response_body = fetch_file_content(username: username, password: password, token: token,
                                       max_redirects: max_redirects)

    BytesInputSource.new(response_body, filename)
  end

  # Fetches the file content from the URL.
  # When both a token and a username/password pair are given, the basic authentication is set
  # last on the request.
  #
  # @param username [String, nil] Optional username for authentication.
  # @param password [String, nil] Optional password for authentication.
  # @param token [String, nil] Optional token for JWT-based authentication.
  # @param max_redirects [Integer] Maximum amount of redirects to follow.
  # @return [String] The downloaded file content.
  # @raise [RuntimeError] If the final response's status code is outside of the 200-299 range.
  def fetch_file_content(username: nil, password: nil, token: nil, max_redirects: 3)
    uri = URI.parse(@url)
    request = Net::HTTP::Get.new(uri)

    request['Authorization'] = "Bearer #{token}" if token
    request.basic_auth(username, password) if username && password

    response = make_request(uri, request, max_redirects)
    status = response.code.to_i
    raise "Failed to download file: HTTP status code #{response.code}" if status > 299
    raise "Failed to download file: Invalid response code #{response.code}." if status < 200

    response.body
  end

  private

  # @param uri [URI::Generic]
  # @return [String] Last component of the URL's path.
  def extract_filename_from_url(uri)
    File.basename(uri.path)
  end

  # Picks the name of the file: the given one, or else the last component of the URL's path.
  # A name that is empty or has no extension is replaced by a generated one.
  # @param filename [String, nil]
  # @return [String]
  def fill_filename(filename)
    filename ||= extract_filename_from_url(URI.parse(@url))
    if filename.empty? || File.extname(filename).empty?
      filename = generate_file_name(extension: get_file_extension(filename))
    end
    filename
  end

  # Sends the request over SSL, following up to `max_redirects` redirections.
  # Each redirection is followed with a fresh GET request, which carries none of the headers of
  # the previous request.
  # @param uri [URI::Generic]
  # @param request [Net::HTTP::Get]
  # @param max_redirects [Integer]
  # @return [Net::HTTPResponse] The last response received.
  # @raise [RuntimeError] If a redirection has no 'location' header.
  def make_request(uri, request, max_redirects)
    # The connection is released before a redirection is followed.
    response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |http| http.request(request) }
    return response unless response.is_a?(Net::HTTPRedirection) && max_redirects.positive?

    location = response['location']
    raise 'No location in redirection header.' if location.nil?

    new_uri = resolve_redirect(uri, location)
    make_request(new_uri, Net::HTTP::Get.new(new_uri), max_redirects - 1)
  end

  # Resolves a redirection target against the URI that produced it, so that a relative
  # 'location' header (e.g. '/other/path') keeps the scheme and host of the current request.
  # @param uri [URI::Generic] URI of the request that was redirected.
  # @param location [String] Value of the 'location' header.
  # @return [URI::Generic]
  def resolve_redirect(uri, location)
    uri.merge(location)
  end

  # @param filename [String]
  # @return [String, nil] Lowercase extension (with its leading dot), nil if there is none.
  def get_file_extension(filename)
    ext = File.extname(filename)
    ext.empty? ? nil : ext.downcase
  end

  # Builds a name from the current time and 8 random alphanumeric characters.
  # @param extension [String, nil] Extension to append, '.tmp' when nil.
  # @return [String]
  def generate_file_name(extension: nil)
    extension ||= '.tmp'
    random_string = Array.new(8) { rand(36).to_s(36) }.join
    "mindee_temp_#{Time.now.strftime('%Y-%m-%d_%H-%M-%S')}_#{random_string}#{extension}"
  end
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
data/lib/mindee/input/sources.rb
CHANGED
@@ -1,250 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
require_relative '
|
7
|
-
require_relative '
|
8
|
-
|
9
|
-
module Mindee
|
10
|
-
module Input
|
11
|
-
# Document source handling.
|
12
|
-
module Source
|
13
|
-
# Mime types accepted by the server.
|
14
|
-
ALLOWED_MIME_TYPES = [
|
15
|
-
'application/pdf',
|
16
|
-
'image/heic',
|
17
|
-
'image/png',
|
18
|
-
'image/jpeg',
|
19
|
-
'image/tiff',
|
20
|
-
'image/webp',
|
21
|
-
].freeze
|
22
|
-
|
23
|
-
# Standard error for invalid mime types
|
24
|
-
class MimeTypeError < StandardError
|
25
|
-
end
|
26
|
-
|
27
|
-
# Error sent if the file's mimetype isn't allowed
|
28
|
-
class InvalidMimeTypeError < MimeTypeError
|
29
|
-
# @return [String]
|
30
|
-
attr_reader :invalid_mimetype
|
31
|
-
|
32
|
-
# @param mime_type [String]
|
33
|
-
def initialize(mime_type)
|
34
|
-
@invalid_mimetype = mime_type
|
35
|
-
super("'#{@invalid_mimetype}' mime type not allowed, must be one of #{ALLOWED_MIME_TYPES.join(', ')}")
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
# Error sent if a pdf file couldn't be fixed
|
40
|
-
class UnfixablePDFError < MimeTypeError
|
41
|
-
def initialize
|
42
|
-
super("Corrupted PDF couldn't be repaired.")
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
# Base class for loading documents.
|
47
|
-
class LocalInputSource
|
48
|
-
# @return [String]
|
49
|
-
attr_reader :filename
|
50
|
-
# @return [String]
|
51
|
-
attr_reader :file_mimetype
|
52
|
-
# @return [StringIO]
|
53
|
-
attr_reader :io_stream
|
54
|
-
|
55
|
-
# @param io_stream [StringIO]
|
56
|
-
# @param filename [String]
|
57
|
-
# @param fix_pdf [Boolean]
|
58
|
-
def initialize(io_stream, filename, fix_pdf: false)
|
59
|
-
@io_stream = io_stream
|
60
|
-
@filename = filename
|
61
|
-
@file_mimetype = if fix_pdf
|
62
|
-
Marcel::MimeType.for @io_stream
|
63
|
-
else
|
64
|
-
Marcel::MimeType.for @io_stream, name: @filename
|
65
|
-
end
|
66
|
-
return if ALLOWED_MIME_TYPES.include? @file_mimetype
|
67
|
-
|
68
|
-
if filename.end_with?('.pdf') && fix_pdf
|
69
|
-
rescue_broken_pdf(@io_stream)
|
70
|
-
@file_mimetype = Marcel::MimeType.for @io_stream
|
71
|
-
|
72
|
-
return if ALLOWED_MIME_TYPES.include? @file_mimetype
|
73
|
-
end
|
74
|
-
|
75
|
-
raise InvalidMimeTypeError, @file_mimetype.to_s
|
76
|
-
end
|
77
|
-
|
78
|
-
# Attempts to fix pdf files if mimetype is rejected.
|
79
|
-
# "Broken PDFs" are often a result of third-party injecting invalid headers.
|
80
|
-
# This attempts to remove them and send the file
|
81
|
-
# @param stream [StringIO]
|
82
|
-
def rescue_broken_pdf(stream)
|
83
|
-
stream.gets('%PDF-')
|
84
|
-
raise UnfixablePDFError if stream.eof? || stream.pos > 500
|
85
|
-
|
86
|
-
stream.pos = stream.pos - 5
|
87
|
-
data = stream.read
|
88
|
-
@io_stream.close
|
89
|
-
|
90
|
-
@io_stream = StringIO.new
|
91
|
-
@io_stream << data
|
92
|
-
end
|
93
|
-
|
94
|
-
# Shorthand for pdf mimetype validation.
|
95
|
-
def pdf?
|
96
|
-
@file_mimetype.to_s == 'application/pdf'
|
97
|
-
end
|
98
|
-
|
99
|
-
# Parses a PDF file according to provided options.
|
100
|
-
# @param options [Hash, nil] Page cutting/merge options:
|
101
|
-
#
|
102
|
-
# * `:page_indexes` Zero-based list of page indexes.
|
103
|
-
# * `:operation` Operation to apply on the document, given the `page_indexes specified:
|
104
|
-
# * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
|
105
|
-
# * `:REMOVE` - remove the specified pages, and keep all others.
|
106
|
-
# * `:on_min_pages` Apply the operation only if document has at least this many pages.
|
107
|
-
def process_pdf(options)
|
108
|
-
@io_stream.seek(0)
|
109
|
-
@io_stream = PdfProcessor.parse(@io_stream, options)
|
110
|
-
end
|
111
|
-
|
112
|
-
# Reads a document.
|
113
|
-
# @param close [Boolean]
|
114
|
-
# @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
|
115
|
-
def read_document(close: true)
|
116
|
-
@io_stream.seek(0)
|
117
|
-
# Avoids needlessly re-packing some files
|
118
|
-
data = @io_stream.read
|
119
|
-
@io_stream.close if close
|
120
|
-
['document', data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
|
121
|
-
end
|
122
|
-
|
123
|
-
def count_pdf_pages
|
124
|
-
return 1 unless pdf?
|
125
|
-
|
126
|
-
@io_stream.seek(0)
|
127
|
-
pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
|
128
|
-
pdf_processor.pages.size
|
129
|
-
end
|
130
|
-
|
131
|
-
# Compresses the file, according to the provided info.
|
132
|
-
# @param [Integer] quality Quality of the output file.
|
133
|
-
# @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
|
134
|
-
# @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
|
135
|
-
# @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
|
136
|
-
# This will attempt to re-render PDF text over the rasterized original. If disabled, ignored the operation.
|
137
|
-
# WARNING: this operation is strongly discouraged.
|
138
|
-
# @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
|
139
|
-
# not. Needs force_source_text to work.
|
140
|
-
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
|
141
|
-
buffer = if pdf?
|
142
|
-
Mindee::PDF::PDFCompressor.compress_pdf(
|
143
|
-
@io_stream,
|
144
|
-
quality: quality,
|
145
|
-
force_source_text_compression: force_source_text,
|
146
|
-
disable_source_text: disable_source_text
|
147
|
-
)
|
148
|
-
else
|
149
|
-
Mindee::Image::ImageCompressor.compress_image(
|
150
|
-
@io_stream,
|
151
|
-
quality: quality,
|
152
|
-
max_width: max_width,
|
153
|
-
max_height: max_height
|
154
|
-
)
|
155
|
-
end
|
156
|
-
@io_stream = buffer
|
157
|
-
@io_stream.rewind
|
158
|
-
end
|
159
|
-
|
160
|
-
# Checks whether the file has source text if it is a pdf. False otherwise
|
161
|
-
# @return [Boolean] True if the file is a PDF and has source text.
|
162
|
-
def source_text?
|
163
|
-
Mindee::PDF::PDFTools.source_text?(@io_stream)
|
164
|
-
end
|
165
|
-
end
|
166
|
-
|
167
|
-
# Load a document from a path.
|
168
|
-
class PathInputSource < LocalInputSource
|
169
|
-
# @param filepath [String]
|
170
|
-
# @param fix_pdf [Boolean]
|
171
|
-
def initialize(filepath, fix_pdf: false)
|
172
|
-
io_stream = File.open(filepath, 'rb')
|
173
|
-
super(io_stream, File.basename(filepath), fix_pdf: fix_pdf)
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
# Load a document from a base64 string.
|
178
|
-
class Base64InputSource < LocalInputSource
|
179
|
-
# @param base64_string [String]
|
180
|
-
# @param filename [String]
|
181
|
-
# @param fix_pdf [Boolean]
|
182
|
-
def initialize(base64_string, filename, fix_pdf: false)
|
183
|
-
io_stream = StringIO.new(base64_string.unpack1('m*'))
|
184
|
-
io_stream.set_encoding Encoding::BINARY
|
185
|
-
super(io_stream, filename, fix_pdf: fix_pdf)
|
186
|
-
end
|
187
|
-
|
188
|
-
# Overload of the same function to prevent a base64 from being re-encoded.
|
189
|
-
# @param close [Boolean]
|
190
|
-
# @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
|
191
|
-
def read_document(close: true)
|
192
|
-
@io_stream.seek(0)
|
193
|
-
data = @io_stream.read
|
194
|
-
@io_stream.close if close
|
195
|
-
['document', [data].pack('m'), { filename: Source.convert_to_unicode_escape(@filename) }]
|
196
|
-
end
|
197
|
-
end
|
198
|
-
|
199
|
-
# Load a document from raw bytes.
|
200
|
-
class BytesInputSource < LocalInputSource
|
201
|
-
# @param raw_bytes [String]
|
202
|
-
# @param filename [String]
|
203
|
-
# @param fix_pdf [Boolean]
|
204
|
-
def initialize(raw_bytes, filename, fix_pdf: false)
|
205
|
-
io_stream = StringIO.new(raw_bytes)
|
206
|
-
io_stream.set_encoding Encoding::BINARY
|
207
|
-
super(io_stream, filename, fix_pdf: fix_pdf)
|
208
|
-
end
|
209
|
-
end
|
210
|
-
|
211
|
-
# Load a document from a file handle.
|
212
|
-
class FileInputSource < LocalInputSource
|
213
|
-
# @param input_file [File]
|
214
|
-
# @param filename [String]
|
215
|
-
# @param fix_pdf [Boolean]
|
216
|
-
def initialize(input_file, filename, fix_pdf: false)
|
217
|
-
io_stream = input_file
|
218
|
-
super(io_stream, filename, fix_pdf: fix_pdf)
|
219
|
-
end
|
220
|
-
end
|
221
|
-
|
222
|
-
# Load a remote document from a file url.
|
223
|
-
class UrlInputSource
|
224
|
-
# @return [String]
|
225
|
-
attr_reader :url
|
226
|
-
|
227
|
-
def initialize(url)
|
228
|
-
raise 'URL must be HTTPS' unless url.start_with? 'https://'
|
229
|
-
|
230
|
-
@url = url
|
231
|
-
end
|
232
|
-
end
|
233
|
-
|
234
|
-
# Replaces non-ASCII characters by their unicode escape sequence.
|
235
|
-
# Keeps other characters as is.
|
236
|
-
# @return A clean String.
|
237
|
-
def self.convert_to_unicode_escape(string)
|
238
|
-
unicode_escape_string = ''.dup
|
239
|
-
string.each_char do |char|
|
240
|
-
unicode_escape_string << if char.bytesize > 1
|
241
|
-
"\\u#{char.unpack1('U').to_s(16).rjust(4, '0')}"
|
242
|
-
else
|
243
|
-
char
|
244
|
-
end
|
245
|
-
end
|
246
|
-
unicode_escape_string
|
247
|
-
end
|
248
|
-
end
|
249
|
-
end
|
250
|
-
end
|
3
|
+
require_relative 'sources/local_input_source'
|
4
|
+
require_relative 'sources/bytes_input_source'
|
5
|
+
require_relative 'sources/base64_input_source'
|
6
|
+
require_relative 'sources/file_input_source'
|
7
|
+
require_relative 'sources/path_input_source'
|
8
|
+
require_relative 'sources/url_input_source'
|
@@ -32,7 +32,7 @@ module Mindee
|
|
32
32
|
attr_reader :id
|
33
33
|
# @return [Mindee::Parsing::Standard::DateField]
|
34
34
|
attr_reader :issued_at
|
35
|
-
# @return [
|
35
|
+
# @return [Time, nil]
|
36
36
|
attr_reader :available_at
|
37
37
|
# @return [JobStatus, Symbol]
|
38
38
|
attr_reader :status
|
@@ -121,6 +121,27 @@ module Mindee
|
|
121
121
|
@job = Mindee::Parsing::Common::Job.new(http_response['job']) if http_response.key?('job')
|
122
122
|
end
|
123
123
|
end
|
124
|
+
|
125
|
+
# Represents the server response after a document is sent to a workflow.
class WorkflowResponse
  # Execution built from the 'execution' entry of the response.
  # @return [Mindee::Parsing::Common::Execution]
  attr_reader :execution
  # Request information built from the 'api_request' entry of the response.
  # @return [Mindee::Parsing::Common::ApiRequest]
  attr_reader :api_request
  # String form of the raw HTTP payload given to the constructor.
  # @return [String]
  attr_reader :raw_http

  # @param product_class [Mindee::Inference, nil] Class handed to the execution;
  #   Product::Generated::GeneratedV1 is used when none is given.
  # @param http_response [Hash] Parsed response, read for its 'api_request' and 'execution' entries.
  # @param raw_http [#to_s] Raw payload, kept in its string form.
  def initialize(product_class, http_response, raw_http)
    @raw_http = raw_http.to_s
    @api_request = Mindee::Parsing::Common::ApiRequest.new(http_response['api_request'])
    product_class ||= Product::Generated::GeneratedV1
    @execution = Mindee::Parsing::Common::Execution.new(product_class, http_response['execution'])
  end
end
|
124
145
|
end
|
125
146
|
end
|
126
147
|
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mindee
|
4
|
+
module Parsing
|
5
|
+
module Common
|
6
|
+
# Representation of a workflow execution, built from the 'execution' hash of a server response.
class Execution
  # Identifier for the batch to which the execution belongs.
  # @return [String]
  attr_reader :batch_name
  # The time at which the execution started.
  # @return [Time, nil]
  attr_reader :created_at
  # File representation within a workflow execution.
  # @return [ExecutionFile]
  attr_reader :file
  # Identifier for the execution.
  # @return [String]
  attr_reader :id
  # Deserialized inference object.
  # @return [Mindee::Inference]
  attr_reader :inference
  # Priority of the execution.
  # @return [ExecutionPriority]
  attr_reader :priority
  # The time at which the file was tagged as reviewed.
  # @return [Time, nil]
  attr_reader :reviewed_at
  # Time parsed from the 'available_at' entry of the response.
  # NOTE(review): the previous description duplicated the one of `uploaded_at`; confirm the meaning.
  # @return [Time, nil]
  attr_reader :available_at
  # Reviewed fields and values.
  # @return [Mindee::Product::Generated::GeneratedV1Document]
  attr_reader :reviewed_prediction
  # Execution Status.
  # @return [String]
  attr_reader :status
  # Execution type.
  # @return [String]
  attr_reader :type
  # The time at which the file was uploaded to a workflow.
  # @return [Time, nil]
  attr_reader :uploaded_at
  # Identifier for the workflow.
  # @return [String]
  attr_reader :workflow_id

  # @param product_class [Mindee::Inference] Class instantiated with the 'inference' entry, when present.
  # @param http_response [Hash] The 'execution' part of the server response.
  def initialize(product_class, http_response)
    @batch_name = http_response['batch_name']
    @created_at = parse_time(http_response['created_at'])
    @file = ExecutionFile.new(http_response['file']) if http_response['file']
    @id = http_response['id']
    @inference = product_class.new(http_response['inference']) if http_response['inference']
    @priority = Mindee::Parsing::Common::ExecutionPriority.to_priority(http_response['priority'])
    @reviewed_at = parse_time(http_response['reviewed_at'])
    @available_at = parse_time(http_response['available_at'])
    if http_response['reviewed_prediction']
      # Fully-qualified: the bare class name cannot be resolved from this namespace.
      @reviewed_prediction = Mindee::Product::Generated::GeneratedV1Document.new(
        http_response['reviewed_prediction']
      )
    end
    @status = http_response['status']
    @type = http_response['type']
    @uploaded_at = parse_time(http_response['uploaded_at'])
    @workflow_id = http_response['workflow_id']
  end

  private

  # Parses an ISO 8601 timestamp.
  # @param value [String, nil]
  # @return [Time, nil] nil when no value is given.
  def parse_time(value)
    Time.iso8601(value) if value
  end
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|