mindee 1.2.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/.rubocop.yml +2 -2
- data/.yardopts +4 -0
- data/CHANGELOG.md +26 -0
- data/README.md +46 -23
- data/Rakefile +6 -1
- data/bin/mindee.rb +78 -61
- data/docs/ruby-api-builder.md +124 -0
- data/docs/ruby-getting-started.md +265 -0
- data/docs/ruby-invoice-ocr.md +260 -0
- data/docs/ruby-passport-ocr.md +156 -0
- data/docs/ruby-receipt-ocr.md +170 -0
- data/lib/mindee/client.rb +132 -93
- data/lib/mindee/document_config.rb +29 -169
- data/lib/mindee/geometry.rb +105 -8
- data/lib/mindee/http/endpoint.rb +80 -0
- data/lib/mindee/input/pdf_processing.rb +106 -0
- data/lib/mindee/input/sources.rb +97 -0
- data/lib/mindee/input.rb +3 -0
- data/lib/mindee/parsing/document.rb +31 -0
- data/lib/mindee/parsing/error.rb +22 -0
- data/lib/mindee/parsing/inference.rb +53 -0
- data/lib/mindee/parsing/page.rb +46 -0
- data/lib/mindee/parsing/prediction/base.rb +30 -0
- data/lib/mindee/{fields → parsing/prediction/common_fields}/amount.rb +5 -1
- data/lib/mindee/{fields → parsing/prediction/common_fields}/base.rb +16 -5
- data/lib/mindee/{fields → parsing/prediction/common_fields}/company_registration.rb +0 -0
- data/lib/mindee/{fields/datefield.rb → parsing/prediction/common_fields/date.rb} +0 -0
- data/lib/mindee/{fields → parsing/prediction/common_fields}/locale.rb +0 -0
- data/lib/mindee/{fields → parsing/prediction/common_fields}/payment_details.rb +0 -0
- data/lib/mindee/parsing/prediction/common_fields/position.rb +39 -0
- data/lib/mindee/{fields → parsing/prediction/common_fields}/tax.rb +7 -2
- data/lib/mindee/parsing/prediction/common_fields/text.rb +12 -0
- data/lib/mindee/parsing/prediction/common_fields.rb +11 -0
- data/lib/mindee/parsing/prediction/custom/custom_v1.rb +58 -0
- data/lib/mindee/{fields/custom_docs.rb → parsing/prediction/custom/fields.rb} +5 -5
- data/lib/mindee/parsing/prediction/eu/license_plate/license_plate_v1.rb +34 -0
- data/lib/mindee/parsing/prediction/financial_document/financial_document_v1.rb +245 -0
- data/lib/mindee/parsing/prediction/financial_document/invoice_line_item.rb +58 -0
- data/lib/mindee/parsing/prediction/fr/bank_account_details/bank_account_details_v1.rb +40 -0
- data/lib/mindee/parsing/prediction/fr/carte_vitale/carte_vitale_v1.rb +49 -0
- data/lib/mindee/parsing/prediction/fr/id_card/id_card_v1.rb +84 -0
- data/lib/mindee/parsing/prediction/invoice/invoice_line_item.rb +58 -0
- data/lib/mindee/parsing/prediction/invoice/invoice_v4.rb +216 -0
- data/lib/mindee/parsing/prediction/passport/passport_v1.rb +184 -0
- data/lib/mindee/parsing/prediction/proof_of_address/proof_of_address_v1.rb +82 -0
- data/lib/mindee/parsing/prediction/receipt/receipt_v4.rb +87 -0
- data/lib/mindee/parsing/prediction/shipping_container/shipping_container_v1.rb +38 -0
- data/lib/mindee/parsing/prediction/us/bank_check/bank_check_v1.rb +70 -0
- data/lib/mindee/parsing/prediction.rb +14 -0
- data/lib/mindee/parsing.rb +4 -0
- data/lib/mindee/version.rb +1 -1
- data/mindee.gemspec +2 -1
- metadata +60 -24
- data/lib/mindee/documents/base.rb +0 -35
- data/lib/mindee/documents/custom.rb +0 -65
- data/lib/mindee/documents/financial_doc.rb +0 -135
- data/lib/mindee/documents/invoice.rb +0 -162
- data/lib/mindee/documents/passport.rb +0 -163
- data/lib/mindee/documents/receipt.rb +0 -109
- data/lib/mindee/documents.rb +0 -7
- data/lib/mindee/endpoint.rb +0 -105
- data/lib/mindee/fields/orientation.rb +0 -26
- data/lib/mindee/fields.rb +0 -11
- data/lib/mindee/inputs.rb +0 -153
- data/lib/mindee/response.rb +0 -27
|
@@ -2,199 +2,59 @@
|
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
4
|
|
|
5
|
-
require_relative 'endpoint'
|
|
6
|
-
require_relative '
|
|
7
|
-
require_relative '
|
|
5
|
+
require_relative 'http/endpoint'
|
|
6
|
+
require_relative 'parsing/document'
|
|
7
|
+
require_relative 'parsing/error'
|
|
8
|
+
require_relative 'parsing/prediction'
|
|
8
9
|
|
|
9
10
|
module Mindee
|
|
10
11
|
# Specific client for sending a document to the API.
|
|
11
12
|
class DocumentConfig
|
|
12
13
|
# Array of possible Mindee::Endpoint to be used.
|
|
13
|
-
# @return [
|
|
14
|
-
attr_reader :
|
|
14
|
+
# @return [Mindee::HTTP::Endpoint]
|
|
15
|
+
attr_reader :endpoint
|
|
15
16
|
|
|
16
|
-
# @param
|
|
17
|
-
# @param
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@doc_class = doc_class
|
|
22
|
-
@document_type = document_type
|
|
23
|
-
@endpoints = endpoints
|
|
24
|
-
@raise_on_error = raise_on_error
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
# Parse a prediction API result.
|
|
28
|
-
# @param input_doc [Mindee::InputDocument]
|
|
29
|
-
# @param response [Hash]
|
|
30
|
-
# @return [Mindee::DocumentResponse]
|
|
31
|
-
def build_predict_result(input_doc, response)
|
|
32
|
-
document = @doc_class.new(
|
|
33
|
-
response['document']['inference']['prediction'],
|
|
34
|
-
input_file: input_doc,
|
|
35
|
-
page_id: nil
|
|
36
|
-
)
|
|
37
|
-
pages = []
|
|
38
|
-
response['document']['inference']['pages'].each do |page|
|
|
39
|
-
pages.push(
|
|
40
|
-
@doc_class.new(
|
|
41
|
-
page['prediction'],
|
|
42
|
-
input_file: input_doc,
|
|
43
|
-
page_id: page['id']
|
|
44
|
-
)
|
|
45
|
-
)
|
|
46
|
-
end
|
|
47
|
-
DocumentResponse.new(response, @document_type, document, pages)
|
|
17
|
+
# @param prediction_class [Class<Mindee::Prediction::Prediction>]
|
|
18
|
+
# @param endpoint [Mindee::HTTP::Endpoint]
|
|
19
|
+
def initialize(prediction_class, endpoint)
|
|
20
|
+
@prediction_class = prediction_class
|
|
21
|
+
@endpoint = endpoint
|
|
48
22
|
end
|
|
49
23
|
|
|
50
24
|
# Call the prediction API.
|
|
51
25
|
# @param input_doc [Mindee::InputDocument]
|
|
52
26
|
# @param include_words [Boolean]
|
|
53
27
|
# @param close_file [Boolean]
|
|
28
|
+
# @param cropper [Boolean]
|
|
54
29
|
# @return [Mindee::DocumentResponse]
|
|
55
|
-
def predict(input_doc, include_words, close_file)
|
|
56
|
-
|
|
57
|
-
response = predict_request(input_doc, include_words, close_file)
|
|
58
|
-
|
|
30
|
+
def predict(input_doc, include_words, close_file, cropper)
|
|
31
|
+
check_api_key
|
|
32
|
+
response = predict_request(input_doc, include_words, close_file, cropper)
|
|
33
|
+
hashed_response = JSON.parse(response.body, object_class: Hash)
|
|
34
|
+
return Document.new(@prediction_class, hashed_response['document']) if (200..299).include?(response.code.to_i)
|
|
35
|
+
|
|
36
|
+
error = Parsing::Error.new(hashed_response['api_request']['error'])
|
|
37
|
+
raise error
|
|
59
38
|
end
|
|
60
39
|
|
|
61
40
|
private
|
|
62
41
|
|
|
63
|
-
# @param input_doc [Mindee::InputDocument]
|
|
64
|
-
# @param response [Net::HTTPResponse]
|
|
65
|
-
# @return [Mindee::DocumentResponse]
|
|
66
|
-
def parse_response(input_doc, response)
|
|
67
|
-
hashed_response = JSON.parse(response.body, object_class: Hash)
|
|
68
|
-
unless (200..299).include?(response.code.to_i)
|
|
69
|
-
if @raise_on_error
|
|
70
|
-
raise Net::HTTPError.new(
|
|
71
|
-
"API #{response.code} HTTP error: #{hashed_response}", response
|
|
72
|
-
)
|
|
73
|
-
end
|
|
74
|
-
return DocumentResponse.new(
|
|
75
|
-
hashed_response, @document_type, {}, []
|
|
76
|
-
)
|
|
77
|
-
end
|
|
78
|
-
build_predict_result(input_doc, hashed_response)
|
|
79
|
-
end
|
|
80
|
-
|
|
81
42
|
# @param input_doc [Mindee::InputDocument]
|
|
82
43
|
# @param include_words [Boolean]
|
|
83
44
|
# @param close_file [Boolean]
|
|
45
|
+
# # @param cropper [Boolean]
|
|
84
46
|
# @return [Net::HTTPResponse]
|
|
85
|
-
def predict_request(input_doc, include_words, close_file)
|
|
86
|
-
@
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
def check_api_keys
|
|
90
|
-
@endpoints.each do |endpoint|
|
|
91
|
-
next unless endpoint.api_key.nil? || endpoint.api_key.empty?
|
|
92
|
-
|
|
93
|
-
raise "Missing API key for '#{@document_type}', " \
|
|
94
|
-
"check your Client Configuration.\n" \
|
|
95
|
-
'You can set this using the ' \
|
|
96
|
-
"'#{endpoint.envvar_key_name}' environment variable."
|
|
97
|
-
end
|
|
98
|
-
end
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
# Client for Invoice documents
|
|
102
|
-
class InvoiceConfig < DocumentConfig
|
|
103
|
-
def initialize(api_key, raise_on_error)
|
|
104
|
-
endpoints = [InvoiceEndpoint.new(api_key)]
|
|
105
|
-
super(
|
|
106
|
-
Invoice,
|
|
107
|
-
'invoice',
|
|
108
|
-
endpoints,
|
|
109
|
-
raise_on_error
|
|
110
|
-
)
|
|
47
|
+
def predict_request(input_doc, include_words, close_file, cropper)
|
|
48
|
+
@endpoint.predict_req_post(input_doc, include_words: include_words, close_file: close_file, cropper: cropper)
|
|
111
49
|
end
|
|
112
|
-
end
|
|
113
50
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
def initialize(api_key, raise_on_error)
|
|
117
|
-
endpoints = [ReceiptEndpoint.new(api_key)]
|
|
118
|
-
super(
|
|
119
|
-
Receipt,
|
|
120
|
-
'receipt',
|
|
121
|
-
endpoints,
|
|
122
|
-
raise_on_error
|
|
123
|
-
)
|
|
124
|
-
end
|
|
125
|
-
end
|
|
51
|
+
def check_api_key
|
|
52
|
+
return unless @endpoint.api_key.nil? || @endpoint.api_key.empty?
|
|
126
53
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
super(
|
|
132
|
-
Passport,
|
|
133
|
-
'passport',
|
|
134
|
-
endpoints,
|
|
135
|
-
raise_on_error
|
|
136
|
-
)
|
|
137
|
-
end
|
|
138
|
-
end
|
|
139
|
-
|
|
140
|
-
# Client for Financial documents
|
|
141
|
-
class FinancialDocConfig < DocumentConfig
|
|
142
|
-
def initialize(api_key, raise_on_error)
|
|
143
|
-
endpoints = [
|
|
144
|
-
InvoiceEndpoint.new(api_key),
|
|
145
|
-
ReceiptEndpoint.new(api_key),
|
|
146
|
-
]
|
|
147
|
-
super(
|
|
148
|
-
FinancialDocument,
|
|
149
|
-
'financial_doc',
|
|
150
|
-
endpoints,
|
|
151
|
-
raise_on_error
|
|
152
|
-
)
|
|
153
|
-
end
|
|
154
|
-
|
|
155
|
-
private
|
|
156
|
-
|
|
157
|
-
def predict_request(input_doc, include_words, close_file)
|
|
158
|
-
endpoint = input_doc.pdf? ? @endpoints[0] : @endpoints[1]
|
|
159
|
-
endpoint.predict_request(input_doc, include_words: include_words, close_file: close_file)
|
|
160
|
-
end
|
|
161
|
-
end
|
|
162
|
-
|
|
163
|
-
# Client for Custom (constructed) documents
|
|
164
|
-
class CustomDocConfig < DocumentConfig
|
|
165
|
-
def initialize(document_type, account_name, version, api_key, raise_on_error)
|
|
166
|
-
endpoints = [CustomEndpoint.new(document_type, account_name, version, api_key)]
|
|
167
|
-
super(
|
|
168
|
-
CustomDocument,
|
|
169
|
-
document_type,
|
|
170
|
-
endpoints,
|
|
171
|
-
raise_on_error
|
|
172
|
-
)
|
|
173
|
-
end
|
|
174
|
-
|
|
175
|
-
# Parse a prediction API result.
|
|
176
|
-
# @param input_doc [Mindee::InputDocument]
|
|
177
|
-
# @param response [Hash]
|
|
178
|
-
# @return [Mindee::DocumentResponse]
|
|
179
|
-
def build_predict_result(input_doc, response)
|
|
180
|
-
document = CustomDocument.new(
|
|
181
|
-
@document_type,
|
|
182
|
-
response['document']['inference']['prediction'],
|
|
183
|
-
input_file: input_doc,
|
|
184
|
-
page_id: nil
|
|
185
|
-
)
|
|
186
|
-
pages = []
|
|
187
|
-
response['document']['inference']['pages'].each do |page|
|
|
188
|
-
pages.push(
|
|
189
|
-
CustomDocument.new(
|
|
190
|
-
@document_type,
|
|
191
|
-
page['prediction'],
|
|
192
|
-
input_file: input_doc,
|
|
193
|
-
page_id: page['id']
|
|
194
|
-
)
|
|
195
|
-
)
|
|
196
|
-
end
|
|
197
|
-
DocumentResponse.new(response, @document_type, document, pages)
|
|
54
|
+
raise "Missing API key for '#{@document_type}', " \
|
|
55
|
+
"check your Client Configuration.\n" \
|
|
56
|
+
'You can set this using the ' \
|
|
57
|
+
"'#{HTTP::API_KEY_ENV_NAME}' environment variable."
|
|
198
58
|
end
|
|
199
59
|
end
|
|
200
60
|
end
|
data/lib/mindee/geometry.rb
CHANGED
|
@@ -3,19 +3,116 @@
|
|
|
3
3
|
module Mindee
|
|
4
4
|
# Various helper functions for geometry.
|
|
5
5
|
module Geometry
|
|
6
|
+
# A relative set of coordinates (X, Y) on the document.
|
|
7
|
+
class Point
|
|
8
|
+
# @return [Float]
|
|
9
|
+
attr_accessor :x
|
|
10
|
+
# @return [Float]
|
|
11
|
+
attr_accessor :y
|
|
12
|
+
|
|
13
|
+
# @param x [Float]
|
|
14
|
+
# @param y [Float]
|
|
15
|
+
# rubocop:disable Naming/MethodParameterName
|
|
16
|
+
def initialize(x, y)
|
|
17
|
+
@x = x
|
|
18
|
+
@y = y
|
|
19
|
+
end
|
|
20
|
+
# rubocop:enable Naming/MethodParameterName
|
|
21
|
+
|
|
22
|
+
# @return [Float]
|
|
23
|
+
def [](key)
|
|
24
|
+
case key
|
|
25
|
+
when 0
|
|
26
|
+
@x
|
|
27
|
+
when 1
|
|
28
|
+
@y
|
|
29
|
+
else
|
|
30
|
+
throw '0 or 1 only'
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Contains exactly 4 relative vertices coordinates (Points).
|
|
36
|
+
class Quadrilateral
|
|
37
|
+
# @return [Mindee::Geometry::Point]
|
|
38
|
+
attr_accessor :top_left
|
|
39
|
+
# @return [Mindee::Geometry::Point]
|
|
40
|
+
attr_accessor :top_right
|
|
41
|
+
# @return [Mindee::Geometry::Point]
|
|
42
|
+
attr_accessor :bottom_right
|
|
43
|
+
# @return [Mindee::Geometry::Point]
|
|
44
|
+
attr_accessor :bottom_left
|
|
45
|
+
|
|
46
|
+
# @param top_left [Mindee::Geometry::Point]
|
|
47
|
+
# @param top_right [Mindee::Geometry::Point]
|
|
48
|
+
# @param bottom_right [Mindee::Geometry::Point]
|
|
49
|
+
# @param bottom_left [Mindee::Geometry::Point]
|
|
50
|
+
def initialize(top_left, top_right, bottom_right, bottom_left)
|
|
51
|
+
@top_left = top_left
|
|
52
|
+
@top_right = top_right
|
|
53
|
+
@bottom_right = bottom_right
|
|
54
|
+
@bottom_left = bottom_left
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# @return [Mindee::Geometry::Point]
|
|
58
|
+
def [](key)
|
|
59
|
+
case key
|
|
60
|
+
when 0
|
|
61
|
+
@top_left
|
|
62
|
+
when 1
|
|
63
|
+
@top_right
|
|
64
|
+
when 2
|
|
65
|
+
@bottom_right
|
|
66
|
+
when 3
|
|
67
|
+
@bottom_left
|
|
68
|
+
else
|
|
69
|
+
throw '0, 1, 2, 3 only'
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
class Polygon < Array
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# Transform a prediction into a Quadrilateral.
|
|
78
|
+
def self.quadrilateral_from_prediction(prediction)
|
|
79
|
+
throw "Prediction must have exactly 4 points, found #{prediction.size}" if prediction.size != 4
|
|
80
|
+
|
|
81
|
+
Quadrilateral.new(
|
|
82
|
+
Point.new(prediction[0][0], prediction[0][1]),
|
|
83
|
+
Point.new(prediction[1][0], prediction[1][1]),
|
|
84
|
+
Point.new(prediction[2][0], prediction[2][1]),
|
|
85
|
+
Point.new(prediction[3][0], prediction[3][1])
|
|
86
|
+
)
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
# Transform a prediction into a Polygon.
|
|
90
|
+
def self.polygon_from_prediction(prediction)
|
|
91
|
+
polygon = Polygon.new
|
|
92
|
+
return polygon if prediction.nil?
|
|
93
|
+
|
|
94
|
+
prediction.each do |point|
|
|
95
|
+
polygon << Point.new(point[0], point[1])
|
|
96
|
+
end
|
|
97
|
+
polygon
|
|
98
|
+
end
|
|
99
|
+
|
|
6
100
|
# @return [Array<Float>]
|
|
7
101
|
def self.get_bbox(vertices)
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
y_max = vertices.map { |v| v[1] }.max
|
|
12
|
-
[x_min, y_min, x_max, y_max]
|
|
102
|
+
x_coords = vertices.map(&:x)
|
|
103
|
+
y_coords = vertices.map(&:y)
|
|
104
|
+
[x_coords.min, y_coords.min, x_coords.max, y_coords.max]
|
|
13
105
|
end
|
|
14
106
|
|
|
15
|
-
# @return [
|
|
16
|
-
def self.
|
|
107
|
+
# @return [Mindee::Geometry::Quadrilateral]
|
|
108
|
+
def self.get_bounding_box(vertices)
|
|
17
109
|
x_min, y_min, x_max, y_max = get_bbox(vertices)
|
|
18
|
-
|
|
110
|
+
Quadrilateral.new(
|
|
111
|
+
Point.new(x_min, y_min),
|
|
112
|
+
Point.new(x_max, y_min),
|
|
113
|
+
Point.new(x_max, y_max),
|
|
114
|
+
Point.new(x_min, y_max)
|
|
115
|
+
)
|
|
19
116
|
end
|
|
20
117
|
end
|
|
21
118
|
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'net/http'
|
|
4
|
+
require_relative '../version'
|
|
5
|
+
|
|
6
|
+
module Mindee
|
|
7
|
+
module HTTP
|
|
8
|
+
API_KEY_ENV_NAME = 'MINDEE_API_KEY'
|
|
9
|
+
API_KEY_DEFAULT = nil
|
|
10
|
+
|
|
11
|
+
BASE_URL_ENV_NAME = 'MINDEE_BASE_URL'
|
|
12
|
+
BASE_URL_DEFAULT = 'https://api.mindee.net/v1'
|
|
13
|
+
|
|
14
|
+
REQUEST_TIMEOUT_ENV_NAME = 'MINDEE_REQUEST_TIMEOUT'
|
|
15
|
+
TIMEOUT_DEFAULT = 120
|
|
16
|
+
|
|
17
|
+
USER_AGENT = "mindee-api-ruby@v#{Mindee::VERSION} ruby-v#{RUBY_VERSION} #{Mindee::PLATFORM}"
|
|
18
|
+
|
|
19
|
+
# Generic API endpoint for a product.
|
|
20
|
+
class Endpoint
|
|
21
|
+
# @return [String]
|
|
22
|
+
attr_reader :api_key
|
|
23
|
+
# @return [Integer]
|
|
24
|
+
attr_reader :request_timeout
|
|
25
|
+
|
|
26
|
+
def initialize(owner, url_name, version, api_key: '')
|
|
27
|
+
@owner = owner
|
|
28
|
+
@url_name = url_name
|
|
29
|
+
@version = version
|
|
30
|
+
@request_timeout = ENV.fetch(REQUEST_TIMEOUT_ENV_NAME, TIMEOUT_DEFAULT).to_i
|
|
31
|
+
@api_key = api_key.nil? || api_key.empty? ? ENV.fetch(API_KEY_ENV_NAME, API_KEY_DEFAULT) : api_key
|
|
32
|
+
@url_root = "#{BASE_URL_DEFAULT}/products/#{@owner}/#{@url_name}/v#{@version}"
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# @param input_doc [Mindee::InputDocument]
|
|
36
|
+
# @param include_words [Boolean]
|
|
37
|
+
# @param close_file [Boolean]
|
|
38
|
+
# @param cropper [Boolean]
|
|
39
|
+
# @return [Net::HTTPResponse]
|
|
40
|
+
def predict_req_post(input_doc, include_words: false, close_file: true, cropper: false)
|
|
41
|
+
uri = URI("#{@url_root}/predict")
|
|
42
|
+
|
|
43
|
+
params = {}
|
|
44
|
+
params[:cropper] = 'true' if cropper
|
|
45
|
+
uri.query = URI.encode_www_form(params)
|
|
46
|
+
|
|
47
|
+
headers = {
|
|
48
|
+
'Authorization' => "Token #{@api_key}",
|
|
49
|
+
'User-Agent' => USER_AGENT,
|
|
50
|
+
}
|
|
51
|
+
req = Net::HTTP::Post.new(uri, headers)
|
|
52
|
+
|
|
53
|
+
form_data = {
|
|
54
|
+
'document' => input_doc.read_document(close: close_file),
|
|
55
|
+
}
|
|
56
|
+
form_data.push ['include_mvision', 'true'] if include_words
|
|
57
|
+
|
|
58
|
+
req.set_form(form_data, 'multipart/form-data')
|
|
59
|
+
|
|
60
|
+
Net::HTTP.start(uri.hostname, uri.port, use_ssl: true, read_timeout: @request_timeout) do |http|
|
|
61
|
+
http.request(req)
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# Receipt API endpoint
|
|
67
|
+
class StandardEndpoint < Endpoint
|
|
68
|
+
def initialize(endpoint_name, version, api_key)
|
|
69
|
+
super('mindee', endpoint_name, version, api_key: api_key)
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# Custom (constructed) API endpoint
|
|
74
|
+
class CustomEndpoint < Endpoint
|
|
75
|
+
def initialize(account_name, endpoint_name, version, api_key)
|
|
76
|
+
super(account_name, endpoint_name, version, api_key: api_key)
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
require 'origami'
|
|
5
|
+
|
|
6
|
+
# Monkey-patching for Origami
|
|
7
|
+
module PDFTools
|
|
8
|
+
def to_io_stream(params = {})
|
|
9
|
+
options = {
|
|
10
|
+
delinearize: true,
|
|
11
|
+
recompile: true,
|
|
12
|
+
decrypt: false,
|
|
13
|
+
}
|
|
14
|
+
options.update(params)
|
|
15
|
+
|
|
16
|
+
if frozen? # incompatible flags with frozen doc (signed)
|
|
17
|
+
options[:recompile] = nil
|
|
18
|
+
options[:rebuild_xrefs] = nil
|
|
19
|
+
options[:noindent] = nil
|
|
20
|
+
options[:obfuscate] = false
|
|
21
|
+
end
|
|
22
|
+
load_all_objects unless @loaded
|
|
23
|
+
|
|
24
|
+
intents_as_pdfa1 if options[:intent] =~ %r{pdf[/-]?A1?/i}
|
|
25
|
+
delinearize! if options[:delinearize] && linearized?
|
|
26
|
+
compile(options) if options[:recompile]
|
|
27
|
+
|
|
28
|
+
io_stream = StringIO.new(output(options))
|
|
29
|
+
io_stream.set_encoding Encoding::BINARY
|
|
30
|
+
io_stream
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
Origami::PDF.class_eval { include PDFTools }
|
|
35
|
+
|
|
36
|
+
module Mindee
|
|
37
|
+
module Input
|
|
38
|
+
# Class for PDF documents
|
|
39
|
+
module PdfProcessor
|
|
40
|
+
DEFAULT_OPTIONS = {
|
|
41
|
+
page_indexes: [0],
|
|
42
|
+
operation: :KEEP_ONLY,
|
|
43
|
+
on_min_pages: 0,
|
|
44
|
+
}.freeze
|
|
45
|
+
|
|
46
|
+
# @param io_stream [StreamIO]
|
|
47
|
+
# @param options [Hash]
|
|
48
|
+
def self.parse(io_stream, options)
|
|
49
|
+
options = DEFAULT_OPTIONS.merge(options)
|
|
50
|
+
|
|
51
|
+
current_pdf = open_pdf(io_stream)
|
|
52
|
+
pages_count = current_pdf.pages.size
|
|
53
|
+
return if options[:on_min_pages] > pages_count
|
|
54
|
+
|
|
55
|
+
all_pages = (0..pages_count - 1).to_a
|
|
56
|
+
|
|
57
|
+
case options[:operation]
|
|
58
|
+
when :KEEP_ONLY
|
|
59
|
+
pages_to_remove = indexes_from_keep(options[:page_indexes], all_pages)
|
|
60
|
+
when :REMOVE
|
|
61
|
+
pages_to_remove = indexes_from_remove(options[:page_indexes], all_pages)
|
|
62
|
+
else
|
|
63
|
+
raise "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{behavior}'"
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
current_pdf.delete_pages_at(pages_to_remove) if pages_to_remove.to_a != all_pages.to_a
|
|
67
|
+
current_pdf.to_io_stream
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# @param page_indexes [Array]
|
|
71
|
+
# @param all_pages [Array]
|
|
72
|
+
def self.indexes_from_keep(page_indexes, all_pages)
|
|
73
|
+
pages_to_keep = Set.new
|
|
74
|
+
page_indexes.each do |idx|
|
|
75
|
+
idx = (all_pages.length - (idx + 2)) if idx.negative?
|
|
76
|
+
page = all_pages[idx]
|
|
77
|
+
next if page.nil?
|
|
78
|
+
|
|
79
|
+
pages_to_keep << page
|
|
80
|
+
end
|
|
81
|
+
all_pages.to_set - pages_to_keep
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# @param page_indexes [Array]
|
|
85
|
+
# @param all_pages [Array]
|
|
86
|
+
def self.indexes_from_remove(page_indexes, all_pages)
|
|
87
|
+
pages_to_remove = Set.new
|
|
88
|
+
page_indexes.each do |idx|
|
|
89
|
+
idx = (all_pages.length - (idx + 2)) if idx.negative?
|
|
90
|
+
page = all_pages[idx]
|
|
91
|
+
next if page.nil?
|
|
92
|
+
|
|
93
|
+
pages_to_remove << page
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
# @param io_stream [StringIO]
|
|
98
|
+
# @return [Origami::PDF]
|
|
99
|
+
def self.open_pdf(io_stream)
|
|
100
|
+
pdf_parser = Origami::PDF::LinearParser.new({ verbosity: Origami::Parser::VERBOSE_QUIET })
|
|
101
|
+
io_stream.seek(0)
|
|
102
|
+
pdf_parser.parse(io_stream)
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'stringio'
|
|
4
|
+
require 'marcel'
|
|
5
|
+
|
|
6
|
+
require_relative 'pdf_processing'
|
|
7
|
+
|
|
8
|
+
module Mindee
|
|
9
|
+
module Input
|
|
10
|
+
ALLOWED_MIME_TYPES = [
|
|
11
|
+
'application/pdf',
|
|
12
|
+
'image/heic',
|
|
13
|
+
'image/png',
|
|
14
|
+
'image/jpeg',
|
|
15
|
+
'image/tiff',
|
|
16
|
+
'image/webp',
|
|
17
|
+
].freeze
|
|
18
|
+
|
|
19
|
+
# Base class for loading documents.
|
|
20
|
+
class InputDocument
|
|
21
|
+
# @return [String]
|
|
22
|
+
attr_reader :filename
|
|
23
|
+
# @return [String]
|
|
24
|
+
attr_reader :file_mimetype
|
|
25
|
+
# @return [StreamIO]
|
|
26
|
+
attr_reader :io_stream
|
|
27
|
+
|
|
28
|
+
# @param io_stream [StreamIO]
|
|
29
|
+
def initialize(io_stream, filename)
|
|
30
|
+
@io_stream = io_stream
|
|
31
|
+
@filename = filename
|
|
32
|
+
@file_mimetype = Marcel::MimeType.for @io_stream, name: @filename
|
|
33
|
+
|
|
34
|
+
return if ALLOWED_MIME_TYPES.include? @file_mimetype
|
|
35
|
+
|
|
36
|
+
raise "File type not allowed, must be one of #{ALLOWED_MIME_TYPES.join(', ')}"
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def pdf?
|
|
40
|
+
@file_mimetype == 'application/pdf'
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def process_pdf(options)
|
|
44
|
+
@io_stream.seek(0)
|
|
45
|
+
@io_stream = PdfProcessor.parse(@io_stream, options)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# @param close [Boolean]
|
|
49
|
+
def read_document(close: true)
|
|
50
|
+
@io_stream.seek(0)
|
|
51
|
+
data = @io_stream.read
|
|
52
|
+
@io_stream.close if close
|
|
53
|
+
[data].pack('m')
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Load a document from a path.
|
|
58
|
+
class PathDocument < InputDocument
|
|
59
|
+
# @param filepath [String]
|
|
60
|
+
def initialize(filepath)
|
|
61
|
+
io_stream = File.open(filepath, 'rb')
|
|
62
|
+
super(io_stream, File.basename(filepath))
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
# Load a document from a base64 string.
|
|
67
|
+
class Base64Document < InputDocument
|
|
68
|
+
# @param base64_string [String]
|
|
69
|
+
# @param filename [String]
|
|
70
|
+
def initialize(base64_string, filename)
|
|
71
|
+
io_stream = StringIO.new(base64_string.unpack1('m*'))
|
|
72
|
+
io_stream.set_encoding Encoding::BINARY
|
|
73
|
+
super(io_stream, filename)
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# Load a document from raw bytes.
|
|
78
|
+
class BytesDocument < InputDocument
|
|
79
|
+
# @param raw_bytes [String]
|
|
80
|
+
# @param filename [String]
|
|
81
|
+
def initialize(raw_bytes, filename)
|
|
82
|
+
io_stream = StringIO.new(raw_bytes)
|
|
83
|
+
io_stream.set_encoding Encoding::BINARY
|
|
84
|
+
super(io_stream, filename)
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Load a document from a file handle.
|
|
89
|
+
class FileDocument < InputDocument
|
|
90
|
+
# @param filename [String]
|
|
91
|
+
def initialize(file_handle, filename)
|
|
92
|
+
io_stream = file_handle
|
|
93
|
+
super(io_stream, filename)
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
data/lib/mindee/input.rb
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'inference'
|
|
4
|
+
|
|
5
|
+
module Mindee
|
|
6
|
+
# Stores all response attributes.
|
|
7
|
+
class Document
|
|
8
|
+
# @return [Mindee::Inference]
|
|
9
|
+
attr_reader :inference
|
|
10
|
+
# @return [String] Filename sent to the API
|
|
11
|
+
attr_reader :name
|
|
12
|
+
# @return [String] Mindee ID of the document
|
|
13
|
+
attr_reader :id
|
|
14
|
+
|
|
15
|
+
# @param prediction_class [Class<Mindee::Prediction::Prediction>]
|
|
16
|
+
# @param http_response [Hash]
|
|
17
|
+
def initialize(prediction_class, http_response)
|
|
18
|
+
@id = http_response['id']
|
|
19
|
+
@name = http_response['name']
|
|
20
|
+
@inference = Mindee::Inference.new(prediction_class, http_response['inference'])
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def to_s
|
|
24
|
+
out_str = String.new
|
|
25
|
+
out_str << "########\nDocument\n########"
|
|
26
|
+
out_str << "\n:Mindee ID: #{@id}"
|
|
27
|
+
out_str << "\n:Filename: #{@name}"
|
|
28
|
+
out_str << "\n\n#{@inference}"
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Mindee
|
|
4
|
+
module Parsing
|
|
5
|
+
# API Error
|
|
6
|
+
class Error < StandardError
|
|
7
|
+
# @return [String]
|
|
8
|
+
attr_reader :api_code
|
|
9
|
+
# @return [String]
|
|
10
|
+
attr_reader :api_details
|
|
11
|
+
# @return [String]
|
|
12
|
+
attr_reader :api_message
|
|
13
|
+
|
|
14
|
+
def initialize(error)
|
|
15
|
+
@api_code = error['code']
|
|
16
|
+
@api_details = error['details']
|
|
17
|
+
@api_message = error['message']
|
|
18
|
+
super("#{@api_code}: #{@api_details} - #{@api_message}")
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|