google-cloud-gke_recommender-v1 0.a → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +12 -0
- data/AUTHENTICATION.md +122 -0
- data/README.md +153 -8
- data/lib/google/cloud/gke_recommender/v1/gke_inference_quickstart/client.rb +1028 -0
- data/lib/google/cloud/gke_recommender/v1/gke_inference_quickstart/credentials.rb +47 -0
- data/lib/google/cloud/gke_recommender/v1/gke_inference_quickstart/rest/client.rb +1008 -0
- data/lib/google/cloud/gke_recommender/v1/gke_inference_quickstart/rest/service_stub.rb +438 -0
- data/lib/google/cloud/gke_recommender/v1/gke_inference_quickstart/rest.rb +54 -0
- data/lib/google/cloud/gke_recommender/v1/gke_inference_quickstart.rb +57 -0
- data/lib/google/cloud/gke_recommender/v1/rest.rb +37 -0
- data/lib/google/cloud/gke_recommender/v1/version.rb +7 -2
- data/lib/google/cloud/gke_recommender/v1.rb +45 -0
- data/lib/google/cloud/gkerecommender/v1/gkerecommender_pb.rb +69 -0
- data/lib/google/cloud/gkerecommender/v1/gkerecommender_services_pb.rb +83 -0
- data/lib/google-cloud-gke_recommender-v1.rb +21 -0
- data/proto_docs/README.md +4 -0
- data/proto_docs/google/api/client.rb +473 -0
- data/proto_docs/google/api/field_behavior.rb +85 -0
- data/proto_docs/google/api/launch_stage.rb +71 -0
- data/proto_docs/google/api/resource.rb +227 -0
- data/proto_docs/google/cloud/gkerecommender/v1/gkerecommender.rb +558 -0
- data/proto_docs/google/protobuf/duration.rb +98 -0
- metadata +55 -9
data/proto_docs/google/cloud/gkerecommender/v1/gkerecommender.rb
@@ -0,0 +1,558 @@
# frozen_string_literal: true

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Auto-generated by gapic-generator-ruby. DO NOT EDIT!


module Google
  module Cloud
    module GkeRecommender
      module V1
        # Request message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}.
        # @!attribute [rw] page_size
        #   @return [::Integer]
        #     Optional. The target number of results to return in a single response.
        #     If not specified, a default value will be chosen by the service.
        #     Note that the response may include a partial list and a caller should
        #     only rely on the response's
        #     {::Google::Cloud::GkeRecommender::V1::FetchModelsResponse#next_page_token next_page_token}
        #     to determine if there are more instances left to be queried.
        # @!attribute [rw] page_token
        #   @return [::String]
        #     Optional. The value of
        #     {::Google::Cloud::GkeRecommender::V1::FetchModelsResponse#next_page_token next_page_token}
        #     received from a previous `FetchModelsRequest` call.
        #     Provide this to retrieve the subsequent page in a multi-page list of
        #     results. When paginating, all other parameters provided to
        #     `FetchModelsRequest` must match the call that provided the page token.
        class FetchModelsRequest
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Response message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}.
        # @!attribute [r] models
        #   @return [::Array<::String>]
        #     Output only. List of available models. Open-source models follow the
        #     Huggingface Hub `owner/model_name` format.
        # @!attribute [r] next_page_token
        #   @return [::String]
        #     Output only. A token which may be sent as
        #     [page_token][FetchModelsResponse.page_token] in a subsequent
        #     `FetchModelsResponse` call to retrieve the next page of results.
        #     If this field is omitted or empty, then there are no more results to
        #     return.
        class FetchModelsResponse
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Request message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}.
        # @!attribute [rw] model
        #   @return [::String]
        #     Required. The model for which to list model servers. Open-source models
        #     follow the Huggingface Hub `owner/model_name` format. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}
        #     to find available models.
        # @!attribute [rw] page_size
        #   @return [::Integer]
        #     Optional. The target number of results to return in a single response.
        #     If not specified, a default value will be chosen by the service.
        #     Note that the response may include a partial list and a caller should
        #     only rely on the response's
        #     {::Google::Cloud::GkeRecommender::V1::FetchModelServersResponse#next_page_token next_page_token}
        #     to determine if there are more instances left to be queried.
        # @!attribute [rw] page_token
        #   @return [::String]
        #     Optional. The value of
        #     {::Google::Cloud::GkeRecommender::V1::FetchModelServersResponse#next_page_token next_page_token}
        #     received from a previous `FetchModelServersRequest` call.
        #     Provide this to retrieve the subsequent page in a multi-page list of
        #     results. When paginating, all other parameters provided to
        #     `FetchModelServersRequest` must match the call that provided the page
        #     token.
        class FetchModelServersRequest
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Response message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}.
        # @!attribute [r] model_servers
        #   @return [::Array<::String>]
        #     Output only. List of available model servers. Open-source model servers use
        #     simplified, lowercase names (e.g., `vllm`).
        # @!attribute [r] next_page_token
        #   @return [::String]
        #     Output only. A token which may be sent as
        #     [page_token][FetchModelServersResponse.page_token] in a subsequent
        #     `FetchModelServersResponse` call to retrieve the next page of results.
        #     If this field is omitted or empty, then there are no more results to
        #     return.
        class FetchModelServersResponse
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Request message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_server_versions GkeInferenceQuickstart.FetchModelServerVersions}.
        # @!attribute [rw] model
        #   @return [::String]
        #     Required. The model for which to list model server versions. Open-source
        #     models follow the Huggingface Hub `owner/model_name` format. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}
        #     to find available models.
        # @!attribute [rw] model_server
        #   @return [::String]
        #     Required. The model server for which to list versions. Open-source model
        #     servers use simplified, lowercase names (e.g., `vllm`). Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}
        #     to find available model servers.
        # @!attribute [rw] page_size
        #   @return [::Integer]
        #     Optional. The target number of results to return in a single response.
        #     If not specified, a default value will be chosen by the service.
        #     Note that the response may include a partial list and a caller should
        #     only rely on the response's
        #     {::Google::Cloud::GkeRecommender::V1::FetchModelServerVersionsResponse#next_page_token next_page_token}
        #     to determine if there are more instances left to be queried.
        # @!attribute [rw] page_token
        #   @return [::String]
        #     Optional. The value of
        #     {::Google::Cloud::GkeRecommender::V1::FetchModelServerVersionsResponse#next_page_token next_page_token}
        #     received from a previous `FetchModelServerVersionsRequest` call.
        #     Provide this to retrieve the subsequent page in a multi-page list of
        #     results. When paginating, all other parameters provided to
        #     `FetchModelServerVersionsRequest` must match the call that provided the
        #     page token.
        class FetchModelServerVersionsRequest
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Response message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_server_versions GkeInferenceQuickstart.FetchModelServerVersions}.
        # @!attribute [r] model_server_versions
        #   @return [::Array<::String>]
        #     Output only. A list of available model server versions.
        # @!attribute [r] next_page_token
        #   @return [::String]
        #     Output only. A token which may be sent as
        #     [page_token][FetchModelServerVersionsResponse.page_token] in a subsequent
        #     `FetchModelServerVersionsResponse` call to retrieve the next page of
        #     results. If this field is omitted or empty, then there are no more results
        #     to return.
        class FetchModelServerVersionsResponse
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Request message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_benchmarking_data GkeInferenceQuickstart.FetchBenchmarkingData}.
        # @!attribute [rw] model_server_info
        #   @return [::Google::Cloud::GkeRecommender::V1::ModelServerInfo]
        #     Required. The model server configuration to get benchmarking data for. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
        #     to find valid configurations.
        # @!attribute [rw] instance_type
        #   @return [::String]
        #     Optional. The instance type to filter benchmarking data. Instance types are
        #     in the format `a2-highgpu-1g`. If not provided, all instance types for the
        #     given profile's `model_server_info` will be returned. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
        #     to find available instance types.
        # @!attribute [rw] pricing_model
        #   @return [::String]
        #     Optional. The pricing model to use for the benchmarking data. Defaults to
        #     `spot`.
        class FetchBenchmarkingDataRequest
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Response message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_benchmarking_data GkeInferenceQuickstart.FetchBenchmarkingData}.
        # @!attribute [r] profile
        #   @return [::Array<::Google::Cloud::GkeRecommender::V1::Profile>]
        #     Output only. List of profiles containing their respective benchmarking
        #     data.
        class FetchBenchmarkingDataResponse
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Request message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}.
        # @!attribute [rw] model
        #   @return [::String]
        #     Optional. The model to filter profiles by. Open-source models follow the
        #     Huggingface Hub `owner/model_name` format. If not provided, all models are
        #     returned. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}
        #     to find available models.
        # @!attribute [rw] model_server
        #   @return [::String]
        #     Optional. The model server to filter profiles by. If not provided, all
        #     model servers are returned. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}
        #     to find available model servers for a given model.
        # @!attribute [rw] model_server_version
        #   @return [::String]
        #     Optional. The model server version to filter profiles by. If not provided,
        #     all model server versions are returned. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_server_versions GkeInferenceQuickstart.FetchModelServerVersions}
        #     to find available versions for a given model and server.
        # @!attribute [rw] performance_requirements
        #   @return [::Google::Cloud::GkeRecommender::V1::PerformanceRequirements]
        #     Optional. The performance requirements to filter profiles. Profiles that do
        #     not meet these requirements are filtered out. If not provided, all profiles
        #     are returned.
        # @!attribute [rw] page_size
        #   @return [::Integer]
        #     Optional. The target number of results to return in a single response. If
        #     not specified, a default value will be chosen by the service. Note that the
        #     response may include a partial list and a caller should only rely on the
        #     response's
        #     {::Google::Cloud::GkeRecommender::V1::FetchProfilesResponse#next_page_token next_page_token}
        #     to determine if there are more instances left to be queried.
        # @!attribute [rw] page_token
        #   @return [::String]
        #     Optional. The value of
        #     {::Google::Cloud::GkeRecommender::V1::FetchProfilesResponse#next_page_token next_page_token}
        #     received from a previous `FetchProfilesRequest` call.
        #     Provide this to retrieve the subsequent page in a multi-page list of
        #     results. When paginating, all other parameters provided to
        #     `FetchProfilesRequest` must match the call that provided the page
        #     token.
        class FetchProfilesRequest
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Performance requirements for a profile and or model deployment.
        # @!attribute [rw] target_ntpot_milliseconds
        #   @return [::Integer]
        #     Optional. The target Normalized Time Per Output Token (NTPOT) in
        #     milliseconds. NTPOT is calculated as `request_latency /
        #     total_output_tokens`. If not provided, this target will not be enforced.
        # @!attribute [rw] target_ttft_milliseconds
        #   @return [::Integer]
        #     Optional. The target Time To First Token (TTFT) in milliseconds. TTFT is
        #     the time it takes to generate the first token for a request. If not
        #     provided, this target will not be enforced.
        # @!attribute [rw] target_cost
        #   @return [::Google::Cloud::GkeRecommender::V1::Cost]
        #     Optional. The target cost for running a profile's model server. If not
        #     provided, this requirement will not be enforced.
        class PerformanceRequirements
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Represents an amount of money in a specific currency.
        # @!attribute [r] units
        #   @return [::Integer]
        #     Output only. The whole units of the amount.
        #     For example if `currencyCode` is `"USD"`, then 1 unit is one US dollar.
        # @!attribute [r] nanos
        #   @return [::Integer]
        #     Output only. Number of nano (10^-9) units of the amount.
        #     The value must be between -999,999,999 and +999,999,999 inclusive.
        #     If `units` is positive, `nanos` must be positive or zero.
        #     If `units` is zero, `nanos` can be positive, zero, or negative.
        #     If `units` is negative, `nanos` must be negative or zero.
        #     For example $-1.75 is represented as `units`=-1 and `nanos`=-750,000,000.
        class Amount
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Cost for running a model deployment on a given instance type. Currently, only
        # USD currency code is supported.
        # @!attribute [rw] cost_per_million_output_tokens
        #   @return [::Google::Cloud::GkeRecommender::V1::Amount]
        #     Optional. The cost per million output tokens, calculated as:
        #     $/output token = GPU $/s / (1/output-to-input-cost-ratio * input tokens/s +
        #     output tokens/s)
        # @!attribute [rw] cost_per_million_input_tokens
        #   @return [::Google::Cloud::GkeRecommender::V1::Amount]
        #     Optional. The cost per million input tokens. $/input token = ($/output
        #     token) / output-to-input-cost-ratio.
        # @!attribute [rw] pricing_model
        #   @return [::String]
        #     Optional. The pricing model used to calculate the cost. Can be one of:
        #     `3-years-cud`, `1-year-cud`, `on-demand`, `spot`. If not provided, `spot`
        #     will be used.
        # @!attribute [rw] output_input_cost_ratio
        #   @return [::Float]
        #     Optional. The output-to-input cost ratio. This determines how the total GPU
        #     cost is split between input and output tokens. If not provided, `4.0` is
        #     used, assuming a 4:1 output:input cost ratio.
        class Cost
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Represents a range of throughput values in tokens per second.
        # @!attribute [r] min
        #   @return [::Integer]
        #     Output only. The minimum value of the range.
        # @!attribute [r] max
        #   @return [::Integer]
        #     Output only. The maximum value of the range.
        class TokensPerSecondRange
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Represents a range of latency values in milliseconds.
        # @!attribute [r] min
        #   @return [::Integer]
        #     Output only. The minimum value of the range.
        # @!attribute [r] max
        #   @return [::Integer]
        #     Output only. The maximum value of the range.
        class MillisecondRange
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Performance range for a model deployment.
        # @!attribute [r] throughput_output_range
        #   @return [::Google::Cloud::GkeRecommender::V1::TokensPerSecondRange]
        #     Output only. The range of throughput in output tokens per second. This is
        #     measured as total_output_tokens_generated_by_server /
        #     elapsed_time_in_seconds.
        # @!attribute [r] ttft_range
        #   @return [::Google::Cloud::GkeRecommender::V1::MillisecondRange]
        #     Output only. The range of TTFT (Time To First Token) in milliseconds. TTFT
        #     is the time it takes to generate the first token for a request.
        # @!attribute [r] ntpot_range
        #   @return [::Google::Cloud::GkeRecommender::V1::MillisecondRange]
        #     Output only. The range of NTPOT (Normalized Time Per Output Token) in
        #     milliseconds. NTPOT is the request latency normalized by the number of
        #     output tokens, measured as request_latency / total_output_tokens.
        class PerformanceRange
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Response message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}.
        # @!attribute [r] profile
        #   @return [::Array<::Google::Cloud::GkeRecommender::V1::Profile>]
        #     Output only. List of profiles that match the given model server info and
        #     performance requirements (if provided).
        # @!attribute [r] performance_range
        #   @return [::Google::Cloud::GkeRecommender::V1::PerformanceRange]
        #     Output only. The combined range of performance values observed across all
        #     profiles in this response.
        # @!attribute [r] comments
        #   @return [::String]
        #     Output only. Additional comments related to the response.
        # @!attribute [r] next_page_token
        #   @return [::String]
        #     Output only. A token which may be sent as
        #     [page_token][FetchProfilesResponse.page_token] in a subsequent
        #     `FetchProfilesResponse` call to retrieve the next page of results. If this
        #     field is omitted or empty, then there are no more results to return.
        class FetchProfilesResponse
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Model server information gives. Valid model server info combinations can
        # be found using
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}.
        # @!attribute [rw] model
        #   @return [::String]
        #     Required. The model. Open-source models follow the Huggingface Hub
        #     `owner/model_name` format. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}
        #     to find available models.
        # @!attribute [rw] model_server
        #   @return [::String]
        #     Required. The model server. Open-source model servers use simplified,
        #     lowercase names (e.g., `vllm`). Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}
        #     to find available servers.
        # @!attribute [rw] model_server_version
        #   @return [::String]
        #     Optional. The model server version. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_server_versions GkeInferenceQuickstart.FetchModelServerVersions}
        #     to find available versions. If not provided, the latest available version
        #     is used.
        class ModelServerInfo
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Resources used by a model deployment.
        # @!attribute [r] accelerator_count
        #   @return [::Integer]
        #     Output only. The number of accelerators (e.g., GPUs or TPUs) used by the
        #     model deployment on the Kubernetes node.
        class ResourcesUsed
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Performance statistics for a model deployment.
        # @!attribute [r] queries_per_second
        #   @return [::Float]
        #     Output only. The number of queries per second.
        #     Note: This metric can vary widely based on context length and may not be a
        #     reliable measure of LLM throughput.
        # @!attribute [r] output_tokens_per_second
        #   @return [::Integer]
        #     Output only. The number of output tokens per second. This is the throughput
        #     measured as total_output_tokens_generated_by_server /
        #     elapsed_time_in_seconds.
        # @!attribute [r] ntpot_milliseconds
        #   @return [::Integer]
        #     Output only. The Normalized Time Per Output Token (NTPOT) in milliseconds.
        #     This is the request latency normalized by the number of output tokens,
        #     measured as request_latency / total_output_tokens.
        # @!attribute [r] ttft_milliseconds
        #   @return [::Integer]
        #     Output only. The Time To First Token (TTFT) in milliseconds. This is the
        #     time it takes to generate the first token for a request.
        # @!attribute [r] cost
        #   @return [::Array<::Google::Cloud::GkeRecommender::V1::Cost>]
        #     Output only. The cost of running the model deployment.
        class PerformanceStats
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # A profile containing information about a model deployment.
        # @!attribute [r] model_server_info
        #   @return [::Google::Cloud::GkeRecommender::V1::ModelServerInfo]
        #     Output only. The model server configuration. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
        #     to find valid configurations.
        # @!attribute [r] accelerator_type
        #   @return [::String]
        #     Output only. The accelerator type. Expected format: `nvidia-h100-80gb`.
        # @!attribute [r] tpu_topology
        #   @return [::String]
        #     Output only. The TPU topology (if applicable).
        # @!attribute [r] instance_type
        #   @return [::String]
        #     Output only. The instance type. Expected format: `a2-highgpu-1g`.
        # @!attribute [r] resources_used
        #   @return [::Google::Cloud::GkeRecommender::V1::ResourcesUsed]
        #     Output only. The resources used by the model deployment.
        # @!attribute [r] performance_stats
        #   @return [::Array<::Google::Cloud::GkeRecommender::V1::PerformanceStats>]
        #     Output only. The performance statistics for this profile.
        class Profile
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Request message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#generate_optimized_manifest GkeInferenceQuickstart.GenerateOptimizedManifest}.
        # @!attribute [rw] model_server_info
        #   @return [::Google::Cloud::GkeRecommender::V1::ModelServerInfo]
        #     Required. The model server configuration to generate the manifest for. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
        #     to find valid configurations.
        # @!attribute [rw] accelerator_type
        #   @return [::String]
        #     Required. The accelerator type. Use
        #     {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
        #     to find valid accelerators for a given `model_server_info`.
        # @!attribute [rw] kubernetes_namespace
        #   @return [::String]
        #     Optional. The kubernetes namespace to deploy the manifests in.
        # @!attribute [rw] performance_requirements
        #   @return [::Google::Cloud::GkeRecommender::V1::PerformanceRequirements]
        #     Optional. The performance requirements to use for generating Horizontal Pod
        #     Autoscaler (HPA) resources. If provided, the manifest includes HPA
        #     resources to adjust the model server replica count to maintain the
        #     specified targets (e.g., NTPOT, TTFT) at a P50 latency. Cost targets are
        #     not currently supported for HPA generation. If the specified targets are
        #     not achievable, the HPA manifest will not be generated.
        # @!attribute [rw] storage_config
        #   @return [::Google::Cloud::GkeRecommender::V1::StorageConfig]
        #     Optional. The storage configuration for the model. If not provided, the
        #     model is loaded from Huggingface.
        class GenerateOptimizedManifestRequest
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # A Kubernetes manifest.
        # @!attribute [r] kind
        #   @return [::String]
        #     Output only. Kubernetes resource kind.
        # @!attribute [r] api_version
        #   @return [::String]
        #     Output only. Kubernetes API version.
        # @!attribute [r] content
        #   @return [::String]
        #     Output only. YAML content.
        class KubernetesManifest
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Response message for
        # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#generate_optimized_manifest GkeInferenceQuickstart.GenerateOptimizedManifest}.
        # @!attribute [r] kubernetes_manifests
        #   @return [::Array<::Google::Cloud::GkeRecommender::V1::KubernetesManifest>]
        #     Output only. A list of generated Kubernetes manifests.
        # @!attribute [r] comments
        #   @return [::Array<::String>]
        #     Output only. Comments related to deploying the generated manifests.
        # @!attribute [r] manifest_version
        #   @return [::String]
        #     Output only. Additional information about the versioned dependencies used
        #     to generate the manifests. See [Run best practice inference with GKE
        #     Inference Quickstart
        #     recipes](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/inference-quickstart)
        #     for details.
        class GenerateOptimizedManifestResponse
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end

        # Storage configuration for a model deployment.
        # @!attribute [rw] model_bucket_uri
        #   @return [::String]
        #     Optional. The Google Cloud Storage bucket URI to load the model from. This
        #     URI must point to the directory containing the model's config file
        #     (`config.json`) and model weights. A tuned GCSFuse setup can improve
        #     LLM Pod startup time by more than 7x. Expected format:
        #     `gs://<bucket-name>/<path-to-model>`.
        # @!attribute [rw] xla_cache_bucket_uri
        #   @return [::String]
        #     Optional. The URI for the GCS bucket containing the XLA compilation cache.
        #     If using TPUs, the XLA cache will be written to the same path as
        #     `model_bucket_uri`. This can speed up vLLM model preparation for repeated
        #     deployments.
        class StorageConfig
          include ::Google::Protobuf::MessageExts
          extend ::Google::Protobuf::MessageExts::ClassMethods
        end
      end
    end
  end
end
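The file above only documents the request and response messages; the calls themselves (`fetch_models`, `fetch_model_servers`, `fetch_model_server_versions`, `fetch_benchmarking_data`, `fetch_profiles`, `generate_optimized_manifest`) live in the generated `GkeInferenceQuickstart::Client` added under `data/lib/`. A minimal usage sketch is below, assuming the usual GAPIC calling convention of keyword arguments with hashes for message-typed fields; the model name and the performance target are placeholders, not values taken from this package.

```ruby
require "google/cloud/gke_recommender/v1"

# Build a client for the GKE Inference Quickstart service (gRPC transport by default;
# a REST variant is also shipped under gke_inference_quickstart/rest/).
client = Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client.new

# Discover available models, then profiles that meet a latency target.
# "meta-llama/Llama-3.1-8B" is a placeholder Huggingface-style model name.
models = client.fetch_models page_size: 10

profiles = client.fetch_profiles(
  model:        "meta-llama/Llama-3.1-8B",
  model_server: "vllm",
  performance_requirements: { target_ntpot_milliseconds: 200 }
)

# Generate optimized Kubernetes manifests for a chosen configuration.
manifests = client.generate_optimized_manifest(
  model_server_info: { model: "meta-llama/Llama-3.1-8B", model_server: "vllm" },
  accelerator_type:  "nvidia-h100-80gb"
)
```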
data/proto_docs/google/protobuf/duration.rb
@@ -0,0 +1,98 @@
# frozen_string_literal: true

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Auto-generated by gapic-generator-ruby. DO NOT EDIT!


module Google
  module Protobuf
    # A Duration represents a signed, fixed-length span of time represented
    # as a count of seconds and fractions of seconds at nanosecond
    # resolution. It is independent of any calendar and concepts like "day"
    # or "month". It is related to Timestamp in that the difference between
    # two Timestamp values is a Duration and it can be added or subtracted
    # from a Timestamp. Range is approximately +-10,000 years.
    #
    # # Examples
    #
    # Example 1: Compute Duration from two Timestamps in pseudo code.
    #
    #     Timestamp start = ...;
    #     Timestamp end = ...;
    #     Duration duration = ...;
    #
    #     duration.seconds = end.seconds - start.seconds;
    #     duration.nanos = end.nanos - start.nanos;
    #
    #     if (duration.seconds < 0 && duration.nanos > 0) {
    #       duration.seconds += 1;
    #       duration.nanos -= 1000000000;
    #     } else if (duration.seconds > 0 && duration.nanos < 0) {
    #       duration.seconds -= 1;
    #       duration.nanos += 1000000000;
    #     }
    #
    # Example 2: Compute Timestamp from Timestamp + Duration in pseudo code.
    #
    #     Timestamp start = ...;
    #     Duration duration = ...;
    #     Timestamp end = ...;
    #
    #     end.seconds = start.seconds + duration.seconds;
    #     end.nanos = start.nanos + duration.nanos;
    #
    #     if (end.nanos < 0) {
    #       end.seconds -= 1;
    #       end.nanos += 1000000000;
    #     } else if (end.nanos >= 1000000000) {
    #       end.seconds += 1;
    #       end.nanos -= 1000000000;
    #     }
    #
    # Example 3: Compute Duration from datetime.timedelta in Python.
    #
    #     td = datetime.timedelta(days=3, minutes=10)
    #     duration = Duration()
    #     duration.FromTimedelta(td)
    #
    # # JSON Mapping
    #
    # In JSON format, the Duration type is encoded as a string rather than an
    # object, where the string ends in the suffix "s" (indicating seconds) and
    # is preceded by the number of seconds, with nanoseconds expressed as
    # fractional seconds. For example, 3 seconds with 0 nanoseconds should be
    # encoded in JSON format as "3s", while 3 seconds and 1 nanosecond should
    # be expressed in JSON format as "3.000000001s", and 3 seconds and 1
    # microsecond should be expressed in JSON format as "3.000001s".
    # @!attribute [rw] seconds
    #   @return [::Integer]
    #     Signed seconds of the span of time. Must be from -315,576,000,000
    #     to +315,576,000,000 inclusive. Note: these bounds are computed from:
    #     60 sec/min * 60 min/hr * 24 hr/day * 365.25 days/year * 10000 years
    # @!attribute [rw] nanos
    #   @return [::Integer]
    #     Signed fractions of a second at nanosecond resolution of the span
    #     of time. Durations less than one second are represented with a 0
    #     `seconds` field and a positive or negative `nanos` field. For durations
    #     of one second or more, a non-zero value for the `nanos` field must be
    #     of the same sign as the `seconds` field. Must be from -999,999,999
    #     to +999,999,999 inclusive.
    class Duration
      include ::Google::Protobuf::MessageExts
      extend ::Google::Protobuf::MessageExts::ClassMethods
    end
  end
end
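For reference, the `seconds`/`nanos` pair documented above maps directly onto the well-known Duration type in the google-protobuf runtime that this gem already depends on. A small sketch, assuming only that runtime:

```ruby
require "google/protobuf/duration_pb"

# 3 seconds and 1 nanosecond; per the documentation above, the JSON mapping
# of this value is the string "3.000000001s".
duration = Google::Protobuf::Duration.new seconds: 3, nanos: 1
```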