google-cloud-gke_recommender-v1 0.a → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,558 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Copyright 2025 Google LLC
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # https://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # Auto-generated by gapic-generator-ruby. DO NOT EDIT!
18
+
19
+
20
+ module Google
21
+ module Cloud
22
+ module GkeRecommender
23
+ module V1
24
+ # Request message for
25
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}.
26
+ # @!attribute [rw] page_size
27
+ # @return [::Integer]
28
+ # Optional. The target number of results to return in a single response.
29
+ # If not specified, a default value will be chosen by the service.
30
+ # Note that the response may include a partial list and a caller should
31
+ # only rely on the response's
32
+ # {::Google::Cloud::GkeRecommender::V1::FetchModelsResponse#next_page_token next_page_token}
33
+ # to determine if there are more instances left to be queried.
34
+ # @!attribute [rw] page_token
35
+ # @return [::String]
36
+ # Optional. The value of
37
+ # {::Google::Cloud::GkeRecommender::V1::FetchModelsResponse#next_page_token next_page_token}
38
+ # received from a previous `FetchModelsRequest` call.
39
+ # Provide this to retrieve the subsequent page in a multi-page list of
40
+ # results. When paginating, all other parameters provided to
41
+ # `FetchModelsRequest` must match the call that provided the page token.
42
+ class FetchModelsRequest
43
+ include ::Google::Protobuf::MessageExts
44
+ extend ::Google::Protobuf::MessageExts::ClassMethods
45
+ end
46
+
47
+ # Response message for
48
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}.
49
+ # @!attribute [r] models
50
+ # @return [::Array<::String>]
51
+ # Output only. List of available models. Open-source models follow the
52
+ # Huggingface Hub `owner/model_name` format.
53
+ # @!attribute [r] next_page_token
54
+ # @return [::String]
55
+ # Output only. A token which may be sent as
56
+ # [page_token][FetchModelsRequest.page_token] in a subsequent
57
+ # `FetchModelsRequest` call to retrieve the next page of results.
58
+ # If this field is omitted or empty, then there are no more results to
59
+ # return.
60
+ class FetchModelsResponse
61
+ include ::Google::Protobuf::MessageExts
62
+ extend ::Google::Protobuf::MessageExts::ClassMethods
63
+ end
64
+
65
+ # Request message for
66
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}.
67
+ # @!attribute [rw] model
68
+ # @return [::String]
69
+ # Required. The model for which to list model servers. Open-source models
70
+ # follow the Huggingface Hub `owner/model_name` format. Use
71
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}
72
+ # to find available models.
73
+ # @!attribute [rw] page_size
74
+ # @return [::Integer]
75
+ # Optional. The target number of results to return in a single response.
76
+ # If not specified, a default value will be chosen by the service.
77
+ # Note that the response may include a partial list and a caller should
78
+ # only rely on the response's
79
+ # {::Google::Cloud::GkeRecommender::V1::FetchModelServersResponse#next_page_token next_page_token}
80
+ # to determine if there are more instances left to be queried.
81
+ # @!attribute [rw] page_token
82
+ # @return [::String]
83
+ # Optional. The value of
84
+ # {::Google::Cloud::GkeRecommender::V1::FetchModelServersResponse#next_page_token next_page_token}
85
+ # received from a previous `FetchModelServersRequest` call.
86
+ # Provide this to retrieve the subsequent page in a multi-page list of
87
+ # results. When paginating, all other parameters provided to
88
+ # `FetchModelServersRequest` must match the call that provided the page
89
+ # token.
90
+ class FetchModelServersRequest
91
+ include ::Google::Protobuf::MessageExts
92
+ extend ::Google::Protobuf::MessageExts::ClassMethods
93
+ end
94
+
95
+ # Response message for
96
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}.
97
+ # @!attribute [r] model_servers
98
+ # @return [::Array<::String>]
99
+ # Output only. List of available model servers. Open-source model servers use
100
+ # simplified, lowercase names (e.g., `vllm`).
101
+ # @!attribute [r] next_page_token
102
+ # @return [::String]
103
+ # Output only. A token which may be sent as
104
+ # [page_token][FetchModelServersRequest.page_token] in a subsequent
105
+ # `FetchModelServersRequest` call to retrieve the next page of results.
106
+ # If this field is omitted or empty, then there are no more results to
107
+ # return.
108
+ class FetchModelServersResponse
109
+ include ::Google::Protobuf::MessageExts
110
+ extend ::Google::Protobuf::MessageExts::ClassMethods
111
+ end
112
+
113
+ # Request message for
114
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_server_versions GkeInferenceQuickstart.FetchModelServerVersions}.
115
+ # @!attribute [rw] model
116
+ # @return [::String]
117
+ # Required. The model for which to list model server versions. Open-source
118
+ # models follow the Huggingface Hub `owner/model_name` format. Use
119
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}
120
+ # to find available models.
121
+ # @!attribute [rw] model_server
122
+ # @return [::String]
123
+ # Required. The model server for which to list versions. Open-source model
124
+ # servers use simplified, lowercase names (e.g., `vllm`). Use
125
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}
126
+ # to find available model servers.
127
+ # @!attribute [rw] page_size
128
+ # @return [::Integer]
129
+ # Optional. The target number of results to return in a single response.
130
+ # If not specified, a default value will be chosen by the service.
131
+ # Note that the response may include a partial list and a caller should
132
+ # only rely on the response's
133
+ # {::Google::Cloud::GkeRecommender::V1::FetchModelServerVersionsResponse#next_page_token next_page_token}
134
+ # to determine if there are more instances left to be queried.
135
+ # @!attribute [rw] page_token
136
+ # @return [::String]
137
+ # Optional. The value of
138
+ # {::Google::Cloud::GkeRecommender::V1::FetchModelServerVersionsResponse#next_page_token next_page_token}
139
+ # received from a previous `FetchModelServerVersionsRequest` call.
140
+ # Provide this to retrieve the subsequent page in a multi-page list of
141
+ # results. When paginating, all other parameters provided to
142
+ # `FetchModelServerVersionsRequest` must match the call that provided the
143
+ # page token.
144
+ class FetchModelServerVersionsRequest
145
+ include ::Google::Protobuf::MessageExts
146
+ extend ::Google::Protobuf::MessageExts::ClassMethods
147
+ end
148
+
149
+ # Response message for
150
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_server_versions GkeInferenceQuickstart.FetchModelServerVersions}.
151
+ # @!attribute [r] model_server_versions
152
+ # @return [::Array<::String>]
153
+ # Output only. A list of available model server versions.
154
+ # @!attribute [r] next_page_token
155
+ # @return [::String]
156
+ # Output only. A token which may be sent as
157
+ # [page_token][FetchModelServerVersionsRequest.page_token] in a subsequent
158
+ # `FetchModelServerVersionsRequest` call to retrieve the next page of
159
+ # results. If this field is omitted or empty, then there are no more results
160
+ # to return.
161
+ class FetchModelServerVersionsResponse
162
+ include ::Google::Protobuf::MessageExts
163
+ extend ::Google::Protobuf::MessageExts::ClassMethods
164
+ end
165
+
166
+ # Request message for
167
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_benchmarking_data GkeInferenceQuickstart.FetchBenchmarkingData}.
168
+ # @!attribute [rw] model_server_info
169
+ # @return [::Google::Cloud::GkeRecommender::V1::ModelServerInfo]
170
+ # Required. The model server configuration to get benchmarking data for. Use
171
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
172
+ # to find valid configurations.
173
+ # @!attribute [rw] instance_type
174
+ # @return [::String]
175
+ # Optional. The instance type to filter benchmarking data. Instance types are
176
+ # in the format `a2-highgpu-1g`. If not provided, all instance types for the
177
+ # given profile's `model_server_info` will be returned. Use
178
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
179
+ # to find available instance types.
180
+ # @!attribute [rw] pricing_model
181
+ # @return [::String]
182
+ # Optional. The pricing model to use for the benchmarking data. Defaults to
183
+ # `spot`.
184
+ class FetchBenchmarkingDataRequest
185
+ include ::Google::Protobuf::MessageExts
186
+ extend ::Google::Protobuf::MessageExts::ClassMethods
187
+ end
188
+
189
+ # Response message for
190
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_benchmarking_data GkeInferenceQuickstart.FetchBenchmarkingData}.
191
+ # @!attribute [r] profile
192
+ # @return [::Array<::Google::Cloud::GkeRecommender::V1::Profile>]
193
+ # Output only. List of profiles containing their respective benchmarking
194
+ # data.
195
+ class FetchBenchmarkingDataResponse
196
+ include ::Google::Protobuf::MessageExts
197
+ extend ::Google::Protobuf::MessageExts::ClassMethods
198
+ end
199
+
200
+ # Request message for
201
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}.
202
+ # @!attribute [rw] model
203
+ # @return [::String]
204
+ # Optional. The model to filter profiles by. Open-source models follow the
205
+ # Huggingface Hub `owner/model_name` format. If not provided, all models are
206
+ # returned. Use
207
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}
208
+ # to find available models.
209
+ # @!attribute [rw] model_server
210
+ # @return [::String]
211
+ # Optional. The model server to filter profiles by. If not provided, all
212
+ # model servers are returned. Use
213
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}
214
+ # to find available model servers for a given model.
215
+ # @!attribute [rw] model_server_version
216
+ # @return [::String]
217
+ # Optional. The model server version to filter profiles by. If not provided,
218
+ # all model server versions are returned. Use
219
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_server_versions GkeInferenceQuickstart.FetchModelServerVersions}
220
+ # to find available versions for a given model and server.
221
+ # @!attribute [rw] performance_requirements
222
+ # @return [::Google::Cloud::GkeRecommender::V1::PerformanceRequirements]
223
+ # Optional. The performance requirements to filter profiles. Profiles that do
224
+ # not meet these requirements are filtered out. If not provided, all profiles
225
+ # are returned.
226
+ # @!attribute [rw] page_size
227
+ # @return [::Integer]
228
+ # Optional. The target number of results to return in a single response. If
229
+ # not specified, a default value will be chosen by the service. Note that the
230
+ # response may include a partial list and a caller should only rely on the
231
+ # response's
232
+ # {::Google::Cloud::GkeRecommender::V1::FetchProfilesResponse#next_page_token next_page_token}
233
+ # to determine if there are more instances left to be queried.
234
+ # @!attribute [rw] page_token
235
+ # @return [::String]
236
+ # Optional. The value of
237
+ # {::Google::Cloud::GkeRecommender::V1::FetchProfilesResponse#next_page_token next_page_token}
238
+ # received from a previous `FetchProfilesRequest` call.
239
+ # Provide this to retrieve the subsequent page in a multi-page list of
240
+ # results. When paginating, all other parameters provided to
241
+ # `FetchProfilesRequest` must match the call that provided the page
242
+ # token.
243
+ class FetchProfilesRequest
244
+ include ::Google::Protobuf::MessageExts
245
+ extend ::Google::Protobuf::MessageExts::ClassMethods
246
+ end
247
+
248
+ # Performance requirements for a profile and/or model deployment.
249
+ # @!attribute [rw] target_ntpot_milliseconds
250
+ # @return [::Integer]
251
+ # Optional. The target Normalized Time Per Output Token (NTPOT) in
252
+ # milliseconds. NTPOT is calculated as `request_latency /
253
+ # total_output_tokens`. If not provided, this target will not be enforced.
254
+ # @!attribute [rw] target_ttft_milliseconds
255
+ # @return [::Integer]
256
+ # Optional. The target Time To First Token (TTFT) in milliseconds. TTFT is
257
+ # the time it takes to generate the first token for a request. If not
258
+ # provided, this target will not be enforced.
259
+ # @!attribute [rw] target_cost
260
+ # @return [::Google::Cloud::GkeRecommender::V1::Cost]
261
+ # Optional. The target cost for running a profile's model server. If not
262
+ # provided, this requirement will not be enforced.
263
+ class PerformanceRequirements
264
+ include ::Google::Protobuf::MessageExts
265
+ extend ::Google::Protobuf::MessageExts::ClassMethods
266
+ end
267
+
268
+ # Represents an amount of money in a specific currency.
269
+ # @!attribute [r] units
270
+ # @return [::Integer]
271
+ # Output only. The whole units of the amount.
272
+ # For example, if the currency is `"USD"`, then 1 unit is one US dollar.
273
+ # @!attribute [r] nanos
274
+ # @return [::Integer]
275
+ # Output only. Number of nano (10^-9) units of the amount.
276
+ # The value must be between -999,999,999 and +999,999,999 inclusive.
277
+ # If `units` is positive, `nanos` must be positive or zero.
278
+ # If `units` is zero, `nanos` can be positive, zero, or negative.
279
+ # If `units` is negative, `nanos` must be negative or zero.
280
+ # For example $-1.75 is represented as `units`=-1 and `nanos`=-750,000,000.
281
+ class Amount
282
+ include ::Google::Protobuf::MessageExts
283
+ extend ::Google::Protobuf::MessageExts::ClassMethods
284
+ end
285
+
286
+ # Cost for running a model deployment on a given instance type. Currently, only
287
+ # USD currency code is supported.
288
+ # @!attribute [rw] cost_per_million_output_tokens
289
+ # @return [::Google::Cloud::GkeRecommender::V1::Amount]
290
+ # Optional. The cost per million output tokens, calculated as:
291
+ # $/output token = GPU $/s / (1/output-to-input-cost-ratio * input tokens/s +
292
+ # output tokens/s)
293
+ # @!attribute [rw] cost_per_million_input_tokens
294
+ # @return [::Google::Cloud::GkeRecommender::V1::Amount]
295
+ # Optional. The cost per million input tokens. $/input token = ($/output
296
+ # token) / output-to-input-cost-ratio.
297
+ # @!attribute [rw] pricing_model
298
+ # @return [::String]
299
+ # Optional. The pricing model used to calculate the cost. Can be one of:
300
+ # `3-years-cud`, `1-year-cud`, `on-demand`, `spot`. If not provided, `spot`
301
+ # will be used.
302
+ # @!attribute [rw] output_input_cost_ratio
303
+ # @return [::Float]
304
+ # Optional. The output-to-input cost ratio. This determines how the total GPU
305
+ # cost is split between input and output tokens. If not provided, `4.0` is
306
+ # used, assuming a 4:1 output:input cost ratio.
307
+ class Cost
308
+ include ::Google::Protobuf::MessageExts
309
+ extend ::Google::Protobuf::MessageExts::ClassMethods
310
+ end
311
+
312
+ # Represents a range of throughput values in tokens per second.
313
+ # @!attribute [r] min
314
+ # @return [::Integer]
315
+ # Output only. The minimum value of the range.
316
+ # @!attribute [r] max
317
+ # @return [::Integer]
318
+ # Output only. The maximum value of the range.
319
+ class TokensPerSecondRange
320
+ include ::Google::Protobuf::MessageExts
321
+ extend ::Google::Protobuf::MessageExts::ClassMethods
322
+ end
323
+
324
+ # Represents a range of latency values in milliseconds.
325
+ # @!attribute [r] min
326
+ # @return [::Integer]
327
+ # Output only. The minimum value of the range.
328
+ # @!attribute [r] max
329
+ # @return [::Integer]
330
+ # Output only. The maximum value of the range.
331
+ class MillisecondRange
332
+ include ::Google::Protobuf::MessageExts
333
+ extend ::Google::Protobuf::MessageExts::ClassMethods
334
+ end
335
+
336
+ # Performance range for a model deployment.
337
+ # @!attribute [r] throughput_output_range
338
+ # @return [::Google::Cloud::GkeRecommender::V1::TokensPerSecondRange]
339
+ # Output only. The range of throughput in output tokens per second. This is
340
+ # measured as total_output_tokens_generated_by_server /
341
+ # elapsed_time_in_seconds.
342
+ # @!attribute [r] ttft_range
343
+ # @return [::Google::Cloud::GkeRecommender::V1::MillisecondRange]
344
+ # Output only. The range of TTFT (Time To First Token) in milliseconds. TTFT
345
+ # is the time it takes to generate the first token for a request.
346
+ # @!attribute [r] ntpot_range
347
+ # @return [::Google::Cloud::GkeRecommender::V1::MillisecondRange]
348
+ # Output only. The range of NTPOT (Normalized Time Per Output Token) in
349
+ # milliseconds. NTPOT is the request latency normalized by the number of
350
+ # output tokens, measured as request_latency / total_output_tokens.
351
+ class PerformanceRange
352
+ include ::Google::Protobuf::MessageExts
353
+ extend ::Google::Protobuf::MessageExts::ClassMethods
354
+ end
355
+
356
+ # Response message for
357
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}.
358
+ # @!attribute [r] profile
359
+ # @return [::Array<::Google::Cloud::GkeRecommender::V1::Profile>]
360
+ # Output only. List of profiles that match the given model server info and
361
+ # performance requirements (if provided).
362
+ # @!attribute [r] performance_range
363
+ # @return [::Google::Cloud::GkeRecommender::V1::PerformanceRange]
364
+ # Output only. The combined range of performance values observed across all
365
+ # profiles in this response.
366
+ # @!attribute [r] comments
367
+ # @return [::String]
368
+ # Output only. Additional comments related to the response.
369
+ # @!attribute [r] next_page_token
370
+ # @return [::String]
371
+ # Output only. A token which may be sent as
372
+ # [page_token][FetchProfilesRequest.page_token] in a subsequent
373
+ # `FetchProfilesRequest` call to retrieve the next page of results. If this
374
+ # field is omitted or empty, then there are no more results to return.
375
+ class FetchProfilesResponse
376
+ include ::Google::Protobuf::MessageExts
377
+ extend ::Google::Protobuf::MessageExts::ClassMethods
378
+ end
379
+
380
+ # Model server information. Valid model server info combinations can
381
+ # be found using
382
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}.
383
+ # @!attribute [rw] model
384
+ # @return [::String]
385
+ # Required. The model. Open-source models follow the Huggingface Hub
386
+ # `owner/model_name` format. Use
387
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_models GkeInferenceQuickstart.FetchModels}
388
+ # to find available models.
389
+ # @!attribute [rw] model_server
390
+ # @return [::String]
391
+ # Required. The model server. Open-source model servers use simplified,
392
+ # lowercase names (e.g., `vllm`). Use
393
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_servers GkeInferenceQuickstart.FetchModelServers}
394
+ # to find available servers.
395
+ # @!attribute [rw] model_server_version
396
+ # @return [::String]
397
+ # Optional. The model server version. Use
398
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_model_server_versions GkeInferenceQuickstart.FetchModelServerVersions}
399
+ # to find available versions. If not provided, the latest available version
400
+ # is used.
401
+ class ModelServerInfo
402
+ include ::Google::Protobuf::MessageExts
403
+ extend ::Google::Protobuf::MessageExts::ClassMethods
404
+ end
405
+
406
+ # Resources used by a model deployment.
407
+ # @!attribute [r] accelerator_count
408
+ # @return [::Integer]
409
+ # Output only. The number of accelerators (e.g., GPUs or TPUs) used by the
410
+ # model deployment on the Kubernetes node.
411
+ class ResourcesUsed
412
+ include ::Google::Protobuf::MessageExts
413
+ extend ::Google::Protobuf::MessageExts::ClassMethods
414
+ end
415
+
416
+ # Performance statistics for a model deployment.
417
+ # @!attribute [r] queries_per_second
418
+ # @return [::Float]
419
+ # Output only. The number of queries per second.
420
+ # Note: This metric can vary widely based on context length and may not be a
421
+ # reliable measure of LLM throughput.
422
+ # @!attribute [r] output_tokens_per_second
423
+ # @return [::Integer]
424
+ # Output only. The number of output tokens per second. This is the throughput
425
+ # measured as total_output_tokens_generated_by_server /
426
+ # elapsed_time_in_seconds.
427
+ # @!attribute [r] ntpot_milliseconds
428
+ # @return [::Integer]
429
+ # Output only. The Normalized Time Per Output Token (NTPOT) in milliseconds.
430
+ # This is the request latency normalized by the number of output tokens,
431
+ # measured as request_latency / total_output_tokens.
432
+ # @!attribute [r] ttft_milliseconds
433
+ # @return [::Integer]
434
+ # Output only. The Time To First Token (TTFT) in milliseconds. This is the
435
+ # time it takes to generate the first token for a request.
436
+ # @!attribute [r] cost
437
+ # @return [::Array<::Google::Cloud::GkeRecommender::V1::Cost>]
438
+ # Output only. The cost of running the model deployment.
439
+ class PerformanceStats
440
+ include ::Google::Protobuf::MessageExts
441
+ extend ::Google::Protobuf::MessageExts::ClassMethods
442
+ end
443
+
444
+ # A profile containing information about a model deployment.
445
+ # @!attribute [r] model_server_info
446
+ # @return [::Google::Cloud::GkeRecommender::V1::ModelServerInfo]
447
+ # Output only. The model server configuration. Use
448
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
449
+ # to find valid configurations.
450
+ # @!attribute [r] accelerator_type
451
+ # @return [::String]
452
+ # Output only. The accelerator type. Expected format: `nvidia-h100-80gb`.
453
+ # @!attribute [r] tpu_topology
454
+ # @return [::String]
455
+ # Output only. The TPU topology (if applicable).
456
+ # @!attribute [r] instance_type
457
+ # @return [::String]
458
+ # Output only. The instance type. Expected format: `a2-highgpu-1g`.
459
+ # @!attribute [r] resources_used
460
+ # @return [::Google::Cloud::GkeRecommender::V1::ResourcesUsed]
461
+ # Output only. The resources used by the model deployment.
462
+ # @!attribute [r] performance_stats
463
+ # @return [::Array<::Google::Cloud::GkeRecommender::V1::PerformanceStats>]
464
+ # Output only. The performance statistics for this profile.
465
+ class Profile
466
+ include ::Google::Protobuf::MessageExts
467
+ extend ::Google::Protobuf::MessageExts::ClassMethods
468
+ end
469
+
470
+ # Request message for
471
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#generate_optimized_manifest GkeInferenceQuickstart.GenerateOptimizedManifest}.
472
+ # @!attribute [rw] model_server_info
473
+ # @return [::Google::Cloud::GkeRecommender::V1::ModelServerInfo]
474
+ # Required. The model server configuration to generate the manifest for. Use
475
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
476
+ # to find valid configurations.
477
+ # @!attribute [rw] accelerator_type
478
+ # @return [::String]
479
+ # Required. The accelerator type. Use
480
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#fetch_profiles GkeInferenceQuickstart.FetchProfiles}
481
+ # to find valid accelerators for a given `model_server_info`.
482
+ # @!attribute [rw] kubernetes_namespace
483
+ # @return [::String]
484
+ # Optional. The kubernetes namespace to deploy the manifests in.
485
+ # @!attribute [rw] performance_requirements
486
+ # @return [::Google::Cloud::GkeRecommender::V1::PerformanceRequirements]
487
+ # Optional. The performance requirements to use for generating Horizontal Pod
488
+ # Autoscaler (HPA) resources. If provided, the manifest includes HPA
489
+ # resources to adjust the model server replica count to maintain the
490
+ # specified targets (e.g., NTPOT, TTFT) at a P50 latency. Cost targets are
491
+ # not currently supported for HPA generation. If the specified targets are
492
+ # not achievable, the HPA manifest will not be generated.
493
+ # @!attribute [rw] storage_config
494
+ # @return [::Google::Cloud::GkeRecommender::V1::StorageConfig]
495
+ # Optional. The storage configuration for the model. If not provided, the
496
+ # model is loaded from Huggingface.
497
+ class GenerateOptimizedManifestRequest
498
+ include ::Google::Protobuf::MessageExts
499
+ extend ::Google::Protobuf::MessageExts::ClassMethods
500
+ end
501
+
502
+ # A Kubernetes manifest.
503
+ # @!attribute [r] kind
504
+ # @return [::String]
505
+ # Output only. Kubernetes resource kind.
506
+ # @!attribute [r] api_version
507
+ # @return [::String]
508
+ # Output only. Kubernetes API version.
509
+ # @!attribute [r] content
510
+ # @return [::String]
511
+ # Output only. YAML content.
512
+ class KubernetesManifest
513
+ include ::Google::Protobuf::MessageExts
514
+ extend ::Google::Protobuf::MessageExts::ClassMethods
515
+ end
516
+
517
+ # Response message for
518
+ # {::Google::Cloud::GkeRecommender::V1::GkeInferenceQuickstart::Client#generate_optimized_manifest GkeInferenceQuickstart.GenerateOptimizedManifest}.
519
+ # @!attribute [r] kubernetes_manifests
520
+ # @return [::Array<::Google::Cloud::GkeRecommender::V1::KubernetesManifest>]
521
+ # Output only. A list of generated Kubernetes manifests.
522
+ # @!attribute [r] comments
523
+ # @return [::Array<::String>]
524
+ # Output only. Comments related to deploying the generated manifests.
525
+ # @!attribute [r] manifest_version
526
+ # @return [::String]
527
+ # Output only. Additional information about the versioned dependencies used
528
+ # to generate the manifests. See [Run best practice inference with GKE
529
+ # Inference Quickstart
530
+ # recipes](https://cloud.google.com/kubernetes-engine/docs/how-to/machine-learning/inference/inference-quickstart)
531
+ # for details.
532
+ class GenerateOptimizedManifestResponse
533
+ include ::Google::Protobuf::MessageExts
534
+ extend ::Google::Protobuf::MessageExts::ClassMethods
535
+ end
536
+
537
+ # Storage configuration for a model deployment.
538
+ # @!attribute [rw] model_bucket_uri
539
+ # @return [::String]
540
+ # Optional. The Google Cloud Storage bucket URI to load the model from. This
541
+ # URI must point to the directory containing the model's config file
542
+ # (`config.json`) and model weights. A tuned GCSFuse setup can improve
543
+ # LLM Pod startup time by more than 7x. Expected format:
544
+ # `gs://<bucket-name>/<path-to-model>`.
545
+ # @!attribute [rw] xla_cache_bucket_uri
546
+ # @return [::String]
547
+ # Optional. The URI for the GCS bucket containing the XLA compilation cache.
548
+ # If using TPUs, the XLA cache will be written to the same path as
549
+ # `model_bucket_uri`. This can speed up vLLM model preparation for repeated
550
+ # deployments.
551
+ class StorageConfig
552
+ include ::Google::Protobuf::MessageExts
553
+ extend ::Google::Protobuf::MessageExts::ClassMethods
554
+ end
555
+ end
556
+ end
557
+ end
558
+ end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Copyright 2025 Google LLC
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # https://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # Auto-generated by gapic-generator-ruby. DO NOT EDIT!
18
+
19
+
20
+ module Google
21
+ module Protobuf
22
+ # A Duration represents a signed, fixed-length span of time represented
23
+ # as a count of seconds and fractions of seconds at nanosecond
24
+ # resolution. It is independent of any calendar and concepts like "day"
25
+ # or "month". It is related to Timestamp in that the difference between
26
+ # two Timestamp values is a Duration and it can be added or subtracted
27
+ # from a Timestamp. Range is approximately +-10,000 years.
28
+ #
29
+ # # Examples
30
+ #
31
+ # Example 1: Compute Duration from two Timestamps in pseudo code.
32
+ #
33
+ # Timestamp start = ...;
34
+ # Timestamp end = ...;
35
+ # Duration duration = ...;
36
+ #
37
+ # duration.seconds = end.seconds - start.seconds;
38
+ # duration.nanos = end.nanos - start.nanos;
39
+ #
40
+ # if (duration.seconds < 0 && duration.nanos > 0) {
41
+ # duration.seconds += 1;
42
+ # duration.nanos -= 1000000000;
43
+ # } else if (duration.seconds > 0 && duration.nanos < 0) {
44
+ # duration.seconds -= 1;
45
+ # duration.nanos += 1000000000;
46
+ # }
47
+ #
48
+ # Example 2: Compute Timestamp from Timestamp + Duration in pseudo code.
49
+ #
50
+ # Timestamp start = ...;
51
+ # Duration duration = ...;
52
+ # Timestamp end = ...;
53
+ #
54
+ # end.seconds = start.seconds + duration.seconds;
55
+ # end.nanos = start.nanos + duration.nanos;
56
+ #
57
+ # if (end.nanos < 0) {
58
+ # end.seconds -= 1;
59
+ # end.nanos += 1000000000;
60
+ # } else if (end.nanos >= 1000000000) {
61
+ # end.seconds += 1;
62
+ # end.nanos -= 1000000000;
63
+ # }
64
+ #
65
+ # Example 3: Compute Duration from datetime.timedelta in Python.
66
+ #
67
+ # td = datetime.timedelta(days=3, minutes=10)
68
+ # duration = Duration()
69
+ # duration.FromTimedelta(td)
70
+ #
71
+ # # JSON Mapping
72
+ #
73
+ # In JSON format, the Duration type is encoded as a string rather than an
74
+ # object, where the string ends in the suffix "s" (indicating seconds) and
75
+ # is preceded by the number of seconds, with nanoseconds expressed as
76
+ # fractional seconds. For example, 3 seconds with 0 nanoseconds should be
77
+ # encoded in JSON format as "3s", while 3 seconds and 1 nanosecond should
78
+ # be expressed in JSON format as "3.000000001s", and 3 seconds and 1
79
+ # microsecond should be expressed in JSON format as "3.000001s".
80
+ # @!attribute [rw] seconds
81
+ # @return [::Integer]
82
+ # Signed seconds of the span of time. Must be from -315,576,000,000
83
+ # to +315,576,000,000 inclusive. Note: these bounds are computed from:
84
+ # 60 sec/min * 60 min/hr * 24 hr/day * 365.25 days/year * 10000 years
85
+ # @!attribute [rw] nanos
86
+ # @return [::Integer]
87
+ # Signed fractions of a second at nanosecond resolution of the span
88
+ # of time. Durations less than one second are represented with a 0
89
+ # `seconds` field and a positive or negative `nanos` field. For durations
90
+ # of one second or more, a non-zero value for the `nanos` field must be
91
+ # of the same sign as the `seconds` field. Must be from -999,999,999
92
+ # to +999,999,999 inclusive.
93
+ class Duration
94
+ include ::Google::Protobuf::MessageExts
95
+ extend ::Google::Protobuf::MessageExts::ClassMethods
96
+ end
97
+ end
98
+ end