xpk 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +131 -0
- xpk/commands/cluster.py +808 -0
- xpk/commands/cluster_gcluster.py +269 -0
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +243 -0
- xpk/commands/inspector.py +357 -0
- xpk/commands/job.py +199 -0
- xpk/commands/kind.py +283 -0
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +140 -0
- xpk/commands/storage.py +267 -0
- xpk/commands/version.py +27 -0
- xpk/commands/workload.py +889 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +62 -0
- xpk/core/blueprint/blueprint_generator.py +708 -0
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +200 -0
- xpk/core/commands.py +356 -0
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +176 -0
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +444 -0
- xpk/core/kueue.py +358 -0
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +361 -0
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +377 -0
- xpk/core/ray.py +222 -0
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +1432 -0
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +341 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +129 -0
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
- xpk/main.py +75 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +43 -0
- xpk/parser/cluster.py +662 -0
- xpk/parser/common.py +259 -0
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +135 -0
- xpk/parser/info.py +64 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +147 -0
- xpk/parser/kind.py +95 -0
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +59 -0
- xpk/parser/storage.py +316 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +726 -0
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +88 -0
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- xpk/utils/yaml.py +30 -0
- xpk-0.0.1.dist-info/LICENSE +202 -0
- xpk-0.0.1.dist-info/METADATA +1498 -0
- xpk-0.0.1.dist-info/RECORD +92 -0
- xpk-0.0.1.dist-info/WHEEL +5 -0
- xpk-0.0.1.dist-info/entry_points.txt +2 -0
- xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/parser/cluster.py
ADDED
|
@@ -0,0 +1,662 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..commands.cluster import (
|
|
18
|
+
cluster_cacheimage,
|
|
19
|
+
cluster_create,
|
|
20
|
+
cluster_create_pathways,
|
|
21
|
+
cluster_create_ray_cluster,
|
|
22
|
+
cluster_delete,
|
|
23
|
+
cluster_describe,
|
|
24
|
+
cluster_list,
|
|
25
|
+
)
|
|
26
|
+
from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
|
|
27
|
+
from .common import add_shared_arguments
|
|
28
|
+
from .validators import name_type
|
|
29
|
+
from ..commands.config import xpk_cfg
|
|
30
|
+
from ..core.config import CFG_BUCKET_KEY
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _add_cluster_create_argument_groups(parser, subcommand):
  """Creates the four argument groups used by every cluster create variant.

  Args:
    parser: the subcommand parser the groups are attached to.
    subcommand: subcommand name ('create', 'create-pathways', 'create-ray')
      interpolated into each group's description.

  Returns:
    Tuple of (required, optional, capacity, tensorboard) argument groups, in
    the order they were created (which is the order argparse prints them).
  """
  required_arguments = parser.add_argument_group(
      'Required Arguments',
      f'Arguments required for cluster {subcommand}.',
  )
  optional_arguments = parser.add_argument_group(
      'Optional Arguments',
      f'Arguments optional for cluster {subcommand}.',
  )
  capacity_arguments = parser.add_argument_group(
      'Capacity Arguments',
      f'Arguments related to capacity for cluster {subcommand}.',
  )
  tensorboard_arguments = parser.add_argument_group(
      'Optional Vertex AI Tensorboard Arguments',
      f'Arguments for creating Vertex AI Tensorboard in cluster {subcommand}.',
  )
  return (
      required_arguments,
      optional_arguments,
      capacity_arguments,
      tensorboard_arguments,
  )


def _set_cluster_create_parsers(cluster_subcommands):
  """Registers the "create", "create-pathways" and "create-ray" subcommands.

  Subcommand-specific flags are added first; the flags shared by all three
  variants are appended afterwards through the add_shared_cluster_create_*
  helpers, so the shared flags appear after the specific ones in each group.

  Args:
    cluster_subcommands: the subparsers action of the cluster parser.
  """
  ### "cluster create" command parser ###
  cluster_create_parser = cluster_subcommands.add_parser(
      'create', help='Create cloud clusters.'
  )
  (
      cluster_create_required_arguments,
      cluster_create_optional_arguments,
      cluster_create_capacity_arguments,
      cluster_create_tensorboard_arguments,
  ) = _add_cluster_create_argument_groups(cluster_create_parser, 'create')

  ### Required arguments specific to "cluster create"
  # Exactly one of --tpu-type / --device-type has to be given.
  cluster_device_group = (
      cluster_create_required_arguments.add_mutually_exclusive_group(
          required=True
      )
  )
  cluster_device_group.add_argument(
      '--tpu-type',
      type=str,
      default=None,
      help='The tpu type to use, v5litepod-16, etc.',
  )
  cluster_device_group.add_argument(
      '--device-type',
      type=str,
      default=None,
      help=(
          'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
          ' h100-80gb-8, n2-standard-32-4 etc.'
      ),
  )

  ### Optional arguments specific to "cluster create"
  cluster_create_optional_arguments.add_argument(
      '--cluster-state-gcs-bucket',
      type=str,
      # The default is read from the xpk config when the parser is built.
      default=xpk_cfg.get(CFG_BUCKET_KEY),
      help='The name of the bucket to store cluster state.',
      required=False,
  )
  cluster_create_optional_arguments.add_argument(
      '--num-nodes',
      type=int,
      default=2,
      help='The number of nodes for a cluster, defaults to 2.',
      required=False,
  )
  cluster_create_optional_arguments.add_argument(
      '--enable-pathways',
      action='store_true',
      help=(
          'Please use `xpk cluster create-pathways` instead to'
          ' enable cluster to accept Pathways workloads.'
      ),
  )

  ### Autoprovisioning arguments specific to "cluster create"
  cluster_create_autoprovisioning_arguments = (
      cluster_create_parser.add_argument_group(
          'Optional Autoprovisioning Arguments',
          'Arguments optional for enabling autoprovisioning.',
      )
  )
  cluster_create_autoprovisioning_arguments.add_argument(
      '--enable-autoprovisioning',
      action='store_true',
      help=(
          'Enable GKE features for autoprovisioning node pools in GKE clusters.'
      ),
  )
  cluster_create_autoprovisioning_arguments.add_argument(
      '--autoprovisioning-min-chips',
      type=int,
      help=(
          'Optionally set the minimum autoprovisioning accelerator resources in'
          ' units of chips. By default, autoprovisioning will use the number of'
          ' resources in the cluster as the minimum, and maximum.'
      ),
  )
  cluster_create_autoprovisioning_arguments.add_argument(
      '--autoprovisioning-max-chips',
      type=int,
      help=(
          'Optionally set the maximum autoprovisioning accelerator resources in'
          ' units of chips. By default, autoprovisioning will use the number of'
          ' resources in the cluster as the minimum, and maximum.'
      ),
  )

  ### "cluster create-pathways" command parser ###
  cluster_create_pathways_parser = cluster_subcommands.add_parser(
      'create-pathways',
      help='Create Pathways-on-Cloud clusters.',
  )
  (
      cluster_create_pathways_required_arguments,
      cluster_create_pathways_optional_arguments,
      cluster_create_pathways_capacity_arguments,
      cluster_create_pathways_tensorboard_arguments,
  ) = _add_cluster_create_argument_groups(
      cluster_create_pathways_parser, 'create-pathways'
  )

  ### Pathways required arguments specific to "cluster create"
  # NOTE(review): unlike "create-ray", --tpu-type is not marked required=True
  # here although it lives in the "Required Arguments" group - confirm whether
  # that is intentional before tightening it.
  cluster_create_pathways_required_arguments.add_argument(
      '--tpu-type',
      type=str,
      default=None,
      help='The tpu type to use, v5litepod-16, etc.',
  )

  ### "cluster create-ray" command parser
  cluster_create_ray_cluster_parser = cluster_subcommands.add_parser(
      'create-ray',
      help='Create RayCluster',
  )
  (
      cluster_create_ray_cluster_required_arguments,
      cluster_create_ray_cluster_optional_arguments,
      cluster_create_ray_cluster_capacity_arguments,
      cluster_create_ray_cluster_tensorboard_arguments,
  ) = _add_cluster_create_argument_groups(
      cluster_create_ray_cluster_parser, 'create-ray'
  )

  ### RayCluster required arguments specific to "cluster create"
  cluster_create_ray_cluster_required_arguments.add_argument(
      '--tpu-type',
      type=str,
      default=None,
      help='The tpu type to use, v5litepod-16, etc.',
      required=True,
  )
  # TODO(bzmarke): Add --device-type to support GPU/CPU
  cluster_create_ray_cluster_required_arguments.add_argument(
      '--ray-version',
      type=str,
      default=None,
      help="The Ray version to use, e.g. '2.38.0'",
      required=True,
  )
  cluster_create_ray_cluster_optional_arguments.add_argument(
      '--enable-pathways',
      action='store_true',
      help=(
          'DEPRECATING SOON!!! Please use `xpk cluster create-pathways`.'
          ' Enable cluster to accept Pathways workloads.'
      ),
  )

  # Flags common to all three create variants.
  add_shared_cluster_create_required_arguments([
      cluster_create_required_arguments,
      cluster_create_pathways_required_arguments,
      cluster_create_ray_cluster_required_arguments,
  ])
  add_shared_cluster_create_optional_arguments([
      cluster_create_optional_arguments,
      cluster_create_pathways_optional_arguments,
      cluster_create_ray_cluster_optional_arguments,
  ])
  add_shared_cluster_create_capacity_arguments([
      cluster_create_capacity_arguments,
      cluster_create_pathways_capacity_arguments,
      cluster_create_ray_cluster_capacity_arguments,
  ])
  add_shared_cluster_create_tensorboard_arguments([
      cluster_create_tensorboard_arguments,
      cluster_create_pathways_tensorboard_arguments,
      cluster_create_ray_cluster_tensorboard_arguments,
  ])

  cluster_create_parser.set_defaults(func=cluster_create)
  cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
  cluster_create_ray_cluster_parser.set_defaults(
      func=cluster_create_ray_cluster
  )


def _set_cluster_delete_parser(cluster_subcommands):
  """Registers the "delete" subcommand and binds it to cluster_delete.

  Args:
    cluster_subcommands: the subparsers action of the cluster parser.
  """
  ### "cluster delete" command parser ###
  cluster_delete_parser = cluster_subcommands.add_parser(
      'delete',
      help='Delete cloud clusters.',
  )
  cluster_delete_required_arguments = cluster_delete_parser.add_argument_group(
      'Required Arguments',
      'Arguments required for cluster delete.',
  )
  cluster_delete_optional_arguments = cluster_delete_parser.add_argument_group(
      'Optional Arguments', 'Arguments optional for cluster delete.'
  )

  ### Required arguments
  cluster_delete_required_arguments.add_argument(
      '--cluster',
      type=name_type,
      default=None,
      help='The name of the cluster to be deleted.',
      required=True,
  )

  ### Optional Arguments
  cluster_delete_optional_arguments.add_argument(
      '--cluster-state-gcs-bucket',
      type=str,
      # The default is read from the xpk config when the parser is built.
      default=xpk_cfg.get(CFG_BUCKET_KEY),
      help='The name of the bucket to store cluster state.',
      required=False,
  )
  add_shared_arguments(cluster_delete_optional_arguments)
  cluster_delete_parser.set_defaults(func=cluster_delete)
  # --force is attached to the parser itself rather than to one of the groups.
  # NOTE(review): the help text talks about "workload deletion" although the
  # flag belongs to "cluster delete" - confirm the intended wording.
  cluster_delete_parser.add_argument(
      '--force',
      action='store_true',
      help=(
          'Forces workload deletion command to run without additional approval.'
      ),
  )


def _set_cluster_cacheimage_parser(cluster_subcommands):
  """Registers the "cacheimage" subcommand and binds it to cluster_cacheimage.

  Args:
    cluster_subcommands: the subparsers action of the cluster parser.
  """
  ### "cluster cacheimage" command parser ###
  cluster_cacheimage_parser = cluster_subcommands.add_parser(
      'cacheimage',
      help='Cache image.',
  )
  cluster_cacheimage_required_arguments = (
      cluster_cacheimage_parser.add_argument_group(
          'Required Arguments',
          'Arguments required for cluster cacheimage.',
      )
  )
  cluster_cacheimage_optional_arguments = (
      cluster_cacheimage_parser.add_argument_group(
          'Optional Arguments', 'Arguments optional for cluster cacheimage.'
      )
  )
  # Exactly one of --tpu-type / --device-type has to be given.
  cluster_cacheimage_group = (
      cluster_cacheimage_parser.add_mutually_exclusive_group(required=True)
  )

  ### Device Type Argument
  cluster_cacheimage_group.add_argument(
      '--tpu-type',
      type=str,
      default=None,
      help='The tpu type to cache images on, v5litepod-16, etc.',
  )
  cluster_cacheimage_group.add_argument(
      '--device-type',
      type=str,
      default=None,
      help=(
          'The device type to cache images on (can be tpu or gpu),'
          ' v5litepod-16, h100-80gb-8, etc.'
      ),
  )

  ### Required arguments
  cluster_cacheimage_required_arguments.add_argument(
      '--cluster',
      type=name_type,
      default=None,
      help='The name of the cluster to cache the image.',
      required=True,
  )
  cluster_cacheimage_required_arguments.add_argument(
      '--docker-image',
      type=str,
      default=None,
      help='The docker-image to cache.',
      required=True,
  )

  ### Optional Arguments
  add_shared_arguments(cluster_cacheimage_optional_arguments)
  cluster_cacheimage_optional_arguments.add_argument(
      '--cache-key',
      type=str,
      default='containerimage',
      help='The key to cache the docker image under.',
      required=False,
  )
  cluster_cacheimage_parser.set_defaults(func=cluster_cacheimage)


def _set_cluster_describe_parser(cluster_subcommands):
  """Registers the "describe" subcommand and binds it to cluster_describe.

  Args:
    cluster_subcommands: the subparsers action of the cluster parser.
  """
  ### "cluster describe" command parser ###
  cluster_describe_parser = cluster_subcommands.add_parser(
      'describe',
      help='Describe a cluster.',
  )
  cluster_describe_required_arguments = (
      cluster_describe_parser.add_argument_group(
          'Required Arguments',
          'Arguments required for cluster describe.',
      )
  )
  cluster_describe_optional_arguments = (
      cluster_describe_parser.add_argument_group(
          'Optional Arguments', 'Arguments optional for cluster describe.'
      )
  )

  ### Required arguments
  cluster_describe_required_arguments.add_argument(
      '--cluster',
      type=name_type,
      default=None,
      help='The name of the cluster to be described.',
      required=True,
  )
  ### Optional Arguments
  add_shared_arguments(cluster_describe_optional_arguments)

  cluster_describe_parser.set_defaults(func=cluster_describe)


def _set_cluster_list_parser(cluster_subcommands):
  """Registers the "list" subcommand and binds it to cluster_list.

  Args:
    cluster_subcommands: the subparsers action of the cluster parser.
  """
  # "cluster list" command parser.
  cluster_list_parser = cluster_subcommands.add_parser(
      'list', help='List cloud clusters.'
  )
  cluster_list_optional_arguments = cluster_list_parser.add_argument_group(
      'Optional Arguments', 'Arguments optional for cluster list.'
  )
  ### Optional Arguments
  add_shared_arguments(cluster_list_optional_arguments)

  cluster_list_parser.set_defaults(func=cluster_list)


def set_cluster_parser(cluster_parser):
  """Adds every "cluster" subcommand and its arguments to cluster_parser.

  The subcommands are registered in this order: create, create-pathways,
  create-ray, delete, cacheimage, describe, list. Each subcommand parser gets
  its handler stored under `func` through set_defaults.

  Args:
    cluster_parser: the parser of the "cluster" command; subparsers are added
      to it with dest 'xpk_cluster_subcommands'.
  """
  cluster_subcommands = cluster_parser.add_subparsers(
      title='cluster subcommands',
      dest='xpk_cluster_subcommands',
      help=(
          'These are commands related to cluster management. Look at help for'
          ' specific subcommands for more details.'
      ),
  )

  # Registration order is the order shown in `--help`.
  _set_cluster_create_parsers(cluster_subcommands)
  _set_cluster_delete_parser(cluster_subcommands)
  _set_cluster_cacheimage_parser(cluster_subcommands)
  _set_cluster_describe_parser(cluster_subcommands)
  _set_cluster_list_parser(cluster_subcommands)
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def add_shared_cluster_create_required_arguments(args_parsers):
  """Registers the required --cluster flag on each given argument group.

  Args:
    args_parsers: iterable of "required arguments" parsers/groups of the
      cluster create subcommands; each one receives the same flag.
  """
  cluster_help = (
      'The name of the cluster. Will be used as the prefix for internal'
      ' objects in the cluster.'
  )
  for required_group in args_parsers:
    # The value is converted with name_type while parsing.
    required_group.add_argument(
        '--cluster',
        required=True,
        default=None,
        type=name_type,
        help=cluster_help,
    )
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def add_shared_cluster_create_optional_arguments(args_parsers):
  """Adds the optional flags shared by all cluster create subcommands.

  For every parser in the list the common arguments from add_shared_arguments
  are added first, followed by the cluster-create specific optional flags
  (maintenance interval, GKE version, slices, machine types, custom gcloud
  arguments, private cluster settings and CSI driver switches).

  Args:
    args_parsers: list of cluster create optional arguments parsers.
  """
  for custom_parser in args_parsers:
    add_shared_arguments(custom_parser)
    custom_parser.add_argument(
        '--host-maintenance-interval',
        type=str,
        choices=['AS_NEEDED', 'PERIODIC'],
        default='AS_NEEDED',
        help='The maintenance policy of the cluster and respective clusters.',
    )
    custom_parser.add_argument(
        '--gke-version',
        type=str,
        help=(
            'The GKE version of the cluster and respective clusters. The'
            ' default is determined dynamically based on RAPID channel'
            ' recommended version.'
        ),
    )
    custom_parser.add_argument(
        '--num-slices',
        type=int,
        default=1,
        help='The number of slices to run the job on, defaults to 1.',
        required=False,
    )
    custom_parser.add_argument(
        '--pathways-gce-machine-type',
        type=str,
        default='n1-standard-32',
        help='The CPU type for Pathways CPU nodepools',
    )
    custom_parser.add_argument(
        '--default-pool-cpu-machine-type',
        type=str,
        default='e2-standard-16',
        help=(
            'Set the machine type within the default cpu node pool. For'
            ' regional clusters, all zones must support the machine type.'
        ),
    )
    custom_parser.add_argument(
        '--cluster-cpu-machine-type',
        type=str,
        default='',
        help=(
            'Getting deprecated soon! Please use'
            ' --default-pool-cpu-machine-type instead, to denote the machine'
            ' type of the default cpu node pool. Set the machine type of other'
            ' cpu nodepools using --device-type.'
        ),
    )
    custom_parser.add_argument(
        '--default-pool-cpu-num-nodes',
        type=int,
        default=6,
        help=(
            'Set the number of nodes within the default cpu node pool. This is'
            ' set to 6 by default. Autoscaling is enabled to scale this value'
            ' over time.'
        ),
    )
    custom_parser.add_argument(
        '--custom-cluster-arguments',
        type=str,
        default='',
        help=(
            'Users can add their own arguments to customize their cluster'
            ' create command. Do note, these will not override already used'
            ' cluster creation arguments. e.g.'
            " --custom-cluster-arguments='--network=mtu9k --subnetwork=mtu9k'"
        ),
    )
    custom_parser.add_argument(
        '--custom-nodepool-arguments',
        type=str,
        default='',
        help=(
            'Users can add their own arguments to customize their node pool'
            ' create command. Do note, these will not override already used'
            ' node pool creation arguments. e.g.'
            ' --custom-nodepool-arguments="--disk-size=300"'
        ),
    )
    custom_parser.add_argument(
        '--force',
        action='store_true',
        help=(
            'Forces node pool creation and delete commands to run without'
            ' additional approval.'
        ),
    )
    custom_parser.add_argument(
        '--custom-tpu-nodepool-arguments',
        type=str,
        default='',
        help=(
            'DEPRECATING SOON! Please use --custom-nodepool-arguments to'
            ' customize node pool create command. Do note, these will not'
            ' override already used node pool creation arguments. Example usage'
            ' --custom-tpu-nodepool-arguments="--enable-ip-alias"'
        ),
    )
    custom_parser.add_argument(
        '--private',
        action='store_true',
        help=(
            'Creates a private GKE cluster, a VPC-native cluster in which Nodes'
            ' and Pods are isolated from the internet. If set,'
            ' master_authorized_networks will also be enabled and access to the'
            " cluster's control plane will be restricted only to current"
            " machine's IP address unless more IP ranges are authorized by"
            ' providing --authorized-networks. This works only on creating new'
            ' clusters.'
        ),
    )
    custom_parser.add_argument(
        '--authorized-networks',
        # 'extend' with nargs='+' collects every given CIDR into one flat list,
        # also when the flag is repeated.
        action='extend',
        nargs='+',
        help=(
            'Sets the provided cidrs as authorized IP ranges to access the'
            " private cluster's control plane. Access to the control plane will"
            " be provided to current machine's IP address even if"
            ' --authorized-networks is not set or it does not cover the IP'
            ' address. If set, --private is considered true and a private'
            ' cluster will be provisioned. It replaces existing authorized'
            ' networks if used with an existing private cluster.'
            ' Example usage: --authorized-networks 1.2.3.0/24 1.2.4.5/32'
        ),
    )
    custom_parser.add_argument(
        '--enable-workload-identity',
        action='store_true',
        help=(
            'Enable Workload Identity Federation on the cluster and node-pools.'
        ),
    )
    custom_parser.add_argument(
        '--enable-gcsfuse-csi-driver',
        action='store_true',
        help=(
            'Enable GCSFuse driver on the cluster. This enables Workload'
            ' Identity Federation. When using A3 ultra/A3 mega Workload'
            ' Identity is enabled by default.'
        ),
    )

    custom_parser.add_argument(
        '--enable-gcpfilestore-csi-driver',
        action='store_true',
        help=(
            'Enable GCPFilestore driver on the cluster. This enables Workload'
            ' Identity Federation.'
        ),
    )
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def add_shared_cluster_create_tensorboard_arguments(args_parsers):
  """Registers the Vertex AI Tensorboard flags on each given argument group.

  Three flags are added per group: --create-vertex-tensorboard,
  --tensorboard-region and --tensorboard-name.

  Args:
    args_parsers: iterable of tensorboard argument parsers/groups of the
      cluster create subcommands.
  """
  region_help = (
      'The region to create Vertex Tensorboard instance in. Visit'
      ' https://cloud.google.com/vertex-ai/docs/general/locations#available-regions'
      ' to view regions supported by Vertex AI. By default, Tensorboard'
      ' instance will be created in us-central1.'
  )
  # The default instance name shown in the help embeds the imported constant.
  name_help = (
      'The name of Vertex Tensorboard instance to create. If not'
      ' specified, a Tensorboard instance with the name'
      f' <cluster>-{DEFAULT_VERTEX_TENSORBOARD_NAME} will be'
      ' created.'
  )
  for tensorboard_group in args_parsers:
    tensorboard_group.add_argument(
        '--create-vertex-tensorboard',
        help='Set this flag to create a Tensorboard instance in Vertex AI.',
        action='store_true',
    )
    tensorboard_group.add_argument(
        '--tensorboard-region',
        default='us-central1',
        type=str,
        help=region_help,
    )
    tensorboard_group.add_argument(
        '--tensorboard-name',
        required=False,
        type=str,
        help=name_help,
    )
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def add_shared_cluster_create_capacity_arguments(args_parsers):
  """Registers the capacity flags on each given argument group.

  Every group receives --on-demand, --reservation and --spot, in that order.

  Args:
    args_parsers: iterable of capacity argument parsers/groups of the cluster
      create subcommands.
  """
  # (flag, add_argument keyword options), listed in registration order.
  capacity_flags = (
      (
          '--on-demand',
          {
              'action': 'store_true',
              'help': (
                  'Sets node pool creation to use on-demand resources. '
                  ' See `--reservation` or `--spot` for other capacity types.'
              ),
          },
      ),
      (
          '--reservation',
          {
              'type': str,
              'help': (
                  'The reservation to be used for acquiring resources in the'
                  ' cluster. This will attempt to find the provided'
                  ' reservation. See `--spot` or `--on-demand` for other'
                  ' capacity types.'
              ),
          },
      ),
      (
          '--spot',
          {
              'action': 'store_true',
              'help': (
                  'Sets node pool creation to use spot resources.'
                  ' See `--reservation` or `--on-demand` for other capacity'
                  ' types.'
              ),
          },
      ),
  )
  for capacity_group in args_parsers:
    for flag, options in capacity_flags:
      capacity_group.add_argument(flag, **options)
|