xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +2 -3
- xpk/commands/cluster.py +225 -73
- xpk/commands/common.py +33 -1
- xpk/commands/kjob_common.py +10 -1
- xpk/commands/run.py +2 -3
- xpk/commands/storage.py +14 -3
- xpk/commands/workload.py +17 -15
- xpk/core/blueprint/blueprint_generator.py +18 -18
- xpk/core/cluster.py +119 -8
- xpk/core/config.py +1 -1
- xpk/core/filestore.py +2 -6
- xpk/core/gcsfuse.py +22 -4
- xpk/core/kjob.py +20 -13
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/network.py +23 -1
- xpk/core/pathways.py +1 -1
- xpk/core/resources.py +21 -0
- xpk/core/workload.py +1 -1
- xpk/core/workload_decorators/rdma_decorator.py +6 -10
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +15 -14
- xpk/parser/cluster.py +573 -389
- xpk/parser/storage.py +11 -2
- xpk/utils/kubectl.py +4 -1
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/METADATA +134 -91
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/RECORD +31 -29
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/parser/cluster.py
CHANGED
|
@@ -14,7 +14,10 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from argparse import ArgumentParser
|
|
18
|
+
|
|
17
19
|
from ..commands.cluster import (
|
|
20
|
+
cluster_adapt,
|
|
18
21
|
cluster_cacheimage,
|
|
19
22
|
cluster_create,
|
|
20
23
|
cluster_create_pathways,
|
|
@@ -23,14 +26,14 @@ from ..commands.cluster import (
|
|
|
23
26
|
cluster_describe,
|
|
24
27
|
cluster_list,
|
|
25
28
|
)
|
|
29
|
+
from ..commands.config import xpk_cfg
|
|
30
|
+
from ..core.config import CFG_BUCKET_KEY
|
|
26
31
|
from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
|
|
27
32
|
from .common import add_shared_arguments
|
|
28
33
|
from .validators import name_type
|
|
29
|
-
from ..commands.config import xpk_cfg
|
|
30
|
-
from ..core.config import CFG_BUCKET_KEY
|
|
31
34
|
|
|
32
35
|
|
|
33
|
-
def set_cluster_parser(cluster_parser):
|
|
36
|
+
def set_cluster_parser(cluster_parser: ArgumentParser):
|
|
34
37
|
cluster_subcommands = cluster_parser.add_subparsers(
|
|
35
38
|
title='cluster subcommands',
|
|
36
39
|
dest='xpk_cluster_subcommands',
|
|
@@ -40,28 +43,54 @@ def set_cluster_parser(cluster_parser):
|
|
|
40
43
|
),
|
|
41
44
|
)
|
|
42
45
|
|
|
43
|
-
### "cluster create" command parser ###
|
|
44
46
|
cluster_create_parser = cluster_subcommands.add_parser(
|
|
45
47
|
'create', help='Create cloud clusters.'
|
|
46
48
|
)
|
|
47
|
-
|
|
48
|
-
'
|
|
49
|
-
'
|
|
49
|
+
cluster_create_pathways_parser = cluster_subcommands.add_parser(
|
|
50
|
+
'create-pathways',
|
|
51
|
+
help='Create Pathways-on-Cloud clusters.',
|
|
50
52
|
)
|
|
51
|
-
|
|
52
|
-
'
|
|
53
|
+
cluster_create_ray_cluster_parser = cluster_subcommands.add_parser(
|
|
54
|
+
'create-ray',
|
|
55
|
+
help='Create RayCluster',
|
|
53
56
|
)
|
|
54
|
-
|
|
55
|
-
'
|
|
57
|
+
cluster_delete_parser = cluster_subcommands.add_parser(
|
|
58
|
+
'delete',
|
|
59
|
+
help='Delete cloud clusters.',
|
|
56
60
|
)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
61
|
+
cluster_cacheimage_parser = cluster_subcommands.add_parser(
|
|
62
|
+
'cacheimage',
|
|
63
|
+
help='Cache image.',
|
|
64
|
+
)
|
|
65
|
+
cluster_describe_parser = cluster_subcommands.add_parser(
|
|
66
|
+
'describe',
|
|
67
|
+
help='Describe a cluster.',
|
|
68
|
+
)
|
|
69
|
+
cluster_list_parser = cluster_subcommands.add_parser(
|
|
70
|
+
'list', help='List cloud clusters.'
|
|
62
71
|
)
|
|
72
|
+
cluster_adapt_parser = cluster_subcommands.add_parser(
|
|
73
|
+
'adapt', help='Adapt an existing cluster for XPK.'
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
set_cluster_create_parser(cluster_create_parser)
|
|
77
|
+
set_cluster_create_pathways_parser(cluster_create_pathways_parser)
|
|
78
|
+
set_cluster_create_ray_parser(cluster_create_ray_cluster_parser)
|
|
79
|
+
set_cluster_delete_parser(cluster_delete_parser)
|
|
80
|
+
set_cluster_cacheimage_parser(cluster_cacheimage_parser)
|
|
81
|
+
set_cluster_describe_parser(cluster_describe_parser)
|
|
82
|
+
set_cluster_list_parser(cluster_list_parser)
|
|
83
|
+
set_cluster_adapt_parser(cluster_adapt_parser)
|
|
84
|
+
|
|
63
85
|
|
|
86
|
+
def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
64
87
|
### Required arguments specific to "cluster create"
|
|
88
|
+
cluster_create_required_arguments = cluster_create_parser.add_argument_group(
|
|
89
|
+
'Required Arguments', 'Arguments required for cluster create.'
|
|
90
|
+
)
|
|
91
|
+
add_shared_cluster_create_required_arguments(
|
|
92
|
+
cluster_create_required_arguments
|
|
93
|
+
)
|
|
65
94
|
|
|
66
95
|
cluster_device_group = (
|
|
67
96
|
cluster_create_required_arguments.add_mutually_exclusive_group(
|
|
@@ -85,6 +114,12 @@ def set_cluster_parser(cluster_parser):
|
|
|
85
114
|
)
|
|
86
115
|
|
|
87
116
|
### Optional arguments specific to "cluster create"
|
|
117
|
+
cluster_create_optional_arguments = cluster_create_parser.add_argument_group(
|
|
118
|
+
'Optional Arguments', 'Arguments optional for cluster create.'
|
|
119
|
+
)
|
|
120
|
+
add_shared_cluster_create_optional_arguments(
|
|
121
|
+
cluster_create_optional_arguments
|
|
122
|
+
)
|
|
88
123
|
cluster_create_optional_arguments.add_argument(
|
|
89
124
|
'--cluster-state-gcs-bucket',
|
|
90
125
|
type=str,
|
|
@@ -108,111 +143,114 @@ def set_cluster_parser(cluster_parser):
|
|
|
108
143
|
),
|
|
109
144
|
)
|
|
110
145
|
|
|
111
|
-
|
|
112
|
-
|
|
146
|
+
autoprovisioning_arguments = cluster_create_parser.add_argument_group(
|
|
147
|
+
'Autoprovisioning Arguments',
|
|
148
|
+
'Optional arguments for enabling autoprovisioning.',
|
|
149
|
+
)
|
|
150
|
+
add_autoprovisioning_arguments(autoprovisioning_arguments)
|
|
151
|
+
|
|
152
|
+
### Capacity arguments specific to "cluster create"
|
|
153
|
+
cluster_create_capacity_arguments = cluster_create_parser.add_argument_group(
|
|
154
|
+
'Capacity Arguments', 'Arguments related to capacity for cluster create.'
|
|
155
|
+
)
|
|
156
|
+
add_shared_cluster_create_capacity_arguments(
|
|
157
|
+
cluster_create_capacity_arguments
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
### Tensorboard arguments specific to "cluster create"
|
|
161
|
+
cluster_create_tensorboard_arguments = (
|
|
113
162
|
cluster_create_parser.add_argument_group(
|
|
114
|
-
'Optional
|
|
115
|
-
'Arguments
|
|
163
|
+
'Optional Vertex AI Tensorboard Arguments',
|
|
164
|
+
'Arguments for creating Vertex AI Tensorboard in cluster create.',
|
|
116
165
|
)
|
|
117
166
|
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
action='store_true',
|
|
121
|
-
help=(
|
|
122
|
-
'Enable GKE features for autoprovisioning node pools in GKE clusters.'
|
|
123
|
-
),
|
|
167
|
+
add_shared_cluster_create_tensorboard_arguments(
|
|
168
|
+
cluster_create_tensorboard_arguments
|
|
124
169
|
)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
' units of chips.By default, autoprovisioning will use the number of'
|
|
131
|
-
' resources in the cluster as the minimum, and maximum.'
|
|
132
|
-
),
|
|
133
|
-
)
|
|
134
|
-
cluster_create_autoprovisioning_arguments.add_argument(
|
|
135
|
-
'--autoprovisioning-max-chips',
|
|
136
|
-
type=int,
|
|
137
|
-
help=(
|
|
138
|
-
'Optionally set the maximum autoprovisioning accelerator resources in'
|
|
139
|
-
' units of chips.By default, autoprovisioning will use the number of'
|
|
140
|
-
' resources in the cluster as the minimum, and maximum.'
|
|
141
|
-
),
|
|
170
|
+
|
|
171
|
+
### MTC arguments specific to "cluster create"
|
|
172
|
+
cluster_create_mtc_arguments = cluster_create_parser.add_argument_group(
|
|
173
|
+
'Optional MTC Arguments',
|
|
174
|
+
'Arguments for configuring MTC in cluster create.',
|
|
142
175
|
)
|
|
176
|
+
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
177
|
+
cluster_create_parser.set_defaults(func=cluster_create)
|
|
143
178
|
|
|
144
|
-
### "cluster create-pathways" command parser ###
|
|
145
179
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
180
|
+
def set_cluster_create_pathways_parser(
|
|
181
|
+
cluster_create_pathways_parser: ArgumentParser,
|
|
182
|
+
):
|
|
183
|
+
### Required arguments specific to "cluster create-pathways"
|
|
150
184
|
cluster_create_pathways_required_arguments = (
|
|
151
185
|
cluster_create_pathways_parser.add_argument_group(
|
|
152
186
|
'Required Arguments',
|
|
153
187
|
'Arguments required for cluster create-pathways.',
|
|
154
188
|
)
|
|
155
189
|
)
|
|
190
|
+
add_shared_cluster_create_required_arguments(
|
|
191
|
+
cluster_create_pathways_required_arguments
|
|
192
|
+
)
|
|
193
|
+
cluster_create_pathways_required_arguments.add_argument(
|
|
194
|
+
'--tpu-type',
|
|
195
|
+
type=str,
|
|
196
|
+
default=None,
|
|
197
|
+
help='The tpu type to use, v5litepod-16, etc.',
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
### Optional arguments specific to "cluster create-pathways"
|
|
156
201
|
cluster_create_pathways_optional_arguments = (
|
|
157
202
|
cluster_create_pathways_parser.add_argument_group(
|
|
158
203
|
'Optional Arguments',
|
|
159
204
|
'Arguments optional for cluster create-pathways.',
|
|
160
205
|
)
|
|
161
206
|
)
|
|
207
|
+
add_shared_cluster_create_optional_arguments(
|
|
208
|
+
cluster_create_pathways_optional_arguments
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
### Capacity arguments specific to "cluster create-pathways"
|
|
162
212
|
cluster_create_pathways_capacity_arguments = (
|
|
163
213
|
cluster_create_pathways_parser.add_argument_group(
|
|
164
214
|
'Capacity Arguments',
|
|
165
215
|
'Arguments related to capacity for cluster create-pathways.',
|
|
166
216
|
)
|
|
167
217
|
)
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
'Optional Vertex AI Tensorboard Arguments',
|
|
171
|
-
'Arguments for creating Vertex AI Tensorboard in cluster create.',
|
|
172
|
-
)
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
### Pathways required arguments specific to "cluster create"
|
|
176
|
-
cluster_create_pathways_required_arguments.add_argument(
|
|
177
|
-
'--tpu-type',
|
|
178
|
-
type=str,
|
|
179
|
-
default=None,
|
|
180
|
-
help='The tpu type to use, v5litepod-16, etc.',
|
|
218
|
+
add_shared_cluster_create_capacity_arguments(
|
|
219
|
+
cluster_create_pathways_capacity_arguments
|
|
181
220
|
)
|
|
182
221
|
|
|
183
|
-
### "cluster create-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
'
|
|
187
|
-
|
|
222
|
+
### Tensorboard arguments specific to "cluster create-pathways"
|
|
223
|
+
cluster_create_pathways_tensorboard_arguments = cluster_create_pathways_parser.add_argument_group(
|
|
224
|
+
'Optional Vertex AI Tensorboard Arguments',
|
|
225
|
+
'Arguments for creating Vertex AI Tensorboard in cluster'
|
|
226
|
+
' create-pathways.',
|
|
188
227
|
)
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
'Required Arguments',
|
|
192
|
-
'Arguments required for cluster create-ray.',
|
|
193
|
-
)
|
|
228
|
+
add_shared_cluster_create_tensorboard_arguments(
|
|
229
|
+
cluster_create_pathways_tensorboard_arguments
|
|
194
230
|
)
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
231
|
+
|
|
232
|
+
### MTC arguments specific to "cluster create"
|
|
233
|
+
cluster_create_mtc_arguments = (
|
|
234
|
+
cluster_create_pathways_parser.add_argument_group(
|
|
235
|
+
'Optional MTC Arguments',
|
|
236
|
+
'Arguments for configuring MTC in cluster create.',
|
|
199
237
|
)
|
|
200
238
|
)
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
239
|
+
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
240
|
+
cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
244
|
+
### Required arguments specific to "cluster create-ray"
|
|
245
|
+
cluster_create_ray_required_arguments = (
|
|
246
|
+
cluster_create_ray_parser.add_argument_group(
|
|
247
|
+
'Required Arguments', 'Arguments required for cluster create-ray.'
|
|
205
248
|
)
|
|
206
249
|
)
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
'Optional Vertex AI Tensorboard Arguments',
|
|
210
|
-
'Arguments for creating Vertex AI Tensorboard in cluster create.',
|
|
211
|
-
)
|
|
250
|
+
add_shared_cluster_create_required_arguments(
|
|
251
|
+
cluster_create_ray_required_arguments
|
|
212
252
|
)
|
|
213
|
-
|
|
214
|
-
### RayCluster required arguments specific to "cluster create"
|
|
215
|
-
cluster_create_ray_cluster_required_arguments.add_argument(
|
|
253
|
+
cluster_create_ray_required_arguments.add_argument(
|
|
216
254
|
'--tpu-type',
|
|
217
255
|
type=str,
|
|
218
256
|
default=None,
|
|
@@ -220,14 +258,24 @@ def set_cluster_parser(cluster_parser):
|
|
|
220
258
|
required=True,
|
|
221
259
|
)
|
|
222
260
|
# TODO(bzmarke): Add --device-type to support GPU/CPU
|
|
223
|
-
|
|
261
|
+
cluster_create_ray_required_arguments.add_argument(
|
|
224
262
|
'--ray-version',
|
|
225
263
|
type=str,
|
|
226
264
|
default=None,
|
|
227
265
|
help="The Ray version to use, e.g. '2.38.0'",
|
|
228
266
|
required=True,
|
|
229
267
|
)
|
|
230
|
-
|
|
268
|
+
|
|
269
|
+
### Optional arguments specific to "cluster create-ray"
|
|
270
|
+
cluster_create_ray_optional_arguments = (
|
|
271
|
+
cluster_create_ray_parser.add_argument_group(
|
|
272
|
+
'Optional Arguments', 'Arguments optional for cluster create-ray.'
|
|
273
|
+
)
|
|
274
|
+
)
|
|
275
|
+
add_shared_cluster_create_optional_arguments(
|
|
276
|
+
cluster_create_ray_optional_arguments
|
|
277
|
+
)
|
|
278
|
+
cluster_create_ray_optional_arguments.add_argument(
|
|
231
279
|
'--enable-pathways',
|
|
232
280
|
action='store_true',
|
|
233
281
|
help=(
|
|
@@ -236,38 +284,38 @@ def set_cluster_parser(cluster_parser):
|
|
|
236
284
|
),
|
|
237
285
|
)
|
|
238
286
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
add_shared_cluster_create_capacity_arguments([
|
|
250
|
-
cluster_create_capacity_arguments,
|
|
251
|
-
cluster_create_pathways_capacity_arguments,
|
|
252
|
-
cluster_create_ray_cluster_capacity_arguments,
|
|
253
|
-
])
|
|
254
|
-
add_shared_cluster_create_tensorboard_arguments([
|
|
255
|
-
cluster_create_tensorboard_arguments,
|
|
256
|
-
cluster_create_pathways_tensorboard_arguments,
|
|
257
|
-
cluster_create_ray_cluster_tensorboard_arguments,
|
|
258
|
-
])
|
|
287
|
+
### Capacity arguments specific to "cluster create-ray"
|
|
288
|
+
cluster_create_ray_capacity_arguments = (
|
|
289
|
+
cluster_create_ray_parser.add_argument_group(
|
|
290
|
+
'Capacity Arguments',
|
|
291
|
+
'Arguments related to capacity for cluster create-ray.',
|
|
292
|
+
)
|
|
293
|
+
)
|
|
294
|
+
add_shared_cluster_create_capacity_arguments(
|
|
295
|
+
cluster_create_ray_capacity_arguments
|
|
296
|
+
)
|
|
259
297
|
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
298
|
+
### Tensorboard arguments specific to "cluster create-ray"
|
|
299
|
+
cluster_create_ray_tensorboard_arguments = (
|
|
300
|
+
cluster_create_ray_parser.add_argument_group(
|
|
301
|
+
'Optional Vertex AI Tensorboard Arguments',
|
|
302
|
+
'Arguments for creating Vertex AI Tensorboard in cluster create-ray.',
|
|
303
|
+
)
|
|
304
|
+
)
|
|
305
|
+
add_shared_cluster_create_tensorboard_arguments(
|
|
306
|
+
cluster_create_ray_tensorboard_arguments
|
|
264
307
|
)
|
|
265
308
|
|
|
266
|
-
### "cluster
|
|
267
|
-
|
|
268
|
-
'
|
|
269
|
-
|
|
309
|
+
### MTC arguments specific to "cluster create"
|
|
310
|
+
cluster_create_mtc_arguments = cluster_create_ray_parser.add_argument_group(
|
|
311
|
+
'Optional MTC Arguments',
|
|
312
|
+
'Arguments for configuring MTC in cluster create.',
|
|
270
313
|
)
|
|
314
|
+
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
315
|
+
cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
|
|
271
319
|
cluster_delete_required_arguments = cluster_delete_parser.add_argument_group(
|
|
272
320
|
'Required Arguments',
|
|
273
321
|
'Arguments required for cluster delete.',
|
|
@@ -294,31 +342,25 @@ def set_cluster_parser(cluster_parser):
|
|
|
294
342
|
required=False,
|
|
295
343
|
)
|
|
296
344
|
add_shared_arguments(cluster_delete_optional_arguments)
|
|
297
|
-
|
|
298
|
-
cluster_delete_parser.add_argument(
|
|
345
|
+
cluster_delete_optional_arguments.add_argument(
|
|
299
346
|
'--force',
|
|
300
347
|
action='store_true',
|
|
301
348
|
help=(
|
|
302
|
-
'Forces
|
|
349
|
+
'Forces cluster deletion command to run without additional approval.'
|
|
303
350
|
),
|
|
304
351
|
)
|
|
305
352
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
)
|
|
353
|
+
cluster_delete_parser.set_defaults(func=cluster_delete)
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
|
|
311
357
|
cluster_cacheimage_required_arguments = (
|
|
312
358
|
cluster_cacheimage_parser.add_argument_group(
|
|
313
359
|
'Required Arguments',
|
|
314
360
|
'Arguments required for cluster cacheimage.',
|
|
315
361
|
)
|
|
316
362
|
)
|
|
317
|
-
|
|
318
|
-
cluster_cacheimage_parser.add_argument_group(
|
|
319
|
-
'Optional Arguments', 'Arguments optional for cluster cacheimage.'
|
|
320
|
-
)
|
|
321
|
-
)
|
|
363
|
+
|
|
322
364
|
cluster_cacheimage_group = (
|
|
323
365
|
cluster_cacheimage_parser.add_mutually_exclusive_group(required=True)
|
|
324
366
|
)
|
|
@@ -357,6 +399,11 @@ def set_cluster_parser(cluster_parser):
|
|
|
357
399
|
)
|
|
358
400
|
|
|
359
401
|
### Optional Arguments
|
|
402
|
+
cluster_cacheimage_optional_arguments = (
|
|
403
|
+
cluster_cacheimage_parser.add_argument_group(
|
|
404
|
+
'Optional Arguments', 'Arguments optional for cluster cacheimage.'
|
|
405
|
+
)
|
|
406
|
+
)
|
|
360
407
|
add_shared_arguments(cluster_cacheimage_optional_arguments)
|
|
361
408
|
cluster_cacheimage_optional_arguments.add_argument(
|
|
362
409
|
'--cache-key',
|
|
@@ -365,26 +412,18 @@ def set_cluster_parser(cluster_parser):
|
|
|
365
412
|
help='The key to cache the docker image under.',
|
|
366
413
|
required=False,
|
|
367
414
|
)
|
|
415
|
+
|
|
368
416
|
cluster_cacheimage_parser.set_defaults(func=cluster_cacheimage)
|
|
369
417
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
help='Describe a cluster.',
|
|
374
|
-
)
|
|
418
|
+
|
|
419
|
+
def set_cluster_describe_parser(cluster_describe_parser: ArgumentParser):
|
|
420
|
+
### Required arguments
|
|
375
421
|
cluster_describe_required_arguments = (
|
|
376
422
|
cluster_describe_parser.add_argument_group(
|
|
377
423
|
'Required Arguments',
|
|
378
424
|
'Arguments required for cluster describe.',
|
|
379
425
|
)
|
|
380
426
|
)
|
|
381
|
-
cluster_describe_optional_arguments = (
|
|
382
|
-
cluster_describe_parser.add_argument_group(
|
|
383
|
-
'Optional Arguments', 'Arguments optional for cluster describe.'
|
|
384
|
-
)
|
|
385
|
-
)
|
|
386
|
-
|
|
387
|
-
### Required arguments
|
|
388
427
|
cluster_describe_required_arguments.add_argument(
|
|
389
428
|
'--cluster',
|
|
390
429
|
type=name_type,
|
|
@@ -392,280 +431,425 @@ def set_cluster_parser(cluster_parser):
|
|
|
392
431
|
help='The name of the cluster to be describe.',
|
|
393
432
|
required=True,
|
|
394
433
|
)
|
|
434
|
+
|
|
395
435
|
### Optional Arguments
|
|
436
|
+
cluster_describe_optional_arguments = (
|
|
437
|
+
cluster_describe_parser.add_argument_group(
|
|
438
|
+
'Optional Arguments', 'Arguments optional for cluster describe.'
|
|
439
|
+
)
|
|
440
|
+
)
|
|
396
441
|
add_shared_arguments(cluster_describe_optional_arguments)
|
|
397
442
|
|
|
398
443
|
cluster_describe_parser.set_defaults(func=cluster_describe)
|
|
399
444
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
)
|
|
445
|
+
|
|
446
|
+
def set_cluster_list_parser(cluster_list_parser: ArgumentParser):
|
|
447
|
+
### Optional Arguments
|
|
404
448
|
cluster_list_optional_arguments = cluster_list_parser.add_argument_group(
|
|
405
449
|
'Optional Arguments', 'Arguments optional for cluster list.'
|
|
406
450
|
)
|
|
407
|
-
### Optional Arguments
|
|
408
451
|
add_shared_arguments(cluster_list_optional_arguments)
|
|
409
452
|
|
|
410
453
|
cluster_list_parser.set_defaults(func=cluster_list)
|
|
411
454
|
|
|
412
455
|
|
|
413
|
-
def
|
|
456
|
+
def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
|
|
457
|
+
cluster_adapt_required_arguments = cluster_adapt_parser.add_argument_group(
|
|
458
|
+
'Required Arguments',
|
|
459
|
+
'Arguments required for cluster adapt.',
|
|
460
|
+
)
|
|
461
|
+
add_shared_cluster_create_required_arguments(cluster_adapt_required_arguments)
|
|
462
|
+
|
|
463
|
+
cluster_adapt_device_group = (
|
|
464
|
+
cluster_adapt_required_arguments.add_mutually_exclusive_group(
|
|
465
|
+
required=True
|
|
466
|
+
)
|
|
467
|
+
)
|
|
468
|
+
cluster_adapt_device_group.add_argument(
|
|
469
|
+
'--tpu-type',
|
|
470
|
+
type=str,
|
|
471
|
+
default=None,
|
|
472
|
+
help='The tpu type used on cluster, v5litepod-16, etc.',
|
|
473
|
+
)
|
|
474
|
+
cluster_adapt_device_group.add_argument(
|
|
475
|
+
'--device-type',
|
|
476
|
+
type=str,
|
|
477
|
+
default=None,
|
|
478
|
+
help=(
|
|
479
|
+
'The device type used on cluster (can be tpu or gpu or cpu), eg.'
|
|
480
|
+
' h100-80gb-8, n2-standard-32-4 etc.'
|
|
481
|
+
),
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
cluster_adapt_optional_arguments = cluster_adapt_parser.add_argument_group(
|
|
485
|
+
'Optional Arguments',
|
|
486
|
+
'Arguments optional for cluster adapt.',
|
|
487
|
+
)
|
|
488
|
+
cluster_adapt_optional_arguments.add_argument(
|
|
489
|
+
'--num-nodes',
|
|
490
|
+
type=int,
|
|
491
|
+
help='The number of nodes of a cluster.',
|
|
492
|
+
)
|
|
493
|
+
cluster_adapt_optional_arguments.add_argument(
|
|
494
|
+
'--enable-workload-identity',
|
|
495
|
+
action='store_true',
|
|
496
|
+
help='Enable Workload Identity Federation on the cluster and node-pools.',
|
|
497
|
+
)
|
|
498
|
+
cluster_adapt_optional_arguments.add_argument(
|
|
499
|
+
'--num-slices',
|
|
500
|
+
type=int,
|
|
501
|
+
default=1,
|
|
502
|
+
help='The number of slices to run the job on, defaults to 1.',
|
|
503
|
+
required=False,
|
|
504
|
+
)
|
|
505
|
+
add_driver_arguments(cluster_adapt_optional_arguments)
|
|
506
|
+
add_shared_arguments(cluster_adapt_optional_arguments)
|
|
507
|
+
|
|
508
|
+
cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
|
|
509
|
+
'Capacity Arguments', 'Arguments related to capacity for cluster create.'
|
|
510
|
+
)
|
|
511
|
+
add_shared_cluster_create_capacity_arguments(cluster_adapt_capacity_arguments)
|
|
512
|
+
|
|
513
|
+
cluster_adapt_autoprovisioning_arguments = (
|
|
514
|
+
cluster_adapt_parser.add_argument_group(
|
|
515
|
+
'Autoprovisioning Arguments',
|
|
516
|
+
'Optional arguments for enabling autoprovisioning.',
|
|
517
|
+
)
|
|
518
|
+
)
|
|
519
|
+
add_autoprovisioning_arguments(cluster_adapt_autoprovisioning_arguments)
|
|
520
|
+
|
|
521
|
+
cluster_adapt_tensorboard_arguments = cluster_adapt_parser.add_argument_group(
|
|
522
|
+
'Optional Vertex AI Tensorboard Arguments',
|
|
523
|
+
'Arguments for creating Vertex AI Tensorboard in cluster adapt.',
|
|
524
|
+
)
|
|
525
|
+
add_shared_cluster_create_tensorboard_arguments(
|
|
526
|
+
cluster_adapt_tensorboard_arguments
|
|
527
|
+
)
|
|
528
|
+
|
|
529
|
+
cluster_adapt_parser.set_defaults(func=cluster_adapt)
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def add_autoprovisioning_arguments(parser: ArgumentParser):
|
|
533
|
+
parser.add_argument(
|
|
534
|
+
'--enable-autoprovisioning',
|
|
535
|
+
action='store_true',
|
|
536
|
+
help=(
|
|
537
|
+
'Enable GKE features for autoprovisioning node pools in GKE clusters.'
|
|
538
|
+
),
|
|
539
|
+
)
|
|
540
|
+
parser.add_argument(
|
|
541
|
+
'--autoprovisioning-min-chips',
|
|
542
|
+
type=int,
|
|
543
|
+
help=(
|
|
544
|
+
'Optionally set the minimum autoprovisioning accelerator resources in'
|
|
545
|
+
' units of chips.By default, autoprovisioning will use the number of'
|
|
546
|
+
' resources in the cluster as the minimum, and maximum.'
|
|
547
|
+
),
|
|
548
|
+
)
|
|
549
|
+
parser.add_argument(
|
|
550
|
+
'--autoprovisioning-max-chips',
|
|
551
|
+
type=int,
|
|
552
|
+
help=(
|
|
553
|
+
'Optionally set the maximum autoprovisioning accelerator resources in'
|
|
554
|
+
' units of chips.By default, autoprovisioning will use the number of'
|
|
555
|
+
' resources in the cluster as the minimum, and maximum.'
|
|
556
|
+
),
|
|
557
|
+
)
|
|
558
|
+
|
|
559
|
+
|
|
560
|
+
def add_shared_cluster_create_required_arguments(parser: ArgumentParser):
|
|
414
561
|
"""Add shared required arguments in cluster create and Pathways cluster create.
|
|
415
562
|
|
|
416
563
|
Args:
|
|
417
|
-
|
|
564
|
+
parser: cluster create argument parser or argument group
|
|
418
565
|
"""
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
def add_shared_cluster_create_optional_arguments(args_parsers):
|
|
566
|
+
parser.add_argument(
|
|
567
|
+
'--cluster',
|
|
568
|
+
type=name_type,
|
|
569
|
+
default=None,
|
|
570
|
+
help=(
|
|
571
|
+
'The name of the cluster. Will be used as the prefix for internal'
|
|
572
|
+
' objects in the cluster.'
|
|
573
|
+
),
|
|
574
|
+
required=True,
|
|
575
|
+
)
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
433
579
|
"""Add shared optional arguments in cluster create and Pathways cluster create.
|
|
434
580
|
|
|
435
581
|
Args:
|
|
436
|
-
|
|
582
|
+
parser: cluster create argument parser or argument group
|
|
437
583
|
"""
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
def add_shared_cluster_create_tensorboard_arguments(args_parsers):
|
|
584
|
+
add_shared_arguments(parser)
|
|
585
|
+
parser.add_argument(
|
|
586
|
+
'--host-maintenance-interval',
|
|
587
|
+
type=str,
|
|
588
|
+
choices=['AS_NEEDED', 'PERIODIC'],
|
|
589
|
+
default='AS_NEEDED',
|
|
590
|
+
help='The maintenance policy of the cluster and respective clusters.',
|
|
591
|
+
)
|
|
592
|
+
parser.add_argument(
|
|
593
|
+
'--gke-version',
|
|
594
|
+
type=str,
|
|
595
|
+
help=(
|
|
596
|
+
'The GKE version of the cluster and respective clusters. The'
|
|
597
|
+
' default is determined dynamically based on RAPID channel'
|
|
598
|
+
' recommended version.'
|
|
599
|
+
),
|
|
600
|
+
)
|
|
601
|
+
parser.add_argument(
|
|
602
|
+
'--num-slices',
|
|
603
|
+
type=int,
|
|
604
|
+
default=1,
|
|
605
|
+
help='The number of slices to run the job on, defaults to 1.',
|
|
606
|
+
required=False,
|
|
607
|
+
)
|
|
608
|
+
parser.add_argument(
|
|
609
|
+
'--pathways-gce-machine-type',
|
|
610
|
+
type=str,
|
|
611
|
+
default='n2-standard-64',
|
|
612
|
+
help='The CPU type for Pathways CPU nodepools',
|
|
613
|
+
)
|
|
614
|
+
parser.add_argument(
|
|
615
|
+
'--default-pool-cpu-machine-type',
|
|
616
|
+
type=str,
|
|
617
|
+
default='e2-standard-16',
|
|
618
|
+
help=(
|
|
619
|
+
'Set the machine type within the default cpu node pool. For'
|
|
620
|
+
' regional clusters, all zones must support the machine type.'
|
|
621
|
+
),
|
|
622
|
+
)
|
|
623
|
+
parser.add_argument(
|
|
624
|
+
'--cluster-cpu-machine-type',
|
|
625
|
+
type=str,
|
|
626
|
+
default='',
|
|
627
|
+
help=(
|
|
628
|
+
'Getting deprecated soon! Please use'
|
|
629
|
+
' --default-pool-cpu-machine-typeinstead, to denote the machine'
|
|
630
|
+
' type of the default cpu node pool. Set the machine type of other'
|
|
631
|
+
' cpu nodepools using --device-type.'
|
|
632
|
+
),
|
|
633
|
+
)
|
|
634
|
+
parser.add_argument(
|
|
635
|
+
'--default-pool-cpu-num-nodes',
|
|
636
|
+
type=int,
|
|
637
|
+
default=6,
|
|
638
|
+
help=(
|
|
639
|
+
'Set the number of nodes within the default cpu node pool. This is'
|
|
640
|
+
' set to 6 by default. Autoscaling is enabled to scale this value'
|
|
641
|
+
' over time.'
|
|
642
|
+
),
|
|
643
|
+
)
|
|
644
|
+
parser.add_argument(
|
|
645
|
+
'--custom-cluster-arguments',
|
|
646
|
+
type=str,
|
|
647
|
+
default='',
|
|
648
|
+
help=(
|
|
649
|
+
'Users can add their own arguments to customize their cluster'
|
|
650
|
+
' create command. Do note, these will not override already used'
|
|
651
|
+
' cluster creation arguments. e.g.'
|
|
652
|
+
" --custom-cluster-arguments='--network=mtu9k --subnetwork=mtu9k'"
|
|
653
|
+
),
|
|
654
|
+
)
|
|
655
|
+
parser.add_argument(
|
|
656
|
+
'--custom-nodepool-arguments',
|
|
657
|
+
type=str,
|
|
658
|
+
default='',
|
|
659
|
+
help=(
|
|
660
|
+
'Users can add their own arguments to customize their node pool '
|
|
661
|
+
' create command. Do note, these will not override already used'
|
|
662
|
+
' node pool creation arguments. e.g.'
|
|
663
|
+
' --custom-nodepool-arguments="--disk-size=300"'
|
|
664
|
+
),
|
|
665
|
+
)
|
|
666
|
+
parser.add_argument(
|
|
667
|
+
'--force',
|
|
668
|
+
action='store_true',
|
|
669
|
+
help=(
|
|
670
|
+
'Forces node pool creation and delete commands to run without'
|
|
671
|
+
' additional approval.'
|
|
672
|
+
),
|
|
673
|
+
)
|
|
674
|
+
parser.add_argument(
|
|
675
|
+
'--custom-tpu-nodepool-arguments',
|
|
676
|
+
type=str,
|
|
677
|
+
default='',
|
|
678
|
+
help=(
|
|
679
|
+
'DEPRECATING SOON! Please use --custom-nodepool-arguments to'
|
|
680
|
+
' customize node pool create command. Do note, these will not'
|
|
681
|
+
' override already used node pool creation arguments. Example usage'
|
|
682
|
+
' --custom-tpu-nodepool-arguments="--enable-ip-alias"'
|
|
683
|
+
),
|
|
684
|
+
)
|
|
685
|
+
parser.add_argument(
|
|
686
|
+
'--private',
|
|
687
|
+
action='store_true',
|
|
688
|
+
help=(
|
|
689
|
+
'Creates a private GKE cluster, a VPC-native cluster in which Nodes'
|
|
690
|
+
' and Pods are isolated from the internet. If set,'
|
|
691
|
+
' master_authorized_networks will also be enabled and access to the'
|
|
692
|
+
" cluster's control plane will be restricted only to current"
|
|
693
|
+
" machine's IP address unless more IP ranges are authorized by"
|
|
694
|
+
' providing --authorized-networks. This works only on creating new'
|
|
695
|
+
' clusters.'
|
|
696
|
+
),
|
|
697
|
+
)
|
|
698
|
+
parser.add_argument(
|
|
699
|
+
'--authorized-networks',
|
|
700
|
+
action='extend',
|
|
701
|
+
nargs='+',
|
|
702
|
+
help=(
|
|
703
|
+
'Sets the provided cidrs as authorized IP ranges to access the'
|
|
704
|
+
" private cluster's control plan. Access to the control plane will"
|
|
705
|
+
" be provided to current machine's IP address even if"
|
|
706
|
+
' --authorized-networks is not set or it does not cover the IP'
|
|
707
|
+
' address. If set, --private is considered true and a private'
|
|
708
|
+
' cluster will be provisioned. It replaces existing authorized'
|
|
709
|
+
' networks if used with an existing private cluster.'
|
|
710
|
+
' Example usage: --authorized-networks 1.2.3.0/24 1.2.4.5/32'
|
|
711
|
+
),
|
|
712
|
+
)
|
|
713
|
+
parser.add_argument(
|
|
714
|
+
'--enable-workload-identity',
|
|
715
|
+
action='store_true',
|
|
716
|
+
help='Enable Workload Identity Federation on the cluster and node-pools.',
|
|
717
|
+
)
|
|
718
|
+
add_driver_arguments(parser)
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
def add_driver_arguments(parser: ArgumentParser):
  """Add the CSI storage driver flags to a cluster argument parser.

  Registers four independent boolean switches (each `store_true`, so each
  defaults to False when omitted):
  --enable-gcsfuse-csi-driver, --enable-gcpfilestore-csi-driver,
  --enable-parallelstore-csi-driver and --enable-pd-csi-driver.

  Args:
    parser: argument parser (or argument group) the flags are added to.
  """
  parser.add_argument(
      '--enable-gcsfuse-csi-driver',
      action='store_true',
      help=(
          # Spelling fixed: the product is GCSFuse (Cloud Storage FUSE),
          # matching the flag name above.
          'Enable GCSFuse driver on the cluster. This enables Workload'
          ' Identity Federation. When using A3 ultra/A3 mega Workload'
          ' Identity is enabled by default.'
      ),
  )
  parser.add_argument(
      '--enable-gcpfilestore-csi-driver',
      action='store_true',
      help='Enable GCPFilestore driver on the cluster.',
  )
  parser.add_argument(
      '--enable-parallelstore-csi-driver',
      action='store_true',
      help='Enable Parallelstore CSI driver on the cluster.',
  )
  parser.add_argument(
      '--enable-pd-csi-driver',
      action='store_true',
      help='Enable PersistentDisk CSI driver on the cluster.',
  )
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
def add_shared_cluster_create_tensorboard_arguments(parser: ArgumentParser):
  """Register the Vertex AI Tensorboard flags used by both cluster create flavours.

  The flags are shared between cluster create and Pathways cluster create, so
  non-Pathways workloads can also use tensorboard arguments on a Pathways
  cluster.

  Args:
    parser: cluster create argument parser or argument group
  """
  # Help texts are assembled up front so the registrations below stay short.
  region_help = (
      'The region to create Vertex Tensorboard instance in. Visit'
      ' https://cloud.google.com/vertex-ai/docs/general/locations#available-regions'
      ' to view regions supported by Vertex AI. By default, Tensorboard'
      ' instance will be created in us-central1.'
  )
  name_help = (
      'The name of Vertex Tensorboard instance to create. If not'
      ' specified, a Tensorboard instance with the name'
      f' <cluster>-{DEFAULT_VERTEX_TENSORBOARD_NAME} will be'
      ' created.'
  )

  # Registration order is kept, since it determines the order in --help.
  parser.add_argument(
      '--create-vertex-tensorboard',
      help='Set this flag to create a Tensorboard instance in Vertex AI.',
      action='store_true',
  )
  parser.add_argument(
      '--tensorboard-region',
      help=region_help,
      default='us-central1',
      type=str,
  )
  parser.add_argument(
      '--tensorboard-name',
      help=name_help,
      required=False,
      type=str,
  )
|
|
783
|
+
|
|
784
|
+
|
|
785
|
+
def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
  """Register the capacity-type flags used by both cluster create flavours.

  Adds --on-demand and --spot (boolean switches) and --reservation (string)
  for cluster create and Pathways cluster create.

  Args:
    parser: cluster create argument parser or argument group
  """
  on_demand_help = (
      'Sets node pool creation to use on-demand resources. '
      ' See `--reservation` or `--spot` for other capacity types.'
  )
  reservation_help = (
      'The reservation to be used for acquiring resources in the'
      ' cluster. This will attempt to find the provided reservation.'
      ' See `--spot` or `--on-demand` for other capacity types.'
  )
  spot_help = (
      'Sets node pool creation to use spot resources.'
      ' See `--reservation` or `--on-demand` for other capacity types.'
  )

  # Registration order is kept, since it determines the order in --help.
  parser.add_argument('--on-demand', help=on_demand_help, action='store_true')
  parser.add_argument('--reservation', help=reservation_help, type=str)
  parser.add_argument('--spot', help=spot_help, action='store_true')
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
def add_shared_cluster_create_mtc_arguments(parser: ArgumentParser):
  """Add shared Multi-tier Checkpointing arguments in cluster create and Pathways cluster create.

  Registers --enable-mtc (boolean switch) plus three optional string flags
  that all default to None: --mtc-ramdisk-size, --mtc-gcs-bucket and
  --mtc-toleration-key. The help text marks the RAM disk size and the GCS
  bucket as required when --enable-mtc is set, but that requirement is not
  enforced by the parser here.

  Args:
    parser: cluster create argument parser or argument group
  """
  parser.add_argument(
      '--enable-mtc',
      action='store_true',
      help='Enable MTC on the cluster.',
  )
  parser.add_argument(
      '--mtc-ramdisk-size',
      type=str,
      default=None,
      help=(
          '(Required if --enable-mtc is true) The size of the RAM disk to be'
          ' used for multi-tier checkpointing. e.g. "64Mi" '
      ),
  )
  parser.add_argument(
      '--mtc-gcs-bucket',
      type=str,
      default=None,
      help=(
          '(Required if --enable-mtc is true) The GCS bucket to be used for'
          ' multi-tier checkpointing.'
      ),
  )
  parser.add_argument(
      '--mtc-toleration-key',
      type=str,
      default=None,
      help=(
          # Wording fixed: the flag configures a toleration key, as its name
          # says; the help previously said "tolerance key".
          '(Optional) The toleration key to be used for multi-tier'
          ' checkpointing. By default, it is set to "google.com/tpu".'
      ),
  )
|