xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -13
- xpk/commands/cluster.py +240 -71
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/common.py +33 -1
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +30 -18
- xpk/commands/run.py +17 -12
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +75 -19
- xpk/commands/workload.py +161 -324
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +335 -45
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +193 -12
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +5 -1
- xpk/core/gcsfuse.py +27 -6
- xpk/core/kjob.py +66 -20
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/nap.py +4 -0
- xpk/core/network.py +34 -22
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/resources.py +21 -0
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +28 -83
- xpk/core/workload_decorators/rdma_decorator.py +11 -15
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
- xpk/parser/cluster.py +574 -381
- xpk/parser/storage.py +25 -5
- xpk/parser/workload.py +59 -31
- xpk/utils/kubectl.py +4 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/parser/cluster.py
CHANGED
|
@@ -14,7 +14,10 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from argparse import ArgumentParser
|
|
18
|
+
|
|
17
19
|
from ..commands.cluster import (
|
|
20
|
+
cluster_adapt,
|
|
18
21
|
cluster_cacheimage,
|
|
19
22
|
cluster_create,
|
|
20
23
|
cluster_create_pathways,
|
|
@@ -23,14 +26,14 @@ from ..commands.cluster import (
|
|
|
23
26
|
cluster_describe,
|
|
24
27
|
cluster_list,
|
|
25
28
|
)
|
|
29
|
+
from ..commands.config import xpk_cfg
|
|
30
|
+
from ..core.config import CFG_BUCKET_KEY
|
|
26
31
|
from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
|
|
27
32
|
from .common import add_shared_arguments
|
|
28
33
|
from .validators import name_type
|
|
29
|
-
from ..commands.config import xpk_cfg
|
|
30
|
-
from ..core.config import CFG_BUCKET_KEY
|
|
31
34
|
|
|
32
35
|
|
|
33
|
-
def set_cluster_parser(cluster_parser):
|
|
36
|
+
def set_cluster_parser(cluster_parser: ArgumentParser):
|
|
34
37
|
cluster_subcommands = cluster_parser.add_subparsers(
|
|
35
38
|
title='cluster subcommands',
|
|
36
39
|
dest='xpk_cluster_subcommands',
|
|
@@ -40,28 +43,54 @@ def set_cluster_parser(cluster_parser):
|
|
|
40
43
|
),
|
|
41
44
|
)
|
|
42
45
|
|
|
43
|
-
### "cluster create" command parser ###
|
|
44
46
|
cluster_create_parser = cluster_subcommands.add_parser(
|
|
45
47
|
'create', help='Create cloud clusters.'
|
|
46
48
|
)
|
|
47
|
-
|
|
48
|
-
'
|
|
49
|
-
'
|
|
49
|
+
cluster_create_pathways_parser = cluster_subcommands.add_parser(
|
|
50
|
+
'create-pathways',
|
|
51
|
+
help='Create Pathways-on-Cloud clusters.',
|
|
50
52
|
)
|
|
51
|
-
|
|
52
|
-
'
|
|
53
|
+
cluster_create_ray_cluster_parser = cluster_subcommands.add_parser(
|
|
54
|
+
'create-ray',
|
|
55
|
+
help='Create RayCluster',
|
|
53
56
|
)
|
|
54
|
-
|
|
55
|
-
'
|
|
57
|
+
cluster_delete_parser = cluster_subcommands.add_parser(
|
|
58
|
+
'delete',
|
|
59
|
+
help='Delete cloud clusters.',
|
|
56
60
|
)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
61
|
+
cluster_cacheimage_parser = cluster_subcommands.add_parser(
|
|
62
|
+
'cacheimage',
|
|
63
|
+
help='Cache image.',
|
|
64
|
+
)
|
|
65
|
+
cluster_describe_parser = cluster_subcommands.add_parser(
|
|
66
|
+
'describe',
|
|
67
|
+
help='Describe a cluster.',
|
|
68
|
+
)
|
|
69
|
+
cluster_list_parser = cluster_subcommands.add_parser(
|
|
70
|
+
'list', help='List cloud clusters.'
|
|
71
|
+
)
|
|
72
|
+
cluster_adapt_parser = cluster_subcommands.add_parser(
|
|
73
|
+
'adapt', help='Adapt an existing cluster for XPK.'
|
|
62
74
|
)
|
|
63
75
|
|
|
76
|
+
set_cluster_create_parser(cluster_create_parser)
|
|
77
|
+
set_cluster_create_pathways_parser(cluster_create_pathways_parser)
|
|
78
|
+
set_cluster_create_ray_parser(cluster_create_ray_cluster_parser)
|
|
79
|
+
set_cluster_delete_parser(cluster_delete_parser)
|
|
80
|
+
set_cluster_cacheimage_parser(cluster_cacheimage_parser)
|
|
81
|
+
set_cluster_describe_parser(cluster_describe_parser)
|
|
82
|
+
set_cluster_list_parser(cluster_list_parser)
|
|
83
|
+
set_cluster_adapt_parser(cluster_adapt_parser)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
64
87
|
### Required arguments specific to "cluster create"
|
|
88
|
+
cluster_create_required_arguments = cluster_create_parser.add_argument_group(
|
|
89
|
+
'Required Arguments', 'Arguments required for cluster create.'
|
|
90
|
+
)
|
|
91
|
+
add_shared_cluster_create_required_arguments(
|
|
92
|
+
cluster_create_required_arguments
|
|
93
|
+
)
|
|
65
94
|
|
|
66
95
|
cluster_device_group = (
|
|
67
96
|
cluster_create_required_arguments.add_mutually_exclusive_group(
|
|
@@ -85,6 +114,12 @@ def set_cluster_parser(cluster_parser):
|
|
|
85
114
|
)
|
|
86
115
|
|
|
87
116
|
### Optional arguments specific to "cluster create"
|
|
117
|
+
cluster_create_optional_arguments = cluster_create_parser.add_argument_group(
|
|
118
|
+
'Optional Arguments', 'Arguments optional for cluster create.'
|
|
119
|
+
)
|
|
120
|
+
add_shared_cluster_create_optional_arguments(
|
|
121
|
+
cluster_create_optional_arguments
|
|
122
|
+
)
|
|
88
123
|
cluster_create_optional_arguments.add_argument(
|
|
89
124
|
'--cluster-state-gcs-bucket',
|
|
90
125
|
type=str,
|
|
@@ -107,111 +142,115 @@ def set_cluster_parser(cluster_parser):
|
|
|
107
142
|
' enable cluster to accept Pathways workloads.'
|
|
108
143
|
),
|
|
109
144
|
)
|
|
110
|
-
|
|
111
|
-
|
|
145
|
+
|
|
146
|
+
autoprovisioning_arguments = cluster_create_parser.add_argument_group(
|
|
147
|
+
'Autoprovisioning Arguments',
|
|
148
|
+
'Optional arguments for enabling autoprovisioning.',
|
|
149
|
+
)
|
|
150
|
+
add_autoprovisioning_arguments(autoprovisioning_arguments)
|
|
151
|
+
|
|
152
|
+
### Capacity arguments specific to "cluster create"
|
|
153
|
+
cluster_create_capacity_arguments = cluster_create_parser.add_argument_group(
|
|
154
|
+
'Capacity Arguments', 'Arguments related to capacity for cluster create.'
|
|
155
|
+
)
|
|
156
|
+
add_shared_cluster_create_capacity_arguments(
|
|
157
|
+
cluster_create_capacity_arguments
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
### Tensorboard arguments specific to "cluster create"
|
|
161
|
+
cluster_create_tensorboard_arguments = (
|
|
112
162
|
cluster_create_parser.add_argument_group(
|
|
113
|
-
'Optional
|
|
114
|
-
'Arguments
|
|
163
|
+
'Optional Vertex AI Tensorboard Arguments',
|
|
164
|
+
'Arguments for creating Vertex AI Tensorboard in cluster create.',
|
|
115
165
|
)
|
|
116
166
|
)
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
action='store_true',
|
|
120
|
-
help=(
|
|
121
|
-
'Enable GKE features for autoprovisioning node pools in GKE clusters.'
|
|
122
|
-
),
|
|
167
|
+
add_shared_cluster_create_tensorboard_arguments(
|
|
168
|
+
cluster_create_tensorboard_arguments
|
|
123
169
|
)
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
' units of chips.By default, autoprovisioning will use the number of'
|
|
130
|
-
' resources in the cluster as the minimum, and maximum.'
|
|
131
|
-
),
|
|
132
|
-
)
|
|
133
|
-
cluster_create_autoprovisioning_arguments.add_argument(
|
|
134
|
-
'--autoprovisioning-max-chips',
|
|
135
|
-
type=int,
|
|
136
|
-
help=(
|
|
137
|
-
'Optionally set the maximum autoprovisioning accelerator resources in'
|
|
138
|
-
' units of chips.By default, autoprovisioning will use the number of'
|
|
139
|
-
' resources in the cluster as the minimum, and maximum.'
|
|
140
|
-
),
|
|
170
|
+
|
|
171
|
+
### MTC arguments specific to "cluster create"
|
|
172
|
+
cluster_create_mtc_arguments = cluster_create_parser.add_argument_group(
|
|
173
|
+
'Optional MTC Arguments',
|
|
174
|
+
'Arguments for configuring MTC in cluster create.',
|
|
141
175
|
)
|
|
176
|
+
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
177
|
+
cluster_create_parser.set_defaults(func=cluster_create)
|
|
142
178
|
|
|
143
|
-
### "cluster create-pathways" command parser ###
|
|
144
179
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
180
|
+
def set_cluster_create_pathways_parser(
|
|
181
|
+
cluster_create_pathways_parser: ArgumentParser,
|
|
182
|
+
):
|
|
183
|
+
### Required arguments specific to "cluster create-pathways"
|
|
149
184
|
cluster_create_pathways_required_arguments = (
|
|
150
185
|
cluster_create_pathways_parser.add_argument_group(
|
|
151
186
|
'Required Arguments',
|
|
152
187
|
'Arguments required for cluster create-pathways.',
|
|
153
188
|
)
|
|
154
189
|
)
|
|
190
|
+
add_shared_cluster_create_required_arguments(
|
|
191
|
+
cluster_create_pathways_required_arguments
|
|
192
|
+
)
|
|
193
|
+
cluster_create_pathways_required_arguments.add_argument(
|
|
194
|
+
'--tpu-type',
|
|
195
|
+
type=str,
|
|
196
|
+
default=None,
|
|
197
|
+
help='The tpu type to use, v5litepod-16, etc.',
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
### Optional arguments specific to "cluster create-pathways"
|
|
155
201
|
cluster_create_pathways_optional_arguments = (
|
|
156
202
|
cluster_create_pathways_parser.add_argument_group(
|
|
157
203
|
'Optional Arguments',
|
|
158
204
|
'Arguments optional for cluster create-pathways.',
|
|
159
205
|
)
|
|
160
206
|
)
|
|
207
|
+
add_shared_cluster_create_optional_arguments(
|
|
208
|
+
cluster_create_pathways_optional_arguments
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
### Capacity arguments specific to "cluster create-pathways"
|
|
161
212
|
cluster_create_pathways_capacity_arguments = (
|
|
162
213
|
cluster_create_pathways_parser.add_argument_group(
|
|
163
214
|
'Capacity Arguments',
|
|
164
215
|
'Arguments related to capacity for cluster create-pathways.',
|
|
165
216
|
)
|
|
166
217
|
)
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
'Optional Vertex AI Tensorboard Arguments',
|
|
170
|
-
'Arguments for creating Vertex AI Tensorboard in cluster create.',
|
|
171
|
-
)
|
|
218
|
+
add_shared_cluster_create_capacity_arguments(
|
|
219
|
+
cluster_create_pathways_capacity_arguments
|
|
172
220
|
)
|
|
173
221
|
|
|
174
|
-
###
|
|
175
|
-
|
|
176
|
-
'
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
help='The tpu type to use, v5litepod-16, etc.',
|
|
222
|
+
### Tensorboard arguments specific to "cluster create-pathways"
|
|
223
|
+
cluster_create_pathways_tensorboard_arguments = cluster_create_pathways_parser.add_argument_group(
|
|
224
|
+
'Optional Vertex AI Tensorboard Arguments',
|
|
225
|
+
'Arguments for creating Vertex AI Tensorboard in cluster'
|
|
226
|
+
' create-pathways.',
|
|
180
227
|
)
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
cluster_create_ray_cluster_parser = cluster_subcommands.add_parser(
|
|
185
|
-
'create-ray',
|
|
186
|
-
help='Create RayCluster',
|
|
187
|
-
)
|
|
188
|
-
cluster_create_ray_cluster_required_arguments = (
|
|
189
|
-
cluster_create_ray_cluster_parser.add_argument_group(
|
|
190
|
-
'Required Arguments',
|
|
191
|
-
'Arguments required for cluster create-ray.',
|
|
192
|
-
)
|
|
228
|
+
add_shared_cluster_create_tensorboard_arguments(
|
|
229
|
+
cluster_create_pathways_tensorboard_arguments
|
|
193
230
|
)
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
231
|
+
|
|
232
|
+
### MTC arguments specific to "cluster create"
|
|
233
|
+
cluster_create_mtc_arguments = (
|
|
234
|
+
cluster_create_pathways_parser.add_argument_group(
|
|
235
|
+
'Optional MTC Arguments',
|
|
236
|
+
'Arguments for configuring MTC in cluster create.',
|
|
198
237
|
)
|
|
199
238
|
)
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
239
|
+
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
240
|
+
cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
244
|
+
### Required arguments specific to "cluster create-ray"
|
|
245
|
+
cluster_create_ray_required_arguments = (
|
|
246
|
+
cluster_create_ray_parser.add_argument_group(
|
|
247
|
+
'Required Arguments', 'Arguments required for cluster create-ray.'
|
|
204
248
|
)
|
|
205
249
|
)
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
'Optional Vertex AI Tensorboard Arguments',
|
|
209
|
-
'Arguments for creating Vertex AI Tensorboard in cluster create.',
|
|
210
|
-
)
|
|
250
|
+
add_shared_cluster_create_required_arguments(
|
|
251
|
+
cluster_create_ray_required_arguments
|
|
211
252
|
)
|
|
212
|
-
|
|
213
|
-
### RayCluster required arguments specific to "cluster create"
|
|
214
|
-
cluster_create_ray_cluster_required_arguments.add_argument(
|
|
253
|
+
cluster_create_ray_required_arguments.add_argument(
|
|
215
254
|
'--tpu-type',
|
|
216
255
|
type=str,
|
|
217
256
|
default=None,
|
|
@@ -219,14 +258,24 @@ def set_cluster_parser(cluster_parser):
|
|
|
219
258
|
required=True,
|
|
220
259
|
)
|
|
221
260
|
# TODO(bzmarke): Add --device-type to support GPU/CPU
|
|
222
|
-
|
|
261
|
+
cluster_create_ray_required_arguments.add_argument(
|
|
223
262
|
'--ray-version',
|
|
224
263
|
type=str,
|
|
225
264
|
default=None,
|
|
226
265
|
help="The Ray version to use, e.g. '2.38.0'",
|
|
227
266
|
required=True,
|
|
228
267
|
)
|
|
229
|
-
|
|
268
|
+
|
|
269
|
+
### Optional arguments specific to "cluster create-ray"
|
|
270
|
+
cluster_create_ray_optional_arguments = (
|
|
271
|
+
cluster_create_ray_parser.add_argument_group(
|
|
272
|
+
'Optional Arguments', 'Arguments optional for cluster create-ray.'
|
|
273
|
+
)
|
|
274
|
+
)
|
|
275
|
+
add_shared_cluster_create_optional_arguments(
|
|
276
|
+
cluster_create_ray_optional_arguments
|
|
277
|
+
)
|
|
278
|
+
cluster_create_ray_optional_arguments.add_argument(
|
|
230
279
|
'--enable-pathways',
|
|
231
280
|
action='store_true',
|
|
232
281
|
help=(
|
|
@@ -235,38 +284,38 @@ def set_cluster_parser(cluster_parser):
|
|
|
235
284
|
),
|
|
236
285
|
)
|
|
237
286
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
add_shared_cluster_create_capacity_arguments([
|
|
249
|
-
cluster_create_capacity_arguments,
|
|
250
|
-
cluster_create_pathways_capacity_arguments,
|
|
251
|
-
cluster_create_ray_cluster_capacity_arguments,
|
|
252
|
-
])
|
|
253
|
-
add_shared_cluster_create_tensorboard_arguments([
|
|
254
|
-
cluster_create_tensorboard_arguments,
|
|
255
|
-
cluster_create_pathways_tensorboard_arguments,
|
|
256
|
-
cluster_create_ray_cluster_tensorboard_arguments,
|
|
257
|
-
])
|
|
287
|
+
### Capacity arguments specific to "cluster create-ray"
|
|
288
|
+
cluster_create_ray_capacity_arguments = (
|
|
289
|
+
cluster_create_ray_parser.add_argument_group(
|
|
290
|
+
'Capacity Arguments',
|
|
291
|
+
'Arguments related to capacity for cluster create-ray.',
|
|
292
|
+
)
|
|
293
|
+
)
|
|
294
|
+
add_shared_cluster_create_capacity_arguments(
|
|
295
|
+
cluster_create_ray_capacity_arguments
|
|
296
|
+
)
|
|
258
297
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
298
|
+
### Tensorboard arguments specific to "cluster create-ray"
|
|
299
|
+
cluster_create_ray_tensorboard_arguments = (
|
|
300
|
+
cluster_create_ray_parser.add_argument_group(
|
|
301
|
+
'Optional Vertex AI Tensorboard Arguments',
|
|
302
|
+
'Arguments for creating Vertex AI Tensorboard in cluster create-ray.',
|
|
303
|
+
)
|
|
304
|
+
)
|
|
305
|
+
add_shared_cluster_create_tensorboard_arguments(
|
|
306
|
+
cluster_create_ray_tensorboard_arguments
|
|
263
307
|
)
|
|
264
308
|
|
|
265
|
-
### "cluster
|
|
266
|
-
|
|
267
|
-
'
|
|
268
|
-
|
|
309
|
+
### MTC arguments specific to "cluster create"
|
|
310
|
+
cluster_create_mtc_arguments = cluster_create_ray_parser.add_argument_group(
|
|
311
|
+
'Optional MTC Arguments',
|
|
312
|
+
'Arguments for configuring MTC in cluster create.',
|
|
269
313
|
)
|
|
314
|
+
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
315
|
+
cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
|
|
270
319
|
cluster_delete_required_arguments = cluster_delete_parser.add_argument_group(
|
|
271
320
|
'Required Arguments',
|
|
272
321
|
'Arguments required for cluster delete.',
|
|
@@ -293,31 +342,25 @@ def set_cluster_parser(cluster_parser):
|
|
|
293
342
|
required=False,
|
|
294
343
|
)
|
|
295
344
|
add_shared_arguments(cluster_delete_optional_arguments)
|
|
296
|
-
|
|
297
|
-
cluster_delete_parser.add_argument(
|
|
345
|
+
cluster_delete_optional_arguments.add_argument(
|
|
298
346
|
'--force',
|
|
299
347
|
action='store_true',
|
|
300
348
|
help=(
|
|
301
|
-
'Forces
|
|
349
|
+
'Forces cluster deletion command to run without additional approval.'
|
|
302
350
|
),
|
|
303
351
|
)
|
|
304
352
|
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
)
|
|
353
|
+
cluster_delete_parser.set_defaults(func=cluster_delete)
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
|
|
310
357
|
cluster_cacheimage_required_arguments = (
|
|
311
358
|
cluster_cacheimage_parser.add_argument_group(
|
|
312
359
|
'Required Arguments',
|
|
313
360
|
'Arguments required for cluster cacheimage.',
|
|
314
361
|
)
|
|
315
362
|
)
|
|
316
|
-
|
|
317
|
-
cluster_cacheimage_parser.add_argument_group(
|
|
318
|
-
'Optional Arguments', 'Arguments optional for cluster cacheimage.'
|
|
319
|
-
)
|
|
320
|
-
)
|
|
363
|
+
|
|
321
364
|
cluster_cacheimage_group = (
|
|
322
365
|
cluster_cacheimage_parser.add_mutually_exclusive_group(required=True)
|
|
323
366
|
)
|
|
@@ -356,6 +399,11 @@ def set_cluster_parser(cluster_parser):
|
|
|
356
399
|
)
|
|
357
400
|
|
|
358
401
|
### Optional Arguments
|
|
402
|
+
cluster_cacheimage_optional_arguments = (
|
|
403
|
+
cluster_cacheimage_parser.add_argument_group(
|
|
404
|
+
'Optional Arguments', 'Arguments optional for cluster cacheimage.'
|
|
405
|
+
)
|
|
406
|
+
)
|
|
359
407
|
add_shared_arguments(cluster_cacheimage_optional_arguments)
|
|
360
408
|
cluster_cacheimage_optional_arguments.add_argument(
|
|
361
409
|
'--cache-key',
|
|
@@ -364,26 +412,18 @@ def set_cluster_parser(cluster_parser):
|
|
|
364
412
|
help='The key to cache the docker image under.',
|
|
365
413
|
required=False,
|
|
366
414
|
)
|
|
415
|
+
|
|
367
416
|
cluster_cacheimage_parser.set_defaults(func=cluster_cacheimage)
|
|
368
417
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
help='Describe a cluster.',
|
|
373
|
-
)
|
|
418
|
+
|
|
419
|
+
def set_cluster_describe_parser(cluster_describe_parser: ArgumentParser):
|
|
420
|
+
### Required arguments
|
|
374
421
|
cluster_describe_required_arguments = (
|
|
375
422
|
cluster_describe_parser.add_argument_group(
|
|
376
423
|
'Required Arguments',
|
|
377
424
|
'Arguments required for cluster describe.',
|
|
378
425
|
)
|
|
379
426
|
)
|
|
380
|
-
cluster_describe_optional_arguments = (
|
|
381
|
-
cluster_describe_parser.add_argument_group(
|
|
382
|
-
'Optional Arguments', 'Arguments optional for cluster describe.'
|
|
383
|
-
)
|
|
384
|
-
)
|
|
385
|
-
|
|
386
|
-
### Required arguments
|
|
387
427
|
cluster_describe_required_arguments.add_argument(
|
|
388
428
|
'--cluster',
|
|
389
429
|
type=name_type,
|
|
@@ -391,272 +431,425 @@ def set_cluster_parser(cluster_parser):
|
|
|
391
431
|
help='The name of the cluster to be describe.',
|
|
392
432
|
required=True,
|
|
393
433
|
)
|
|
434
|
+
|
|
394
435
|
### Optional Arguments
|
|
436
|
+
cluster_describe_optional_arguments = (
|
|
437
|
+
cluster_describe_parser.add_argument_group(
|
|
438
|
+
'Optional Arguments', 'Arguments optional for cluster describe.'
|
|
439
|
+
)
|
|
440
|
+
)
|
|
395
441
|
add_shared_arguments(cluster_describe_optional_arguments)
|
|
396
442
|
|
|
397
443
|
cluster_describe_parser.set_defaults(func=cluster_describe)
|
|
398
444
|
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
)
|
|
445
|
+
|
|
446
|
+
def set_cluster_list_parser(cluster_list_parser: ArgumentParser):
|
|
447
|
+
### Optional Arguments
|
|
403
448
|
cluster_list_optional_arguments = cluster_list_parser.add_argument_group(
|
|
404
449
|
'Optional Arguments', 'Arguments optional for cluster list.'
|
|
405
450
|
)
|
|
406
|
-
### Optional Arguments
|
|
407
451
|
add_shared_arguments(cluster_list_optional_arguments)
|
|
408
452
|
|
|
409
453
|
cluster_list_parser.set_defaults(func=cluster_list)
|
|
410
454
|
|
|
411
455
|
|
|
412
|
-
def
|
|
456
|
+
def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
|
|
457
|
+
cluster_adapt_required_arguments = cluster_adapt_parser.add_argument_group(
|
|
458
|
+
'Required Arguments',
|
|
459
|
+
'Arguments required for cluster adapt.',
|
|
460
|
+
)
|
|
461
|
+
add_shared_cluster_create_required_arguments(cluster_adapt_required_arguments)
|
|
462
|
+
|
|
463
|
+
cluster_adapt_device_group = (
|
|
464
|
+
cluster_adapt_required_arguments.add_mutually_exclusive_group(
|
|
465
|
+
required=True
|
|
466
|
+
)
|
|
467
|
+
)
|
|
468
|
+
cluster_adapt_device_group.add_argument(
|
|
469
|
+
'--tpu-type',
|
|
470
|
+
type=str,
|
|
471
|
+
default=None,
|
|
472
|
+
help='The tpu type used on cluster, v5litepod-16, etc.',
|
|
473
|
+
)
|
|
474
|
+
cluster_adapt_device_group.add_argument(
|
|
475
|
+
'--device-type',
|
|
476
|
+
type=str,
|
|
477
|
+
default=None,
|
|
478
|
+
help=(
|
|
479
|
+
'The device type used on cluster (can be tpu or gpu or cpu), eg.'
|
|
480
|
+
' h100-80gb-8, n2-standard-32-4 etc.'
|
|
481
|
+
),
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
cluster_adapt_optional_arguments = cluster_adapt_parser.add_argument_group(
|
|
485
|
+
'Optional Arguments',
|
|
486
|
+
'Arguments optional for cluster adapt.',
|
|
487
|
+
)
|
|
488
|
+
cluster_adapt_optional_arguments.add_argument(
|
|
489
|
+
'--num-nodes',
|
|
490
|
+
type=int,
|
|
491
|
+
help='The number of nodes of a cluster.',
|
|
492
|
+
)
|
|
493
|
+
cluster_adapt_optional_arguments.add_argument(
|
|
494
|
+
'--enable-workload-identity',
|
|
495
|
+
action='store_true',
|
|
496
|
+
help='Enable Workload Identity Federation on the cluster and node-pools.',
|
|
497
|
+
)
|
|
498
|
+
cluster_adapt_optional_arguments.add_argument(
|
|
499
|
+
'--num-slices',
|
|
500
|
+
type=int,
|
|
501
|
+
default=1,
|
|
502
|
+
help='The number of slices to run the job on, defaults to 1.',
|
|
503
|
+
required=False,
|
|
504
|
+
)
|
|
505
|
+
add_driver_arguments(cluster_adapt_optional_arguments)
|
|
506
|
+
add_shared_arguments(cluster_adapt_optional_arguments)
|
|
507
|
+
|
|
508
|
+
cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
|
|
509
|
+
'Capacity Arguments', 'Arguments related to capacity for cluster create.'
|
|
510
|
+
)
|
|
511
|
+
add_shared_cluster_create_capacity_arguments(cluster_adapt_capacity_arguments)
|
|
512
|
+
|
|
513
|
+
cluster_adapt_autoprovisioning_arguments = (
|
|
514
|
+
cluster_adapt_parser.add_argument_group(
|
|
515
|
+
'Autoprovisioning Arguments',
|
|
516
|
+
'Optional arguments for enabling autoprovisioning.',
|
|
517
|
+
)
|
|
518
|
+
)
|
|
519
|
+
add_autoprovisioning_arguments(cluster_adapt_autoprovisioning_arguments)
|
|
520
|
+
|
|
521
|
+
cluster_adapt_tensorboard_arguments = cluster_adapt_parser.add_argument_group(
|
|
522
|
+
'Optional Vertex AI Tensorboard Arguments',
|
|
523
|
+
'Arguments for creating Vertex AI Tensorboard in cluster adapt.',
|
|
524
|
+
)
|
|
525
|
+
add_shared_cluster_create_tensorboard_arguments(
|
|
526
|
+
cluster_adapt_tensorboard_arguments
|
|
527
|
+
)
|
|
528
|
+
|
|
529
|
+
cluster_adapt_parser.set_defaults(func=cluster_adapt)
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def add_autoprovisioning_arguments(parser: ArgumentParser):
|
|
533
|
+
parser.add_argument(
|
|
534
|
+
'--enable-autoprovisioning',
|
|
535
|
+
action='store_true',
|
|
536
|
+
help=(
|
|
537
|
+
'Enable GKE features for autoprovisioning node pools in GKE clusters.'
|
|
538
|
+
),
|
|
539
|
+
)
|
|
540
|
+
parser.add_argument(
|
|
541
|
+
'--autoprovisioning-min-chips',
|
|
542
|
+
type=int,
|
|
543
|
+
help=(
|
|
544
|
+
'Optionally set the minimum autoprovisioning accelerator resources in'
|
|
545
|
+
' units of chips.By default, autoprovisioning will use the number of'
|
|
546
|
+
' resources in the cluster as the minimum, and maximum.'
|
|
547
|
+
),
|
|
548
|
+
)
|
|
549
|
+
parser.add_argument(
|
|
550
|
+
'--autoprovisioning-max-chips',
|
|
551
|
+
type=int,
|
|
552
|
+
help=(
|
|
553
|
+
'Optionally set the maximum autoprovisioning accelerator resources in'
|
|
554
|
+
' units of chips.By default, autoprovisioning will use the number of'
|
|
555
|
+
' resources in the cluster as the minimum, and maximum.'
|
|
556
|
+
),
|
|
557
|
+
)
|
|
558
|
+
|
|
559
|
+
|
|
560
|
+
def add_shared_cluster_create_required_arguments(parser: ArgumentParser):
|
|
413
561
|
"""Add shared required arguments in cluster create and Pathways cluster create.
|
|
414
562
|
|
|
415
563
|
Args:
|
|
416
|
-
|
|
564
|
+
parser: cluster create argument parser or argument group
|
|
417
565
|
"""
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
def add_shared_cluster_create_optional_arguments(args_parsers):
|
|
566
|
+
parser.add_argument(
|
|
567
|
+
'--cluster',
|
|
568
|
+
type=name_type,
|
|
569
|
+
default=None,
|
|
570
|
+
help=(
|
|
571
|
+
'The name of the cluster. Will be used as the prefix for internal'
|
|
572
|
+
' objects in the cluster.'
|
|
573
|
+
),
|
|
574
|
+
required=True,
|
|
575
|
+
)
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
|
|
432
579
|
"""Add shared optional arguments in cluster create and Pathways cluster create.
|
|
433
580
|
|
|
434
581
|
Args:
|
|
435
|
-
|
|
582
|
+
parser: cluster create argument parser or argument group
|
|
436
583
|
"""
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
584
|
+
add_shared_arguments(parser)
|
|
585
|
+
parser.add_argument(
|
|
586
|
+
'--host-maintenance-interval',
|
|
587
|
+
type=str,
|
|
588
|
+
choices=['AS_NEEDED', 'PERIODIC'],
|
|
589
|
+
default='AS_NEEDED',
|
|
590
|
+
help='The maintenance policy of the cluster and respective clusters.',
|
|
591
|
+
)
|
|
592
|
+
parser.add_argument(
|
|
593
|
+
'--gke-version',
|
|
594
|
+
type=str,
|
|
595
|
+
help=(
|
|
596
|
+
'The GKE version of the cluster and respective clusters. The'
|
|
597
|
+
' default is determined dynamically based on RAPID channel'
|
|
598
|
+
' recommended version.'
|
|
599
|
+
),
|
|
600
|
+
)
|
|
601
|
+
parser.add_argument(
|
|
602
|
+
'--num-slices',
|
|
603
|
+
type=int,
|
|
604
|
+
default=1,
|
|
605
|
+
help='The number of slices to run the job on, defaults to 1.',
|
|
606
|
+
required=False,
|
|
607
|
+
)
|
|
608
|
+
parser.add_argument(
|
|
609
|
+
'--pathways-gce-machine-type',
|
|
610
|
+
type=str,
|
|
611
|
+
default='n2-standard-64',
|
|
612
|
+
help='The CPU type for Pathways CPU nodepools',
|
|
613
|
+
)
|
|
614
|
+
parser.add_argument(
|
|
615
|
+
'--default-pool-cpu-machine-type',
|
|
616
|
+
type=str,
|
|
617
|
+
default='e2-standard-16',
|
|
618
|
+
help=(
|
|
619
|
+
'Set the machine type within the default cpu node pool. For'
|
|
620
|
+
' regional clusters, all zones must support the machine type.'
|
|
621
|
+
),
|
|
622
|
+
)
|
|
623
|
+
parser.add_argument(
|
|
624
|
+
'--cluster-cpu-machine-type',
|
|
625
|
+
type=str,
|
|
626
|
+
default='',
|
|
627
|
+
help=(
|
|
628
|
+
'Getting deprecated soon! Please use'
|
|
629
|
+
' --default-pool-cpu-machine-typeinstead, to denote the machine'
|
|
630
|
+
' type of the default cpu node pool. Set the machine type of other'
|
|
631
|
+
' cpu nodepools using --device-type.'
|
|
632
|
+
),
|
|
633
|
+
)
|
|
634
|
+
parser.add_argument(
|
|
635
|
+
'--default-pool-cpu-num-nodes',
|
|
636
|
+
type=int,
|
|
637
|
+
default=6,
|
|
638
|
+
help=(
|
|
639
|
+
'Set the number of nodes within the default cpu node pool. This is'
|
|
640
|
+
' set to 6 by default. Autoscaling is enabled to scale this value'
|
|
641
|
+
' over time.'
|
|
642
|
+
),
|
|
643
|
+
)
|
|
644
|
+
parser.add_argument(
|
|
645
|
+
'--custom-cluster-arguments',
|
|
646
|
+
type=str,
|
|
647
|
+
default='',
|
|
648
|
+
help=(
|
|
649
|
+
'Users can add their own arguments to customize their cluster'
|
|
650
|
+
' create command. Do note, these will not override already used'
|
|
651
|
+
' cluster creation arguments. e.g.'
|
|
652
|
+
" --custom-cluster-arguments='--network=mtu9k --subnetwork=mtu9k'"
|
|
653
|
+
),
|
|
654
|
+
)
|
|
655
|
+
parser.add_argument(
|
|
656
|
+
'--custom-nodepool-arguments',
|
|
657
|
+
type=str,
|
|
658
|
+
default='',
|
|
659
|
+
help=(
|
|
660
|
+
'Users can add their own arguments to customize their node pool '
|
|
661
|
+
' create command. Do note, these will not override already used'
|
|
662
|
+
' node pool creation arguments. e.g.'
|
|
663
|
+
' --custom-nodepool-arguments="--disk-size=300"'
|
|
664
|
+
),
|
|
665
|
+
)
|
|
666
|
+
parser.add_argument(
|
|
667
|
+
'--force',
|
|
668
|
+
action='store_true',
|
|
669
|
+
help=(
|
|
670
|
+
'Forces node pool creation and delete commands to run without'
|
|
671
|
+
' additional approval.'
|
|
672
|
+
),
|
|
673
|
+
)
|
|
674
|
+
parser.add_argument(
|
|
675
|
+
'--custom-tpu-nodepool-arguments',
|
|
676
|
+
type=str,
|
|
677
|
+
default='',
|
|
678
|
+
help=(
|
|
679
|
+
'DEPRECATING SOON! Please use --custom-nodepool-arguments to'
|
|
680
|
+
' customize node pool create command. Do note, these will not'
|
|
681
|
+
' override already used node pool creation arguments. Example usage'
|
|
682
|
+
' --custom-tpu-nodepool-arguments="--enable-ip-alias"'
|
|
683
|
+
),
|
|
684
|
+
)
|
|
685
|
+
parser.add_argument(
|
|
686
|
+
'--private',
|
|
687
|
+
action='store_true',
|
|
688
|
+
help=(
|
|
689
|
+
'Creates a private GKE cluster, a VPC-native cluster in which Nodes'
|
|
690
|
+
' and Pods are isolated from the internet. If set,'
|
|
691
|
+
' master_authorized_networks will also be enabled and access to the'
|
|
692
|
+
" cluster's control plane will be restricted only to current"
|
|
693
|
+
" machine's IP address unless more IP ranges are authorized by"
|
|
694
|
+
' providing --authorized-networks. This works only on creating new'
|
|
695
|
+
' clusters.'
|
|
696
|
+
),
|
|
697
|
+
)
|
|
698
|
+
parser.add_argument(
|
|
699
|
+
'--authorized-networks',
|
|
700
|
+
action='extend',
|
|
701
|
+
nargs='+',
|
|
702
|
+
help=(
|
|
703
|
+
'Sets the provided cidrs as authorized IP ranges to access the'
|
|
704
|
+
" private cluster's control plan. Access to the control plane will"
|
|
705
|
+
" be provided to current machine's IP address even if"
|
|
706
|
+
' --authorized-networks is not set or it does not cover the IP'
|
|
707
|
+
' address. If set, --private is considered true and a private'
|
|
708
|
+
' cluster will be provisioned. It replaces existing authorized'
|
|
709
|
+
' networks if used with an existing private cluster.'
|
|
710
|
+
' Example usage: --authorized-networks 1.2.3.0/24 1.2.4.5/32'
|
|
711
|
+
),
|
|
712
|
+
)
|
|
713
|
+
parser.add_argument(
|
|
714
|
+
'--enable-workload-identity',
|
|
715
|
+
action='store_true',
|
|
716
|
+
help='Enable Workload Identity Federation on the cluster and node-pools.',
|
|
717
|
+
)
|
|
718
|
+
add_driver_arguments(parser)
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
def add_driver_arguments(parser: ArgumentParser):
    """Add the storage CSI driver flags to a parser.

    Registers four boolean switches. Each uses `store_true`, so the parsed
    value is False unless the flag is passed:
      --enable-gcsfuse-csi-driver
      --enable-gcpfilestore-csi-driver
      --enable-parallelstore-csi-driver
      --enable-pd-csi-driver

    Args:
      parser: argument parser or argument group the flags are added to.
    """
    parser.add_argument(
        '--enable-gcsfuse-csi-driver',
        action='store_true',
        help=(
            # The product is spelled "GCSFuse" (matching the flag name);
            # the help text previously read "GSCFuse".
            'Enable GCSFuse driver on the cluster. This enables Workload'
            ' Identity Federation. When using A3 ultra/A3 mega Workload'
            ' Identity is enabled by default.'
        ),
    )
    parser.add_argument(
        '--enable-gcpfilestore-csi-driver',
        action='store_true',
        help='Enable GCPFilestore driver on the cluster.',
    )
    parser.add_argument(
        '--enable-parallelstore-csi-driver',
        action='store_true',
        help='Enable Parallelstore CSI driver on the cluster.',
    )
    parser.add_argument(
        '--enable-pd-csi-driver',
        action='store_true',
        help='Enable PersistentDisk CSI driver on the cluster.',
    )
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
def add_shared_cluster_create_tensorboard_arguments(parser: ArgumentParser):
    """Register the Vertex AI Tensorboard flags used by both create commands.

    The same flags are shared by cluster create and Pathways cluster create.
    Note that this feature enables non-Pathways workloads to use tensorboard
    arguments on a Pathways cluster.

    Args:
      parser: cluster create argument parser or argument group
    """
    # Long help texts are assembled up front so the add_argument calls
    # below stay short.
    region_help = (
        'The region to create Vertex Tensorboard instance in. Visit'
        ' https://cloud.google.com/vertex-ai/docs/general/locations#available-regions'
        ' to view regions supported by Vertex AI. By default, Tensorboard'
        ' instance will be created in us-central1.'
    )
    name_help = (
        'The name of Vertex Tensorboard instance to create. If not'
        ' specified, a Tensorboard instance with the name'
        f' <cluster>-{DEFAULT_VERTEX_TENSORBOARD_NAME} will be'
        ' created.'
    )

    parser.add_argument(
        '--create-vertex-tensorboard',
        action='store_true',
        help='Set this flag to create a Tensorboard instance in Vertex AI.',
    )
    parser.add_argument(
        '--tensorboard-region',
        type=str,
        default='us-central1',
        help=region_help,
    )
    parser.add_argument(
        '--tensorboard-name',
        type=str,
        required=False,
        help=name_help,
    )
|
|
783
|
+
|
|
784
|
+
|
|
785
|
+
def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
    """Register the capacity-type flags shared by both create commands.

    Adds --on-demand and --spot (boolean switches) and --reservation (a
    string) for cluster create and Pathways cluster create.

    Args:
      parser: cluster create argument parser or argument group
    """
    # One (flag, add_argument keyword options) entry per capacity type,
    # registered in declaration order.
    capacity_flags = (
        (
            '--on-demand',
            {
                'action': 'store_true',
                'help': (
                    'Sets node pool creation to use on-demand resources. '
                    ' See `--reservation` or `--spot` for other capacity types.'
                ),
            },
        ),
        (
            '--reservation',
            {
                'type': str,
                'help': (
                    'The reservation to be used for acquiring resources in the'
                    ' cluster. This will attempt to find the provided reservation.'
                    ' See `--spot` or `--on-demand` for other capacity types.'
                ),
            },
        ),
        (
            '--spot',
            {
                'action': 'store_true',
                'help': (
                    'Sets node pool creation to use spot resources.'
                    ' See `--reservation` or `--on-demand` for other capacity types.'
                ),
            },
        ),
    )
    for flag, options in capacity_flags:
        parser.add_argument(flag, **options)
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
def add_shared_cluster_create_mtc_arguments(parser: ArgumentParser):
    """Add shared Multi-tier Checkpointing (MTC) arguments.

    Used by both cluster create and Pathways cluster create. Registers
    --enable-mtc (boolean switch) plus three string options that all default
    to None: --mtc-ramdisk-size, --mtc-gcs-bucket and --mtc-toleration-key.

    The help text marks the RAM disk size and GCS bucket as required when
    --enable-mtc is set, but this function does not enforce that.

    Args:
      parser: cluster create argument parser or argument group
    """
    parser.add_argument(
        '--enable-mtc',
        action='store_true',
        help='Enable MTC on the cluster.',
    )
    parser.add_argument(
        '--mtc-ramdisk-size',
        type=str,
        default=None,
        help=(
            '(Required if --enable-mtc is true) The size of the RAM disk to be'
            ' used for multi-tier checkpointing. e.g. "64Mi" '
        ),
    )
    parser.add_argument(
        '--mtc-gcs-bucket',
        type=str,
        default=None,
        help=(
            '(Required if --enable-mtc is true) The GCS bucket to be used for'
            ' multi-tier checkpointing.'
        ),
    )
    parser.add_argument(
        '--mtc-toleration-key',
        type=str,
        default=None,
        help=(
            # "toleration" matches the flag name; the help text previously
            # said "tolerance key".
            '(Optional) The toleration key to be used for multi-tier'
            ' checkpointing. By default, it is set to "google.com/tpu".'
        ),
    )
|