xpk-0.8.0-py3-none-any.whl → xpk-0.10.0-py3-none-any.whl
- xpk/commands/batch.py +5 -6
- xpk/commands/cluster.py +246 -73
- xpk/commands/cluster_gcluster.py +27 -0
- xpk/commands/common.py +40 -1
- xpk/commands/kjob_common.py +13 -1
- xpk/commands/run.py +4 -5
- xpk/commands/shell.py +2 -2
- xpk/commands/storage.py +24 -6
- xpk/commands/workload.py +66 -27
- xpk/core/blueprint/blueprint_generator.py +115 -47
- xpk/core/capacity.py +66 -6
- xpk/core/cluster.py +282 -13
- xpk/core/config.py +1 -65
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +145 -72
- xpk/core/filestore.py +2 -6
- xpk/core/gcsfuse.py +22 -4
- xpk/core/jobset.py +143 -0
- xpk/core/kjob.py +21 -18
- xpk/core/kueue.py +194 -4
- xpk/core/mtc.py +195 -0
- xpk/core/network.py +23 -1
- xpk/core/nodepool.py +17 -4
- xpk/core/pathways.py +2 -3
- xpk/core/resources.py +21 -0
- xpk/core/storage.py +1 -95
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +1 -45
- xpk/core/workload_decorators/rdma_decorator.py +8 -10
- xpk/core/workload_decorators/tcpx_decorator.py +185 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
- xpk/parser/cluster.py +589 -389
- xpk/parser/storage.py +12 -3
- xpk/parser/workload.py +21 -3
- xpk/utils/kubectl.py +4 -1
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/METADATA +178 -96
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/RECORD +41 -38
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +1 -1
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
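
The standout changes: xpk/core/mtc.py (+195), xpk/core/jobset.py (+143), and xpk/core/workload_decorators/tcpx_decorator.py (+185) are entirely new in 0.10.0, while xpk/parser/cluster.py and xpk/core/cluster.py see the heaviest rewrites. The per-file diffs for the commands package follow.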
xpk/commands/batch.py
CHANGED
```diff
@@ -18,7 +18,7 @@ import re
 from argparse import Namespace

 from ..core.cluster import (
-
+    setup_k8s_service_accounts,
     get_cluster_credentials,
 )
 from ..core.commands import run_command_for_value
@@ -26,14 +26,13 @@ from ..core.gcloud_context import add_zone_and_project
 from ..core.kjob import (
     AppProfileDefaults,
     JobTemplateDefaults,
-    Kueue_TAS_annotation,
     get_storage_annotations,
     prepare_kjob,
 )
 from ..core.kueue import LOCAL_QUEUE_NAME
 from ..utils.console import xpk_exit, xpk_print
 from .kind import set_local_cluster_command
-from .kjob_common import add_gpu_networking_annotations_to_command
+from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command


 def batch(args: Namespace) -> None:
@@ -55,24 +54,24 @@ def batch(args: Namespace) -> None:
   err_code = prepare_kjob(args)
   if err_code > 0:
     xpk_exit(err_code)
-
+  setup_k8s_service_accounts()

   submit_job(args)


 def submit_job(args: Namespace) -> None:

-
+  setup_k8s_service_accounts()

   cmd = (
       'kubectl kjob create slurm'
       f' --profile {AppProfileDefaults.NAME.value}'
       f' --localqueue {LOCAL_QUEUE_NAME}'
-      f' --pod-template-annotation {Kueue_TAS_annotation}'
       f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
       ' --first-node-ip'
   )
   cmd = add_gpu_networking_annotations_to_command(args, cmd)
+  cmd = add_TAS_annotations_to_command(args, cmd)

   for annotation in get_storage_annotations(args):
     cmd += f' --pod-template-annotation {annotation}'
```
xpk/commands/cluster.py
CHANGED
```diff
@@ -16,19 +16,23 @@ limitations under the License.

 from tabulate import tabulate

-from ..core.capacity import H100_DEVICE_TYPE
+from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
 from ..core.cluster import (
     get_all_clusters_programmatic,
     get_cluster_credentials,
     install_nccl_on_cluster,
+    install_nri_on_cluster,
     set_jobset_on_cluster,
     set_pathways_job_on_cluster,
     setup_k8s_env,
-
-
+    disable_mglru_on_cluster,
+    count_nodes_on_cluster,
     update_cluster_with_gcpfilestore_driver_if_necessary,
+    update_cluster_with_gcsfuse_driver_if_necessary,
     update_cluster_with_parallelstore_driver_if_necessary,
     update_cluster_with_pd_driver_if_necessary,
+    update_cluster_with_lustre_driver_if_necessary,
+    update_cluster_with_workload_identity_if_necessary,
 )
 from ..core.cluster_private import authorize_private_cluster_access_if_necessary
 from ..core.commands import run_command_for_value, run_command_with_updates
@@ -39,12 +43,14 @@ from ..core.gcloud_context import (
     get_gke_server_config,
     zone_to_region,
 )
+from ..core.jobset import update_jobset_resources_if_necessary
 from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
 from ..core.kueue import (
     cluster_preheat_yml,
     install_kueue_crs,
     install_kueue_on_cluster,
     wait_for_kueue_available,
+    update_kueue_resources_if_necessary,
 )
 from ..core.nap import enable_autoprovisioning_on_cluster
 from ..core.network import (
@@ -52,8 +58,12 @@ from ..core.network import (
     delete_cluster_subnets,
     set_up_cluster_network_for_a3,
 )
-from ..core.nodepool import
+from ..core.nodepool import (
+    get_gke_node_pool_version,
+    run_gke_node_pool_create_command,
+)
 from ..core.ray import install_ray_cluster
+from ..core.mtc import install_mtc_on_cluster
 from ..core.resources import create_cluster_configmaps
 from ..core.storage import install_storage_crd
 from ..core.system_characteristics import (
@@ -70,14 +80,122 @@ from . import cluster_gcluster
 from .common import set_cluster_command


+def cluster_adapt(args) -> None:
+  """Function that performs cluster adaptation.
+
+  Args:
+    args: user provided arguments for running the command.
+  """
+  args.enable_pathways = False
+
+  system, return_code = get_system_characteristics(args)
+
+  if return_code > 0:
+    xpk_print('Fetching system characteristics failed!')
+    xpk_exit(return_code)
+
+  xpk_print(
+      f'Starting cluster adaptation for cluster {args.cluster}:', flush=True
+  )
+  add_zone_and_project(args)
+
+  if system.accelerator_type == AcceleratorType['GPU'] and not getattr(
+      args, 'num_nodes'
+  ):
+    xpk_print(
+        'Argument --num-nodes was not provided, trying to determine number of'
+        ' nodes based on the available nodes in the cluster...'
+    )
+    args.num_nodes = count_nodes_on_cluster(args, system)
+    if args.num_nodes == 0:
+      xpk_print(
+          'Found unexpected number of nodes. Is the --device-type correct?'
+      )
+      xpk_exit(1)
+    else:
+      xpk_print(f'Using {args.num_nodes} nodes.')
+
+  # ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
+  # Enable WorkloadIdentity if not enabled already.
+  if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
+    update_cluster_command_code = (
+        update_cluster_with_workload_identity_if_necessary(args)
+    )
+    if update_cluster_command_code != 0:
+      xpk_exit(update_cluster_command_code)
+
+  get_cluster_credentials(args)
+
+  k8s_client = setup_k8s_env(args)
+
+  install_storage_crd(k8s_client)
+  install_storage_csis(args)
+
+  # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
+  tensorboard_config = {}
+  if VERTEX_TENSORBOARD_FEATURE_FLAG and args.create_vertex_tensorboard:
+    tensorboard_config = create_vertex_tensorboard(args)
+    # exit if failed to create Tensorboard in Vertex AI
+    if not tensorboard_config:
+      xpk_exit(1)
+
+  # Provision node pools dynamically based on incoming workloads:
+  # Currently autoprovisioning is not supported with Pathways.
+  autoprovisioning_config = None
+  if args.enable_autoprovisioning:
+    xpk_print('Enabling Autoprovisioning')
+    autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
+        args, system
+    )
+    if return_code != 0:
+      xpk_exit(return_code)
+
+  xpk_print('Creating ConfigMap for cluster')
+  create_cluster_configmaps_code = create_cluster_configmaps(
+      args, system, tensorboard_config, autoprovisioning_config
+  )
+  if create_cluster_configmaps_code != 0:
+    xpk_exit(create_cluster_configmaps_code)
+
+  xpk_print(
+      'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
+      ' globally available'
+  )
+  set_jobset_on_cluster_code = set_jobset_on_cluster(args)
+  if set_jobset_on_cluster_code != 0:
+    xpk_exit(set_jobset_on_cluster_code)
+
+  # TODO: Uncomment when cluster_adapt will support TPU cluters
+  # set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
+  # if set_pathways_job_on_cluster_code != 0:
+  #   xpk_exit(set_pathways_job_on_cluster_code)
+
+  install_kueue(args, system, autoprovisioning_config)
+
+  install_kjob(args)
+  if system.accelerator_type == AcceleratorType['GPU']:
+    prepare_gpus(args, system)
+
+  if args.enable_ray_cluster:
+    return_code = install_ray_cluster(args, system)
+    if return_code != 0:
+      xpk_print('Installation of RayCluster failed.')
+      xpk_exit(return_code)
+
+  xpk_print('GKE commands done! Resources are created.')
+  xpk_print(
+      'See your GKE Cluster here:'
+      # pylint: disable=line-too-long
+      f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+  )
+  xpk_exit(0)
+
+
 def cluster_create(args) -> None:
   """Function around cluster creation.

   Args:
     args: user provided arguments for running the command.
-
-  Returns:
-    0 if successful and 1 otherwise.
   """
   system, return_code = get_system_characteristics(args)

@@ -127,38 +245,12 @@ def cluster_create(args) -> None:
   if update_cluster_command_code != 0:
     xpk_exit(update_cluster_command_code)

-
-  if args.enable_gcsfuse_csi_driver:
-    update_cluster_command_code = (
-        update_cluster_with_gcsfuse_driver_if_necessary(args)
-    )
-    if update_cluster_command_code != 0:
-      xpk_exit(update_cluster_command_code)
-
-  if args.enable_gcpfilestore_csi_driver:
-    update_cluster_command_code = (
-        update_cluster_with_gcpfilestore_driver_if_necessary(args)
-    )
-    if update_cluster_command_code != 0:
-      xpk_exit(update_cluster_command_code)
-
-  if args.enable_parallelstore_csi_driver:
-    update_cluster_command_code = (
-        update_cluster_with_parallelstore_driver_if_necessary(args)
-    )
-    if update_cluster_command_code != 0:
-      xpk_exit(update_cluster_command_code)
-
-  if args.enable_pd_csi_driver:
-    update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
-        args
-    )
-    if update_cluster_command_code != 0:
-      xpk_exit(update_cluster_command_code)
+  get_cluster_credentials(args)

-
+  k8s_client = setup_k8s_env(args)

-
+  install_storage_crd(k8s_client)
+  install_storage_csis(args)

   # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
   tensorboard_config = {}
@@ -218,50 +310,20 @@ def cluster_create(args) -> None:
   set_jobset_on_cluster_code = set_jobset_on_cluster(args)
   if set_jobset_on_cluster_code != 0:
     xpk_exit(set_jobset_on_cluster_code)
+  update_jobset_resources_code = update_jobset_resources_if_necessary(args)
+  if update_jobset_resources_code != 0:
+    xpk_exit(update_jobset_resources_code)

   set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
   if set_pathways_job_on_cluster_code != 0:
     xpk_exit(set_pathways_job_on_cluster_code)

-
-  install_kueue_on_cluster_code = install_kueue_on_cluster(args)
-  if install_kueue_on_cluster_code != 0:
-    xpk_exit(install_kueue_on_cluster_code)
-
-  xpk_print('Verifying kjob installation')
-  err_code = verify_kjob_installed(args)
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  xpk_print('Applying kjob CDRs')
-  err_code = apply_kjob_crds(args)
-  if err_code > 0:
-    xpk_exit(err_code)
+  install_kueue(args, system, autoprovisioning_config)

-
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  k8s_client = setup_k8s_env(args)
-  install_storage_crd(k8s_client)
-
-  xpk_print('Wait for Kueue to be fully available')
-  wait_for_kueue_available_code = wait_for_kueue_available(args)
-  if wait_for_kueue_available_code != 0:
-    xpk_exit(wait_for_kueue_available_code)
-
-  xpk_print('Install Kueue Custom Resources')
-  enable_kueue_credentials_code = install_kueue_crs(
-      args, system, autoprovisioning_config
-  )
-  if enable_kueue_credentials_code != 0:
-    xpk_exit(enable_kueue_credentials_code)
+  install_kjob(args)

   if system.accelerator_type == AcceleratorType['GPU']:
-
-    install_nccl_code = install_nccl_on_cluster(args, system)
-    if install_nccl_code != 0:
-      xpk_exit(install_nccl_code)
+    prepare_gpus(args, system)

   if args.enable_ray_cluster:
     return_code = install_ray_cluster(args, system)
@@ -269,6 +331,12 @@ def cluster_create(args) -> None:
       xpk_print('Installation of RayCluster failed.')
       xpk_exit(return_code)

+  if hasattr(args, 'enable_mtc') and args.enable_mtc:
+    return_code = install_mtc_on_cluster(args, system)
+    if return_code != 0:
+      xpk_print('Installation of MTC failed.')
+      xpk_exit(return_code)
+
   xpk_print('GKE commands done! Resources are created.')
   xpk_print(
       'See your GKE Cluster here:'
@@ -773,6 +841,7 @@ def run_gke_cluster_create_command(
       f' --num-nodes {args.default_pool_cpu_num_nodes}'
       f' {args.custom_cluster_arguments}'
       f' {rapid_release_cmd}'
+      ' --enable-dns-access'
   )

   enable_ip_alias = False
@@ -805,6 +874,7 @@ def run_gke_cluster_create_command(
   addons = []
   if args.enable_gcsfuse_csi_driver:
     addons.append('GcsFuseCsiDriver')
+
   if args.enable_gcpfilestore_csi_driver:
     addons.append('GcpFilestoreCsiDriver')

@@ -814,6 +884,13 @@ def run_gke_cluster_create_command(
   if args.enable_pd_csi_driver:
     addons.append('GcePersistentDiskCsiDriver')

+  if args.enable_lustre_csi_driver:
+    addons.append('LustreCsiDriver')
+    command += ' --enable-legacy-lustre-port'
+
+  if hasattr(args, 'enable_mtc') and args.enable_mtc:
+    addons.append('HighScaleCheckpointing')
+
   if len(addons) > 0:
     addons_str = ','.join(addons)
     command += f' --addons={addons_str}'
@@ -823,3 +900,99 @@
     xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
     return 1
   return 0
+
+
+def install_storage_csis(args):
+  if args.enable_gcsfuse_csi_driver:
+    update_cluster_command_code = (
+        update_cluster_with_gcsfuse_driver_if_necessary(args)
+    )
+    if update_cluster_command_code != 0:
+      xpk_exit(update_cluster_command_code)
+
+  if args.enable_gcpfilestore_csi_driver:
+    update_cluster_command_code = (
+        update_cluster_with_gcpfilestore_driver_if_necessary(args)
+    )
+    if update_cluster_command_code != 0:
+      xpk_exit(update_cluster_command_code)
+
+  if args.enable_parallelstore_csi_driver:
+    update_cluster_command_code = (
+        update_cluster_with_parallelstore_driver_if_necessary(args)
+    )
+    if update_cluster_command_code != 0:
+      xpk_exit(update_cluster_command_code)
+
+  if args.enable_pd_csi_driver:
+    update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
+        args
+    )
+    if update_cluster_command_code != 0:
+      xpk_exit(update_cluster_command_code)
+
+  if args.enable_lustre_csi_driver:
+    update_cluster_command_code = (
+        update_cluster_with_lustre_driver_if_necessary(args)
+    )
+    if update_cluster_command_code != 0:
+      xpk_exit(update_cluster_command_code)
+
+
+def install_kjob(args):
+  xpk_print('Verifying kjob installation')
+  err_code = verify_kjob_installed(args)
+  if err_code > 0:
+    xpk_exit(err_code)
+
+  xpk_print('Applying kjob CDRs')
+  err_code = apply_kjob_crds(args)
+  if err_code > 0:
+    xpk_exit(err_code)
+
+  err_code = prepare_kjob(args)
+  if err_code > 0:
+    xpk_exit(err_code)
+
+
+def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
+  xpk_print('Enabling Kueue on the cluster')
+  install_kueue_on_cluster_code = install_kueue_on_cluster(args)
+  if install_kueue_on_cluster_code != 0:
+    xpk_exit(install_kueue_on_cluster_code)
+
+  xpk_print('Wait for Kueue to be fully available')
+  wait_for_kueue_available_code = wait_for_kueue_available(args)
+  if wait_for_kueue_available_code != 0:
+    xpk_exit(wait_for_kueue_available_code)
+
+  xpk_print('Install Kueue Custom Resources')
+  enable_kueue_credentials_code = install_kueue_crs(
+      args, system, autoprovisioning_config
+  )
+  if enable_kueue_credentials_code != 0:
+    xpk_exit(enable_kueue_credentials_code)
+
+  xpk_print('Update Kueue Controller Manager resources')
+  update_kueue_resources_code = update_kueue_resources_if_necessary(args)
+  if update_kueue_resources_code != 0:
+    xpk_exit(update_kueue_resources_code)
+
+
+def prepare_gpus(args, system: SystemCharacteristics):
+  xpk_print('Installing NCCL Plugin for cluster')
+  install_nccl_code = install_nccl_on_cluster(args, system)
+  if install_nccl_code != 0:
+    xpk_exit(install_nccl_code)
+
+  if system.device_type == H100_DEVICE_TYPE:
+    xpk_print('Installing NRI device injector for cluster')
+    install_nri_code = install_nri_on_cluster(args)
+    if install_nri_code != 0:
+      xpk_exit(install_nri_code)
+
+  if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
+    xpk_print('Disabling MGLRU')
+    err_code = disable_mglru_on_cluster(args)
+    if err_code > 0:
+      xpk_exit(err_code)
```
xpk/commands/cluster_gcluster.py
CHANGED
```diff
@@ -37,6 +37,7 @@ from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import ensure_directory_exists
 from ..utils.network import all_IPs_cidr
 from ..utils.objects import hash_string
+from ..core.capacity import get_reservation_maintenance_interval, get_reservation_placement_policy

 blueprints_path = os.path.abspath('xpkclusters/blueprints')
 gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
@@ -234,6 +235,30 @@ def generate_blueprint(
   if args.device_type in supported_device_types:
     if args.device_type == a3mega_device_type:
       num_nodes = args.num_nodes if not args.num_nodes is None else 2
+
+      maintenance_interval = (
+          get_reservation_maintenance_interval(
+              args.reservation, args.zone, args.project
+          )
+          if args.reservation is not None
+          else 'PERIODIC'
+      )
+      placement_policy_name = (
+          get_reservation_placement_policy(
+              args.reservation, args.zone, args.project
+          )
+          if args.reservation is not None
+          else None
+      )
+      placement_policy = (
+          {
+              'type': 'COMPACT',
+              'name': placement_policy_name.split('/')[-1],
+          }
+          if placement_policy_name is not None
+          and len(placement_policy_name) > 0
+          else None
+      )
       return bpg.generate_a3_mega_blueprint(
           blueprint_name=blueprint_name,
           prefix=prefix,
@@ -243,6 +268,8 @@ def generate_blueprint(
           zone=args.zone,
           auth_cidr=all_IPs_cidr,
           num_nodes=num_nodes,
+          reservation_maintenance_interval=maintenance_interval,
+          reservation_placement_policy=placement_policy,
           reservation=args.reservation if args.reservation else None,
           capacity_type=capacity_type,
           system_node_pool_machine_type=args.default_pool_cpu_machine_type,
```
xpk/commands/common.py
CHANGED
```diff
@@ -15,8 +15,12 @@ limitations under the License.
 """

 from ..core.commands import run_command_with_updates_retry
+from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
 from ..core.gcloud_context import zone_to_region
-from ..utils.console import xpk_print
+from ..utils.console import xpk_print, xpk_exit
+from ..core.system_characteristics import (
+    SystemCharacteristics,
+)


 def set_cluster_command(args) -> int:
@@ -31,6 +35,7 @@ def set_cluster_command(args) -> int:
   command = (
       'gcloud container clusters get-credentials'
       f' {args.cluster} --region={zone_to_region(args.zone)}'
+      ' --dns-endpoint'
       f' --project={args.project} &&'
       ' kubectl config view && kubectl config set-context --current'
       ' --namespace=default'
@@ -42,3 +47,37 @@ def set_cluster_command(args) -> int:
   if return_code != 0:
     xpk_print(f'{task} returned ERROR {return_code}')
   return return_code
+
+
+def is_TAS_possible(
+    system_characteristics: SystemCharacteristics,
+    capacity_type: CapacityType,
+    flex: bool,
+) -> bool:
+  """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    True if possible and False otherwise.
+  """
+
+  if system_characteristics is None:
+    xpk_print('system_characteristics data was not found in configmaps.')
+    xpk_exit(1)
+
+  if capacity_type is None:
+    xpk_print('capacity_type data was not found in configmaps.')
+    xpk_exit(1)
+
+  if flex:
+    return False
+
+  if (
+      system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
+      and capacity_type != CapacityType.RESERVATION
+  ):
+    return False
+
+  return True
```
xpk/commands/kjob_common.py
CHANGED
```diff
@@ -24,7 +24,10 @@ from ..core.kjob import (
     get_a3mega_pod_template_annotations,
     get_a3ultra_pod_template_annotations,
     get_a4_pod_template_annotations,
+    Kueue_TAS_annotation,
 )
+from .common import is_TAS_possible
+from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics


 def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
@@ -35,7 +38,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
   elif gpu_type == H200_DEVICE_TYPE:
     annotations = get_a3ultra_pod_template_annotations(args)
   elif gpu_type == B200_DEVICE_TYPE:
-    annotations = get_a4_pod_template_annotations()
+    annotations = get_a4_pod_template_annotations(args)
   else:
     annotations = []

@@ -45,3 +48,12 @@
   cmd += "\\\n".join(flags)

   return cmd
+
+
+def add_TAS_annotations_to_command(args, cmd: str) -> str:
+  system_characteristics = get_cluster_system_characteristics(args)
+  capacity_type = get_cluster_capacity_type(args)
+  if is_TAS_possible(system_characteristics, capacity_type, flex=False):
+    cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
+
+  return cmd
```
xpk/commands/run.py
CHANGED
```diff
@@ -17,7 +17,7 @@ limitations under the License.
 from argparse import Namespace

 from ..core.cluster import (
-
+    setup_k8s_service_accounts,
     get_cluster_credentials,
 )
 from ..core.commands import run_command_with_full_controls
@@ -25,14 +25,13 @@ from ..core.gcloud_context import add_zone_and_project
 from ..core.kjob import (
     AppProfileDefaults,
     JobTemplateDefaults,
-    Kueue_TAS_annotation,
     get_storage_annotations,
     prepare_kjob,
 )
 from ..core.kueue import LOCAL_QUEUE_NAME
 from ..utils.console import xpk_exit, xpk_print
 from .kind import set_local_cluster_command
-from .kjob_common import add_gpu_networking_annotations_to_command
+from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command


 def run(args: Namespace) -> None:
@@ -54,7 +53,7 @@ def run(args: Namespace) -> None:
   err_code = prepare_kjob(args)
   if err_code > 0:
     xpk_exit(err_code)
-
+  setup_k8s_service_accounts()

   submit_job(args)

@@ -64,12 +63,12 @@ def submit_job(args: Namespace) -> None:
       'kubectl kjob create slurm --profile'
       f' {AppProfileDefaults.NAME.value} '
       f' --localqueue {LOCAL_QUEUE_NAME} '
-      f" --pod-template-annotation '{Kueue_TAS_annotation}'"
       f' --stream-container {JobTemplateDefaults.CONTAINER_NAME.value}'
       f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
       ' --wait --rm --first-node-ip'
   )
   cmd = add_gpu_networking_annotations_to_command(args, cmd)
+  cmd = add_TAS_annotations_to_command(args, cmd)

   for annotation in get_storage_annotations(args):
     cmd += f' --pod-template-annotation {annotation}'
```
xpk/commands/shell.py
CHANGED
```diff
@@ -12,7 +12,7 @@ limitations under the License.
 """

 from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
-from ..core.cluster import get_cluster_credentials, add_zone_and_project,
+from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
 from ..utils.console import xpk_exit, xpk_print
 from argparse import Namespace

@@ -82,7 +82,7 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
   err_code = prepare_kjob(args)
   if err_code > 0:
     xpk_exit(err_code)
-
+  setup_k8s_service_accounts()

   cmd = (
       'kubectl-kjob create interactive --profile'
```