xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +2 -3
- xpk/commands/cluster.py +225 -73
- xpk/commands/common.py +33 -1
- xpk/commands/kjob_common.py +10 -1
- xpk/commands/run.py +2 -3
- xpk/commands/storage.py +14 -3
- xpk/commands/workload.py +17 -15
- xpk/core/blueprint/blueprint_generator.py +18 -18
- xpk/core/cluster.py +119 -8
- xpk/core/config.py +1 -1
- xpk/core/filestore.py +2 -6
- xpk/core/gcsfuse.py +22 -4
- xpk/core/kjob.py +20 -13
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/network.py +23 -1
- xpk/core/pathways.py +1 -1
- xpk/core/resources.py +21 -0
- xpk/core/workload.py +1 -1
- xpk/core/workload_decorators/rdma_decorator.py +6 -10
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +15 -14
- xpk/parser/cluster.py +573 -389
- xpk/parser/storage.py +11 -2
- xpk/utils/kubectl.py +4 -1
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/METADATA +134 -91
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/RECORD +31 -29
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/commands/batch.py
CHANGED
|
@@ -26,14 +26,13 @@ from ..core.gcloud_context import add_zone_and_project
|
|
|
26
26
|
from ..core.kjob import (
|
|
27
27
|
AppProfileDefaults,
|
|
28
28
|
JobTemplateDefaults,
|
|
29
|
-
Kueue_TAS_annotation,
|
|
30
29
|
get_storage_annotations,
|
|
31
30
|
prepare_kjob,
|
|
32
31
|
)
|
|
33
32
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
34
33
|
from ..utils.console import xpk_exit, xpk_print
|
|
35
34
|
from .kind import set_local_cluster_command
|
|
36
|
-
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
35
|
+
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
37
36
|
|
|
38
37
|
|
|
39
38
|
def batch(args: Namespace) -> None:
|
|
@@ -68,11 +67,11 @@ def submit_job(args: Namespace) -> None:
|
|
|
68
67
|
'kubectl kjob create slurm'
|
|
69
68
|
f' --profile {AppProfileDefaults.NAME.value}'
|
|
70
69
|
f' --localqueue {LOCAL_QUEUE_NAME}'
|
|
71
|
-
f' --pod-template-annotation {Kueue_TAS_annotation}'
|
|
72
70
|
f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
|
|
73
71
|
' --first-node-ip'
|
|
74
72
|
)
|
|
75
73
|
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
74
|
+
cmd = add_TAS_annotations_to_command(args, cmd)
|
|
76
75
|
|
|
77
76
|
for annotation in get_storage_annotations(args):
|
|
78
77
|
cmd += f' --pod-template-annotation {annotation}'
|
xpk/commands/cluster.py
CHANGED
|
@@ -16,19 +16,22 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from tabulate import tabulate
|
|
18
18
|
|
|
19
|
-
from ..core.capacity import H100_DEVICE_TYPE
|
|
19
|
+
from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
|
|
20
20
|
from ..core.cluster import (
|
|
21
21
|
get_all_clusters_programmatic,
|
|
22
22
|
get_cluster_credentials,
|
|
23
23
|
install_nccl_on_cluster,
|
|
24
|
+
install_nri_on_cluster,
|
|
24
25
|
set_jobset_on_cluster,
|
|
25
26
|
set_pathways_job_on_cluster,
|
|
26
27
|
setup_k8s_env,
|
|
27
|
-
|
|
28
|
-
|
|
28
|
+
disable_mglru_on_cluster,
|
|
29
|
+
count_nodes_on_cluster,
|
|
29
30
|
update_cluster_with_gcpfilestore_driver_if_necessary,
|
|
31
|
+
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
30
32
|
update_cluster_with_parallelstore_driver_if_necessary,
|
|
31
33
|
update_cluster_with_pd_driver_if_necessary,
|
|
34
|
+
update_cluster_with_workload_identity_if_necessary,
|
|
32
35
|
)
|
|
33
36
|
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
|
|
34
37
|
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
@@ -52,8 +55,12 @@ from ..core.network import (
|
|
|
52
55
|
delete_cluster_subnets,
|
|
53
56
|
set_up_cluster_network_for_a3,
|
|
54
57
|
)
|
|
55
|
-
from ..core.nodepool import
|
|
58
|
+
from ..core.nodepool import (
|
|
59
|
+
get_gke_node_pool_version,
|
|
60
|
+
run_gke_node_pool_create_command,
|
|
61
|
+
)
|
|
56
62
|
from ..core.ray import install_ray_cluster
|
|
63
|
+
from ..core.mtc import install_mtc_on_cluster
|
|
57
64
|
from ..core.resources import create_cluster_configmaps
|
|
58
65
|
from ..core.storage import install_storage_crd
|
|
59
66
|
from ..core.system_characteristics import (
|
|
@@ -70,14 +77,123 @@ from . import cluster_gcluster
|
|
|
70
77
|
from .common import set_cluster_command
|
|
71
78
|
|
|
72
79
|
|
|
80
|
+
def cluster_adapt(args) -> None:
|
|
81
|
+
"""Function that performs cluster adaptation.
|
|
82
|
+
|
|
83
|
+
Args:
|
|
84
|
+
args: user provided arguments for running the command.
|
|
85
|
+
"""
|
|
86
|
+
args.enable_pathways = False
|
|
87
|
+
|
|
88
|
+
system, return_code = get_system_characteristics(args)
|
|
89
|
+
|
|
90
|
+
if return_code > 0:
|
|
91
|
+
xpk_print('Fetching system characteristics failed!')
|
|
92
|
+
xpk_exit(return_code)
|
|
93
|
+
|
|
94
|
+
xpk_print(
|
|
95
|
+
f'Starting cluster adaptation for cluster {args.cluster}:', flush=True
|
|
96
|
+
)
|
|
97
|
+
add_zone_and_project(args)
|
|
98
|
+
|
|
99
|
+
if system.accelerator_type == AcceleratorType['GPU'] and not getattr(
|
|
100
|
+
args, 'num_nodes'
|
|
101
|
+
):
|
|
102
|
+
xpk_print(
|
|
103
|
+
'Argument --num-nodes was not provided, trying to determine number of'
|
|
104
|
+
' nodes based on the available nodes in the cluster...'
|
|
105
|
+
)
|
|
106
|
+
args.num_nodes = count_nodes_on_cluster(args, system)
|
|
107
|
+
if args.num_nodes == 0:
|
|
108
|
+
xpk_print(
|
|
109
|
+
'Found unexpected number of nodes. Is the --device-type correct?'
|
|
110
|
+
)
|
|
111
|
+
xpk_exit(1)
|
|
112
|
+
else:
|
|
113
|
+
xpk_print(f'Using {args.num_nodes} nodes.')
|
|
114
|
+
|
|
115
|
+
# ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
|
|
116
|
+
# Enable WorkloadIdentity if not enabled already.
|
|
117
|
+
if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
|
|
118
|
+
update_cluster_command_code = (
|
|
119
|
+
update_cluster_with_workload_identity_if_necessary(args)
|
|
120
|
+
)
|
|
121
|
+
if update_cluster_command_code != 0:
|
|
122
|
+
xpk_exit(update_cluster_command_code)
|
|
123
|
+
|
|
124
|
+
get_cluster_credentials(args)
|
|
125
|
+
|
|
126
|
+
k8s_client = setup_k8s_env(args)
|
|
127
|
+
|
|
128
|
+
install_storage_crd(k8s_client)
|
|
129
|
+
install_storage_csis(args)
|
|
130
|
+
|
|
131
|
+
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
132
|
+
tensorboard_config = {}
|
|
133
|
+
if VERTEX_TENSORBOARD_FEATURE_FLAG and args.create_vertex_tensorboard:
|
|
134
|
+
tensorboard_config = create_vertex_tensorboard(args)
|
|
135
|
+
# exit if failed to create Tensorboard in Vertex AI
|
|
136
|
+
if not tensorboard_config:
|
|
137
|
+
xpk_exit(1)
|
|
138
|
+
|
|
139
|
+
# Provision node pools dynamically based on incoming workloads:
|
|
140
|
+
# Currently autoprovisioning is not supported with Pathways.
|
|
141
|
+
autoprovisioning_config = None
|
|
142
|
+
if args.enable_autoprovisioning:
|
|
143
|
+
xpk_print('Enabling Autoprovisioning')
|
|
144
|
+
autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
|
|
145
|
+
args, system
|
|
146
|
+
)
|
|
147
|
+
if return_code != 0:
|
|
148
|
+
xpk_exit(return_code)
|
|
149
|
+
|
|
150
|
+
xpk_print('Creating ConfigMap for cluster')
|
|
151
|
+
create_cluster_configmaps_code = create_cluster_configmaps(
|
|
152
|
+
args, system, tensorboard_config, autoprovisioning_config
|
|
153
|
+
)
|
|
154
|
+
if create_cluster_configmaps_code != 0:
|
|
155
|
+
xpk_exit(create_cluster_configmaps_code)
|
|
156
|
+
|
|
157
|
+
xpk_print(
|
|
158
|
+
'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
|
|
159
|
+
' globally available'
|
|
160
|
+
)
|
|
161
|
+
set_jobset_on_cluster_code = set_jobset_on_cluster(args)
|
|
162
|
+
if set_jobset_on_cluster_code != 0:
|
|
163
|
+
xpk_exit(set_jobset_on_cluster_code)
|
|
164
|
+
|
|
165
|
+
# TODO: Uncomment when cluster_adapt will support TPU cluters
|
|
166
|
+
# set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
|
|
167
|
+
# if set_pathways_job_on_cluster_code != 0:
|
|
168
|
+
# xpk_exit(set_pathways_job_on_cluster_code)
|
|
169
|
+
|
|
170
|
+
install_kueue(args, system, autoprovisioning_config)
|
|
171
|
+
|
|
172
|
+
install_kjob(args)
|
|
173
|
+
|
|
174
|
+
if system.accelerator_type == AcceleratorType['GPU']:
|
|
175
|
+
prepare_gpus(args, system)
|
|
176
|
+
|
|
177
|
+
if args.enable_ray_cluster:
|
|
178
|
+
return_code = install_ray_cluster(args, system)
|
|
179
|
+
if return_code != 0:
|
|
180
|
+
xpk_print('Installation of RayCluster failed.')
|
|
181
|
+
xpk_exit(return_code)
|
|
182
|
+
|
|
183
|
+
xpk_print('GKE commands done! Resources are created.')
|
|
184
|
+
xpk_print(
|
|
185
|
+
'See your GKE Cluster here:'
|
|
186
|
+
# pylint: disable=line-too-long
|
|
187
|
+
f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
|
|
188
|
+
)
|
|
189
|
+
xpk_exit(0)
|
|
190
|
+
|
|
191
|
+
|
|
73
192
|
def cluster_create(args) -> None:
|
|
74
193
|
"""Function around cluster creation.
|
|
75
194
|
|
|
76
195
|
Args:
|
|
77
196
|
args: user provided arguments for running the command.
|
|
78
|
-
|
|
79
|
-
Returns:
|
|
80
|
-
0 if successful and 1 otherwise.
|
|
81
197
|
"""
|
|
82
198
|
system, return_code = get_system_characteristics(args)
|
|
83
199
|
|
|
@@ -127,38 +243,12 @@ def cluster_create(args) -> None:
|
|
|
127
243
|
if update_cluster_command_code != 0:
|
|
128
244
|
xpk_exit(update_cluster_command_code)
|
|
129
245
|
|
|
130
|
-
|
|
131
|
-
if args.enable_gcsfuse_csi_driver:
|
|
132
|
-
update_cluster_command_code = (
|
|
133
|
-
update_cluster_with_gcsfuse_driver_if_necessary(args)
|
|
134
|
-
)
|
|
135
|
-
if update_cluster_command_code != 0:
|
|
136
|
-
xpk_exit(update_cluster_command_code)
|
|
137
|
-
|
|
138
|
-
if args.enable_gcpfilestore_csi_driver:
|
|
139
|
-
update_cluster_command_code = (
|
|
140
|
-
update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
141
|
-
)
|
|
142
|
-
if update_cluster_command_code != 0:
|
|
143
|
-
xpk_exit(update_cluster_command_code)
|
|
144
|
-
|
|
145
|
-
if args.enable_parallelstore_csi_driver:
|
|
146
|
-
update_cluster_command_code = (
|
|
147
|
-
update_cluster_with_parallelstore_driver_if_necessary(args)
|
|
148
|
-
)
|
|
149
|
-
if update_cluster_command_code != 0:
|
|
150
|
-
xpk_exit(update_cluster_command_code)
|
|
151
|
-
|
|
152
|
-
if args.enable_pd_csi_driver:
|
|
153
|
-
update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
|
|
154
|
-
args
|
|
155
|
-
)
|
|
156
|
-
if update_cluster_command_code != 0:
|
|
157
|
-
xpk_exit(update_cluster_command_code)
|
|
246
|
+
get_cluster_credentials(args)
|
|
158
247
|
|
|
159
|
-
|
|
248
|
+
k8s_client = setup_k8s_env(args)
|
|
160
249
|
|
|
161
|
-
|
|
250
|
+
install_storage_crd(k8s_client)
|
|
251
|
+
install_storage_csis(args)
|
|
162
252
|
|
|
163
253
|
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
164
254
|
tensorboard_config = {}
|
|
@@ -223,45 +313,12 @@ def cluster_create(args) -> None:
|
|
|
223
313
|
if set_pathways_job_on_cluster_code != 0:
|
|
224
314
|
xpk_exit(set_pathways_job_on_cluster_code)
|
|
225
315
|
|
|
226
|
-
|
|
227
|
-
install_kueue_on_cluster_code = install_kueue_on_cluster(args)
|
|
228
|
-
if install_kueue_on_cluster_code != 0:
|
|
229
|
-
xpk_exit(install_kueue_on_cluster_code)
|
|
316
|
+
install_kueue(args, system, autoprovisioning_config)
|
|
230
317
|
|
|
231
|
-
|
|
232
|
-
err_code = verify_kjob_installed(args)
|
|
233
|
-
if err_code > 0:
|
|
234
|
-
xpk_exit(err_code)
|
|
235
|
-
|
|
236
|
-
xpk_print('Applying kjob CDRs')
|
|
237
|
-
err_code = apply_kjob_crds(args)
|
|
238
|
-
if err_code > 0:
|
|
239
|
-
xpk_exit(err_code)
|
|
240
|
-
|
|
241
|
-
err_code = prepare_kjob(args)
|
|
242
|
-
if err_code > 0:
|
|
243
|
-
xpk_exit(err_code)
|
|
244
|
-
|
|
245
|
-
k8s_client = setup_k8s_env(args)
|
|
246
|
-
install_storage_crd(k8s_client)
|
|
247
|
-
|
|
248
|
-
xpk_print('Wait for Kueue to be fully available')
|
|
249
|
-
wait_for_kueue_available_code = wait_for_kueue_available(args)
|
|
250
|
-
if wait_for_kueue_available_code != 0:
|
|
251
|
-
xpk_exit(wait_for_kueue_available_code)
|
|
252
|
-
|
|
253
|
-
xpk_print('Install Kueue Custom Resources')
|
|
254
|
-
enable_kueue_credentials_code = install_kueue_crs(
|
|
255
|
-
args, system, autoprovisioning_config
|
|
256
|
-
)
|
|
257
|
-
if enable_kueue_credentials_code != 0:
|
|
258
|
-
xpk_exit(enable_kueue_credentials_code)
|
|
318
|
+
install_kjob(args)
|
|
259
319
|
|
|
260
320
|
if system.accelerator_type == AcceleratorType['GPU']:
|
|
261
|
-
|
|
262
|
-
install_nccl_code = install_nccl_on_cluster(args, system)
|
|
263
|
-
if install_nccl_code != 0:
|
|
264
|
-
xpk_exit(install_nccl_code)
|
|
321
|
+
prepare_gpus(args, system)
|
|
265
322
|
|
|
266
323
|
if args.enable_ray_cluster:
|
|
267
324
|
return_code = install_ray_cluster(args, system)
|
|
@@ -269,6 +326,12 @@ def cluster_create(args) -> None:
|
|
|
269
326
|
xpk_print('Installation of RayCluster failed.')
|
|
270
327
|
xpk_exit(return_code)
|
|
271
328
|
|
|
329
|
+
if hasattr(args, 'enable_mtc') and args.enable_mtc:
|
|
330
|
+
return_code = install_mtc_on_cluster(args, system)
|
|
331
|
+
if return_code != 0:
|
|
332
|
+
xpk_print('Installation of MTC failed.')
|
|
333
|
+
xpk_exit(return_code)
|
|
334
|
+
|
|
272
335
|
xpk_print('GKE commands done! Resources are created.')
|
|
273
336
|
xpk_print(
|
|
274
337
|
'See your GKE Cluster here:'
|
|
@@ -773,6 +836,7 @@ def run_gke_cluster_create_command(
|
|
|
773
836
|
f' --num-nodes {args.default_pool_cpu_num_nodes}'
|
|
774
837
|
f' {args.custom_cluster_arguments}'
|
|
775
838
|
f' {rapid_release_cmd}'
|
|
839
|
+
' --enable-dns-access'
|
|
776
840
|
)
|
|
777
841
|
|
|
778
842
|
enable_ip_alias = False
|
|
@@ -805,6 +869,7 @@ def run_gke_cluster_create_command(
|
|
|
805
869
|
addons = []
|
|
806
870
|
if args.enable_gcsfuse_csi_driver:
|
|
807
871
|
addons.append('GcsFuseCsiDriver')
|
|
872
|
+
|
|
808
873
|
if args.enable_gcpfilestore_csi_driver:
|
|
809
874
|
addons.append('GcpFilestoreCsiDriver')
|
|
810
875
|
|
|
@@ -814,6 +879,9 @@ def run_gke_cluster_create_command(
|
|
|
814
879
|
if args.enable_pd_csi_driver:
|
|
815
880
|
addons.append('GcePersistentDiskCsiDriver')
|
|
816
881
|
|
|
882
|
+
if hasattr(args, 'enable_mtc') and args.enable_mtc:
|
|
883
|
+
addons.append('HighScaleCheckpointing')
|
|
884
|
+
|
|
817
885
|
if len(addons) > 0:
|
|
818
886
|
addons_str = ','.join(addons)
|
|
819
887
|
command += f' --addons={addons_str}'
|
|
@@ -823,3 +891,87 @@ def run_gke_cluster_create_command(
|
|
|
823
891
|
xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
|
|
824
892
|
return 1
|
|
825
893
|
return 0
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
def install_storage_csis(args):
|
|
897
|
+
if args.enable_gcsfuse_csi_driver:
|
|
898
|
+
update_cluster_command_code = (
|
|
899
|
+
update_cluster_with_gcsfuse_driver_if_necessary(args)
|
|
900
|
+
)
|
|
901
|
+
if update_cluster_command_code != 0:
|
|
902
|
+
xpk_exit(update_cluster_command_code)
|
|
903
|
+
|
|
904
|
+
if args.enable_gcpfilestore_csi_driver:
|
|
905
|
+
update_cluster_command_code = (
|
|
906
|
+
update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
907
|
+
)
|
|
908
|
+
if update_cluster_command_code != 0:
|
|
909
|
+
xpk_exit(update_cluster_command_code)
|
|
910
|
+
|
|
911
|
+
if args.enable_parallelstore_csi_driver:
|
|
912
|
+
update_cluster_command_code = (
|
|
913
|
+
update_cluster_with_parallelstore_driver_if_necessary(args)
|
|
914
|
+
)
|
|
915
|
+
if update_cluster_command_code != 0:
|
|
916
|
+
xpk_exit(update_cluster_command_code)
|
|
917
|
+
|
|
918
|
+
if args.enable_pd_csi_driver:
|
|
919
|
+
update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
|
|
920
|
+
args
|
|
921
|
+
)
|
|
922
|
+
if update_cluster_command_code != 0:
|
|
923
|
+
xpk_exit(update_cluster_command_code)
|
|
924
|
+
|
|
925
|
+
|
|
926
|
+
def install_kjob(args):
|
|
927
|
+
xpk_print('Verifying kjob installation')
|
|
928
|
+
err_code = verify_kjob_installed(args)
|
|
929
|
+
if err_code > 0:
|
|
930
|
+
xpk_exit(err_code)
|
|
931
|
+
|
|
932
|
+
xpk_print('Applying kjob CDRs')
|
|
933
|
+
err_code = apply_kjob_crds(args)
|
|
934
|
+
if err_code > 0:
|
|
935
|
+
xpk_exit(err_code)
|
|
936
|
+
|
|
937
|
+
err_code = prepare_kjob(args)
|
|
938
|
+
if err_code > 0:
|
|
939
|
+
xpk_exit(err_code)
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
|
|
943
|
+
xpk_print('Enabling Kueue on the cluster')
|
|
944
|
+
install_kueue_on_cluster_code = install_kueue_on_cluster(args)
|
|
945
|
+
if install_kueue_on_cluster_code != 0:
|
|
946
|
+
xpk_exit(install_kueue_on_cluster_code)
|
|
947
|
+
|
|
948
|
+
xpk_print('Wait for Kueue to be fully available')
|
|
949
|
+
wait_for_kueue_available_code = wait_for_kueue_available(args)
|
|
950
|
+
if wait_for_kueue_available_code != 0:
|
|
951
|
+
xpk_exit(wait_for_kueue_available_code)
|
|
952
|
+
|
|
953
|
+
xpk_print('Install Kueue Custom Resources')
|
|
954
|
+
enable_kueue_credentials_code = install_kueue_crs(
|
|
955
|
+
args, system, autoprovisioning_config
|
|
956
|
+
)
|
|
957
|
+
if enable_kueue_credentials_code != 0:
|
|
958
|
+
xpk_exit(enable_kueue_credentials_code)
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
def prepare_gpus(args, system: SystemCharacteristics):
|
|
962
|
+
xpk_print('Installing NCCL Plugin for cluster')
|
|
963
|
+
install_nccl_code = install_nccl_on_cluster(args, system)
|
|
964
|
+
if install_nccl_code != 0:
|
|
965
|
+
xpk_exit(install_nccl_code)
|
|
966
|
+
|
|
967
|
+
if system.device_type == H100_DEVICE_TYPE:
|
|
968
|
+
xpk_print('Installing NRI device injector for cluster')
|
|
969
|
+
install_nri_code = install_nri_on_cluster(args)
|
|
970
|
+
if install_nri_code != 0:
|
|
971
|
+
xpk_exit(install_nri_code)
|
|
972
|
+
|
|
973
|
+
if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
|
|
974
|
+
xpk_print('Disabling MGLRU')
|
|
975
|
+
err_code = disable_mglru_on_cluster(args)
|
|
976
|
+
if err_code > 0:
|
|
977
|
+
xpk_exit(err_code)
|
xpk/commands/common.py
CHANGED
|
@@ -15,8 +15,10 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from ..core.commands import run_command_with_updates_retry
|
|
18
|
+
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
19
|
+
from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
|
|
18
20
|
from ..core.gcloud_context import zone_to_region
|
|
19
|
-
from ..utils.console import xpk_print
|
|
21
|
+
from ..utils.console import xpk_print, xpk_exit
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
def set_cluster_command(args) -> int:
|
|
@@ -31,6 +33,7 @@ def set_cluster_command(args) -> int:
|
|
|
31
33
|
command = (
|
|
32
34
|
'gcloud container clusters get-credentials'
|
|
33
35
|
f' {args.cluster} --region={zone_to_region(args.zone)}'
|
|
36
|
+
' --dns-endpoint'
|
|
34
37
|
f' --project={args.project} &&'
|
|
35
38
|
' kubectl config view && kubectl config set-context --current'
|
|
36
39
|
' --namespace=default'
|
|
@@ -42,3 +45,32 @@ def set_cluster_command(args) -> int:
|
|
|
42
45
|
if return_code != 0:
|
|
43
46
|
xpk_print(f'{task} returned ERROR {return_code}')
|
|
44
47
|
return return_code
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def is_TAS_possible(args) -> bool:
|
|
51
|
+
"""Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
args: user provided arguments for running the command.
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
True if possible and False otherwise.
|
|
58
|
+
"""
|
|
59
|
+
system_characteristics = get_cluster_system_characteristics(args)
|
|
60
|
+
capacity_type = get_cluster_capacity_type(args)
|
|
61
|
+
|
|
62
|
+
if system_characteristics is None:
|
|
63
|
+
xpk_print('system_characteristics data was not found in configmaps.')
|
|
64
|
+
xpk_exit(1)
|
|
65
|
+
|
|
66
|
+
if capacity_type is None:
|
|
67
|
+
xpk_print('capacity_type data was not found in configmaps.')
|
|
68
|
+
xpk_exit(1)
|
|
69
|
+
|
|
70
|
+
if (
|
|
71
|
+
system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
|
|
72
|
+
and capacity_type == CapacityType.SPOT
|
|
73
|
+
):
|
|
74
|
+
return False
|
|
75
|
+
|
|
76
|
+
return True
|
xpk/commands/kjob_common.py
CHANGED
|
@@ -24,7 +24,9 @@ from ..core.kjob import (
|
|
|
24
24
|
get_a3mega_pod_template_annotations,
|
|
25
25
|
get_a3ultra_pod_template_annotations,
|
|
26
26
|
get_a4_pod_template_annotations,
|
|
27
|
+
Kueue_TAS_annotation,
|
|
27
28
|
)
|
|
29
|
+
from .common import is_TAS_possible
|
|
28
30
|
|
|
29
31
|
|
|
30
32
|
def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
@@ -35,7 +37,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
|
35
37
|
elif gpu_type == H200_DEVICE_TYPE:
|
|
36
38
|
annotations = get_a3ultra_pod_template_annotations(args)
|
|
37
39
|
elif gpu_type == B200_DEVICE_TYPE:
|
|
38
|
-
annotations = get_a4_pod_template_annotations()
|
|
40
|
+
annotations = get_a4_pod_template_annotations(args)
|
|
39
41
|
else:
|
|
40
42
|
annotations = []
|
|
41
43
|
|
|
@@ -45,3 +47,10 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
|
45
47
|
cmd += "\\\n".join(flags)
|
|
46
48
|
|
|
47
49
|
return cmd
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def add_TAS_annotations_to_command(args, cmd: str) -> str:
|
|
53
|
+
if is_TAS_possible(args):
|
|
54
|
+
cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
|
|
55
|
+
|
|
56
|
+
return cmd
|
xpk/commands/run.py
CHANGED
|
@@ -25,14 +25,13 @@ from ..core.gcloud_context import add_zone_and_project
|
|
|
25
25
|
from ..core.kjob import (
|
|
26
26
|
AppProfileDefaults,
|
|
27
27
|
JobTemplateDefaults,
|
|
28
|
-
Kueue_TAS_annotation,
|
|
29
28
|
get_storage_annotations,
|
|
30
29
|
prepare_kjob,
|
|
31
30
|
)
|
|
32
31
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
33
32
|
from ..utils.console import xpk_exit, xpk_print
|
|
34
33
|
from .kind import set_local_cluster_command
|
|
35
|
-
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
34
|
+
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
36
35
|
|
|
37
36
|
|
|
38
37
|
def run(args: Namespace) -> None:
|
|
@@ -64,12 +63,12 @@ def submit_job(args: Namespace) -> None:
|
|
|
64
63
|
'kubectl kjob create slurm --profile'
|
|
65
64
|
f' {AppProfileDefaults.NAME.value} '
|
|
66
65
|
f' --localqueue {LOCAL_QUEUE_NAME} '
|
|
67
|
-
f" --pod-template-annotation '{Kueue_TAS_annotation}'"
|
|
68
66
|
f' --stream-container {JobTemplateDefaults.CONTAINER_NAME.value}'
|
|
69
67
|
f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
|
|
70
68
|
' --wait --rm --first-node-ip'
|
|
71
69
|
)
|
|
72
70
|
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
71
|
+
cmd = add_TAS_annotations_to_command(args, cmd)
|
|
73
72
|
|
|
74
73
|
for annotation in get_storage_annotations(args):
|
|
75
74
|
cmd += f' --pod-template-annotation {annotation}'
|
xpk/commands/storage.py
CHANGED
|
@@ -86,7 +86,6 @@ def storage_create(args: Namespace) -> None:
|
|
|
86
86
|
args.vol,
|
|
87
87
|
args.access_mode,
|
|
88
88
|
filestore_network,
|
|
89
|
-
args.mount_options,
|
|
90
89
|
)
|
|
91
90
|
|
|
92
91
|
k8s_api_client = setup_k8s_env(args)
|
|
@@ -162,7 +161,6 @@ def storage_attach(args: Namespace) -> None:
|
|
|
162
161
|
args.vol,
|
|
163
162
|
args.access_mode,
|
|
164
163
|
filestore_network,
|
|
165
|
-
args.mount_options,
|
|
166
164
|
)
|
|
167
165
|
|
|
168
166
|
elif args.type == GCS_FUSE_TYPE:
|
|
@@ -178,7 +176,11 @@ def storage_attach(args: Namespace) -> None:
|
|
|
178
176
|
manifest = list(yaml.safe_load_all(f))
|
|
179
177
|
else:
|
|
180
178
|
manifest = gcsfuse.manifest(
|
|
181
|
-
args.name,
|
|
179
|
+
args.name,
|
|
180
|
+
args.bucket,
|
|
181
|
+
args.size,
|
|
182
|
+
args.mount_options,
|
|
183
|
+
args.prefetch_metadata,
|
|
182
184
|
)
|
|
183
185
|
|
|
184
186
|
elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE]:
|
|
@@ -323,3 +325,12 @@ def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
|
|
|
323
325
|
storage.name,
|
|
324
326
|
"Storage",
|
|
325
327
|
)
|
|
328
|
+
|
|
329
|
+
# remove kubernetes.io/pvc-protection
|
|
330
|
+
delete_resource(
|
|
331
|
+
lambda name: core_api.patch_namespaced_persistent_volume_claim(
|
|
332
|
+
name, "default", {"metadata": {"finalizers": None}}
|
|
333
|
+
),
|
|
334
|
+
storage.pvc,
|
|
335
|
+
"Persistent Volume Claim finalizers",
|
|
336
|
+
)
|
xpk/commands/workload.py
CHANGED
|
@@ -14,11 +14,6 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from ..core.blueprint.blueprint_generator import (
|
|
18
|
-
get_subnetworks_for_a3mega,
|
|
19
|
-
get_subnetworks_for_a3ultra,
|
|
20
|
-
get_subnetworks_for_a4,
|
|
21
|
-
)
|
|
22
17
|
from ..core.cluster import (
|
|
23
18
|
XPK_SA,
|
|
24
19
|
create_xpk_k8s_service_account,
|
|
@@ -43,6 +38,7 @@ from ..core.nap import (
|
|
|
43
38
|
get_autoprovisioning_node_selector_args,
|
|
44
39
|
is_autoprovisioning_enabled,
|
|
45
40
|
)
|
|
41
|
+
from ..core.network import get_cluster_subnetworks
|
|
46
42
|
from ..core.pathways import (
|
|
47
43
|
append_custom_colocated_python_sidecar,
|
|
48
44
|
append_custom_pathways_proxy_server,
|
|
@@ -93,6 +89,7 @@ from ..core.workload_decorators import (
|
|
|
93
89
|
)
|
|
94
90
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
95
91
|
from ..utils.file import write_tmp_file
|
|
92
|
+
from .common import is_TAS_possible
|
|
96
93
|
from . import cluster_gcluster
|
|
97
94
|
|
|
98
95
|
WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
|
|
@@ -217,7 +214,7 @@ spec:
|
|
|
217
214
|
labels:
|
|
218
215
|
xpk.google.com/workload: {args.workload}
|
|
219
216
|
annotations:
|
|
220
|
-
|
|
217
|
+
{kueue_TAS_annotation}
|
|
221
218
|
spec:
|
|
222
219
|
priorityClassName: {args.priority}
|
|
223
220
|
restartPolicy: Never
|
|
@@ -406,7 +403,7 @@ def workload_create(args) -> None:
|
|
|
406
403
|
f' {parallelstore_storages}'
|
|
407
404
|
)
|
|
408
405
|
else:
|
|
409
|
-
xpk_print('No gcp
|
|
406
|
+
xpk_print('No gcp parallelstore instances to add detected.')
|
|
410
407
|
|
|
411
408
|
if len(pd_storages) > 0:
|
|
412
409
|
service_account = XPK_SA
|
|
@@ -451,6 +448,13 @@ def workload_create(args) -> None:
|
|
|
451
448
|
if return_code != 0:
|
|
452
449
|
xpk_exit(return_code)
|
|
453
450
|
|
|
451
|
+
kueue_TAS_annotation = (
|
|
452
|
+
'kueue.x-k8s.io/podset-preferred-topology:'
|
|
453
|
+
' "cloud.google.com/gce-topology-host"'
|
|
454
|
+
)
|
|
455
|
+
if not is_TAS_possible(args):
|
|
456
|
+
kueue_TAS_annotation = ''
|
|
457
|
+
|
|
454
458
|
if system.device_type in cluster_gcluster.supported_device_types:
|
|
455
459
|
yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
|
|
456
460
|
args=args,
|
|
@@ -458,18 +462,16 @@ def workload_create(args) -> None:
|
|
|
458
462
|
service_account=XPK_SA,
|
|
459
463
|
failure_policy_rules=failure_policy_rules,
|
|
460
464
|
pod_failure_policy=pod_failure_policy,
|
|
465
|
+
kueue_TAS_annotation=kueue_TAS_annotation,
|
|
461
466
|
)
|
|
462
467
|
|
|
468
|
+
sub_networks = get_cluster_subnetworks(args)
|
|
463
469
|
if args.device_type == cluster_gcluster.a3mega_device_type:
|
|
464
|
-
sub_networks = get_subnetworks_for_a3mega(args.cluster)
|
|
465
470
|
yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
if args.device_type == cluster_gcluster.a4_device_type:
|
|
472
|
-
sub_networks = get_subnetworks_for_a4()
|
|
471
|
+
elif args.device_type in [
|
|
472
|
+
cluster_gcluster.a3ultra_device_type,
|
|
473
|
+
cluster_gcluster.a4_device_type,
|
|
474
|
+
]:
|
|
473
475
|
yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
|
|
474
476
|
|
|
475
477
|
if all_storages:
|