xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -13
- xpk/commands/cluster.py +240 -71
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/common.py +33 -1
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +30 -18
- xpk/commands/run.py +17 -12
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +75 -19
- xpk/commands/workload.py +161 -324
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +335 -45
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +193 -12
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +5 -1
- xpk/core/gcsfuse.py +27 -6
- xpk/core/kjob.py +66 -20
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/nap.py +4 -0
- xpk/core/network.py +34 -22
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/resources.py +21 -0
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +28 -83
- xpk/core/workload_decorators/rdma_decorator.py +11 -15
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
- xpk/parser/cluster.py +574 -381
- xpk/parser/storage.py +25 -5
- xpk/parser/workload.py +59 -31
- xpk/utils/kubectl.py +4 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/commands/batch.py
CHANGED
|
@@ -14,18 +14,25 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import re
|
|
17
18
|
from argparse import Namespace
|
|
18
19
|
|
|
19
|
-
from ..core.cluster import
|
|
20
|
+
from ..core.cluster import (
|
|
21
|
+
create_xpk_k8s_service_account,
|
|
22
|
+
get_cluster_credentials,
|
|
23
|
+
)
|
|
20
24
|
from ..core.commands import run_command_for_value
|
|
21
25
|
from ..core.gcloud_context import add_zone_and_project
|
|
26
|
+
from ..core.kjob import (
|
|
27
|
+
AppProfileDefaults,
|
|
28
|
+
JobTemplateDefaults,
|
|
29
|
+
get_storage_annotations,
|
|
30
|
+
prepare_kjob,
|
|
31
|
+
)
|
|
22
32
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
23
33
|
from ..utils.console import xpk_exit, xpk_print
|
|
24
|
-
from .common import set_cluster_command
|
|
25
|
-
from ..core.kjob import AppProfileDefaults, JobTemplateDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
|
|
26
|
-
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
27
34
|
from .kind import set_local_cluster_command
|
|
28
|
-
import
|
|
35
|
+
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
29
36
|
|
|
30
37
|
|
|
31
38
|
def batch(args: Namespace) -> None:
|
|
@@ -38,12 +45,11 @@ def batch(args: Namespace) -> None:
|
|
|
38
45
|
"""
|
|
39
46
|
if not args.kind_cluster:
|
|
40
47
|
add_zone_and_project(args)
|
|
41
|
-
|
|
48
|
+
get_cluster_credentials(args)
|
|
42
49
|
else:
|
|
43
50
|
set_cluster_command_code = set_local_cluster_command(args)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
xpk_exit(set_cluster_command_code)
|
|
51
|
+
if set_cluster_command_code != 0:
|
|
52
|
+
xpk_exit(set_cluster_command_code)
|
|
47
53
|
|
|
48
54
|
err_code = prepare_kjob(args)
|
|
49
55
|
if err_code > 0:
|
|
@@ -61,14 +67,14 @@ def submit_job(args: Namespace) -> None:
|
|
|
61
67
|
'kubectl kjob create slurm'
|
|
62
68
|
f' --profile {AppProfileDefaults.NAME.value}'
|
|
63
69
|
f' --localqueue {LOCAL_QUEUE_NAME}'
|
|
64
|
-
f' --pod-template-annotation {Kueue_TAS_annotation}'
|
|
65
70
|
f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
|
|
66
71
|
' --first-node-ip'
|
|
67
72
|
)
|
|
68
73
|
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
74
|
+
cmd = add_TAS_annotations_to_command(args, cmd)
|
|
75
|
+
|
|
76
|
+
for annotation in get_storage_annotations(args):
|
|
77
|
+
cmd += f' --pod-template-annotation {annotation}'
|
|
72
78
|
|
|
73
79
|
if args.ignore_unknown_flags:
|
|
74
80
|
cmd += ' --ignore-unknown-flags'
|
xpk/commands/cluster.py
CHANGED
|
@@ -16,14 +16,21 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from tabulate import tabulate
|
|
18
18
|
|
|
19
|
-
from ..core.capacity import H100_DEVICE_TYPE
|
|
19
|
+
from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
|
|
20
20
|
from ..core.cluster import (
|
|
21
21
|
get_all_clusters_programmatic,
|
|
22
22
|
get_cluster_credentials,
|
|
23
23
|
install_nccl_on_cluster,
|
|
24
|
+
install_nri_on_cluster,
|
|
24
25
|
set_jobset_on_cluster,
|
|
26
|
+
set_pathways_job_on_cluster,
|
|
25
27
|
setup_k8s_env,
|
|
28
|
+
disable_mglru_on_cluster,
|
|
29
|
+
count_nodes_on_cluster,
|
|
30
|
+
update_cluster_with_gcpfilestore_driver_if_necessary,
|
|
26
31
|
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
32
|
+
update_cluster_with_parallelstore_driver_if_necessary,
|
|
33
|
+
update_cluster_with_pd_driver_if_necessary,
|
|
27
34
|
update_cluster_with_workload_identity_if_necessary,
|
|
28
35
|
)
|
|
29
36
|
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
|
|
@@ -46,10 +53,14 @@ from ..core.nap import enable_autoprovisioning_on_cluster
|
|
|
46
53
|
from ..core.network import (
|
|
47
54
|
create_cluster_network_config,
|
|
48
55
|
delete_cluster_subnets,
|
|
49
|
-
|
|
56
|
+
set_up_cluster_network_for_a3,
|
|
57
|
+
)
|
|
58
|
+
from ..core.nodepool import (
|
|
59
|
+
get_gke_node_pool_version,
|
|
60
|
+
run_gke_node_pool_create_command,
|
|
50
61
|
)
|
|
51
|
-
from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
|
|
52
62
|
from ..core.ray import install_ray_cluster
|
|
63
|
+
from ..core.mtc import install_mtc_on_cluster
|
|
53
64
|
from ..core.resources import create_cluster_configmaps
|
|
54
65
|
from ..core.storage import install_storage_crd
|
|
55
66
|
from ..core.system_characteristics import (
|
|
@@ -64,7 +75,118 @@ from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
|
64
75
|
from ..utils.file import write_tmp_file
|
|
65
76
|
from . import cluster_gcluster
|
|
66
77
|
from .common import set_cluster_command
|
|
67
|
-
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def cluster_adapt(args) -> None:
|
|
81
|
+
"""Function that performs cluster adaptation.
|
|
82
|
+
|
|
83
|
+
Args:
|
|
84
|
+
args: user provided arguments for running the command.
|
|
85
|
+
"""
|
|
86
|
+
args.enable_pathways = False
|
|
87
|
+
|
|
88
|
+
system, return_code = get_system_characteristics(args)
|
|
89
|
+
|
|
90
|
+
if return_code > 0:
|
|
91
|
+
xpk_print('Fetching system characteristics failed!')
|
|
92
|
+
xpk_exit(return_code)
|
|
93
|
+
|
|
94
|
+
xpk_print(
|
|
95
|
+
f'Starting cluster adaptation for cluster {args.cluster}:', flush=True
|
|
96
|
+
)
|
|
97
|
+
add_zone_and_project(args)
|
|
98
|
+
|
|
99
|
+
if system.accelerator_type == AcceleratorType['GPU'] and not getattr(
|
|
100
|
+
args, 'num_nodes'
|
|
101
|
+
):
|
|
102
|
+
xpk_print(
|
|
103
|
+
'Argument --num-nodes was not provided, trying to determine number of'
|
|
104
|
+
' nodes based on the available nodes in the cluster...'
|
|
105
|
+
)
|
|
106
|
+
args.num_nodes = count_nodes_on_cluster(args, system)
|
|
107
|
+
if args.num_nodes == 0:
|
|
108
|
+
xpk_print(
|
|
109
|
+
'Found unexpected number of nodes. Is the --device-type correct?'
|
|
110
|
+
)
|
|
111
|
+
xpk_exit(1)
|
|
112
|
+
else:
|
|
113
|
+
xpk_print(f'Using {args.num_nodes} nodes.')
|
|
114
|
+
|
|
115
|
+
# ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
|
|
116
|
+
# Enable WorkloadIdentity if not enabled already.
|
|
117
|
+
if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
|
|
118
|
+
update_cluster_command_code = (
|
|
119
|
+
update_cluster_with_workload_identity_if_necessary(args)
|
|
120
|
+
)
|
|
121
|
+
if update_cluster_command_code != 0:
|
|
122
|
+
xpk_exit(update_cluster_command_code)
|
|
123
|
+
|
|
124
|
+
get_cluster_credentials(args)
|
|
125
|
+
|
|
126
|
+
k8s_client = setup_k8s_env(args)
|
|
127
|
+
|
|
128
|
+
install_storage_crd(k8s_client)
|
|
129
|
+
install_storage_csis(args)
|
|
130
|
+
|
|
131
|
+
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
132
|
+
tensorboard_config = {}
|
|
133
|
+
if VERTEX_TENSORBOARD_FEATURE_FLAG and args.create_vertex_tensorboard:
|
|
134
|
+
tensorboard_config = create_vertex_tensorboard(args)
|
|
135
|
+
# exit if failed to create Tensorboard in Vertex AI
|
|
136
|
+
if not tensorboard_config:
|
|
137
|
+
xpk_exit(1)
|
|
138
|
+
|
|
139
|
+
# Provision node pools dynamically based on incoming workloads:
|
|
140
|
+
# Currently autoprovisioning is not supported with Pathways.
|
|
141
|
+
autoprovisioning_config = None
|
|
142
|
+
if args.enable_autoprovisioning:
|
|
143
|
+
xpk_print('Enabling Autoprovisioning')
|
|
144
|
+
autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
|
|
145
|
+
args, system
|
|
146
|
+
)
|
|
147
|
+
if return_code != 0:
|
|
148
|
+
xpk_exit(return_code)
|
|
149
|
+
|
|
150
|
+
xpk_print('Creating ConfigMap for cluster')
|
|
151
|
+
create_cluster_configmaps_code = create_cluster_configmaps(
|
|
152
|
+
args, system, tensorboard_config, autoprovisioning_config
|
|
153
|
+
)
|
|
154
|
+
if create_cluster_configmaps_code != 0:
|
|
155
|
+
xpk_exit(create_cluster_configmaps_code)
|
|
156
|
+
|
|
157
|
+
xpk_print(
|
|
158
|
+
'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
|
|
159
|
+
' globally available'
|
|
160
|
+
)
|
|
161
|
+
set_jobset_on_cluster_code = set_jobset_on_cluster(args)
|
|
162
|
+
if set_jobset_on_cluster_code != 0:
|
|
163
|
+
xpk_exit(set_jobset_on_cluster_code)
|
|
164
|
+
|
|
165
|
+
# TODO: Uncomment when cluster_adapt will support TPU cluters
|
|
166
|
+
# set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
|
|
167
|
+
# if set_pathways_job_on_cluster_code != 0:
|
|
168
|
+
# xpk_exit(set_pathways_job_on_cluster_code)
|
|
169
|
+
|
|
170
|
+
install_kueue(args, system, autoprovisioning_config)
|
|
171
|
+
|
|
172
|
+
install_kjob(args)
|
|
173
|
+
|
|
174
|
+
if system.accelerator_type == AcceleratorType['GPU']:
|
|
175
|
+
prepare_gpus(args, system)
|
|
176
|
+
|
|
177
|
+
if args.enable_ray_cluster:
|
|
178
|
+
return_code = install_ray_cluster(args, system)
|
|
179
|
+
if return_code != 0:
|
|
180
|
+
xpk_print('Installation of RayCluster failed.')
|
|
181
|
+
xpk_exit(return_code)
|
|
182
|
+
|
|
183
|
+
xpk_print('GKE commands done! Resources are created.')
|
|
184
|
+
xpk_print(
|
|
185
|
+
'See your GKE Cluster here:'
|
|
186
|
+
# pylint: disable=line-too-long
|
|
187
|
+
f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
|
|
188
|
+
)
|
|
189
|
+
xpk_exit(0)
|
|
68
190
|
|
|
69
191
|
|
|
70
192
|
def cluster_create(args) -> None:
|
|
@@ -72,9 +194,6 @@ def cluster_create(args) -> None:
|
|
|
72
194
|
|
|
73
195
|
Args:
|
|
74
196
|
args: user provided arguments for running the command.
|
|
75
|
-
|
|
76
|
-
Returns:
|
|
77
|
-
0 if successful and 1 otherwise.
|
|
78
197
|
"""
|
|
79
198
|
system, return_code = get_system_characteristics(args)
|
|
80
199
|
|
|
@@ -117,35 +236,19 @@ def cluster_create(args) -> None:
|
|
|
117
236
|
|
|
118
237
|
# ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
|
|
119
238
|
# Enable WorkloadIdentity if not enabled already.
|
|
120
|
-
if
|
|
121
|
-
args.enable_workload_identity
|
|
122
|
-
or args.enable_gcsfuse_csi_driver
|
|
123
|
-
or args.enable_gcpfilestore_csi_driver
|
|
124
|
-
):
|
|
239
|
+
if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
|
|
125
240
|
update_cluster_command_code = (
|
|
126
241
|
update_cluster_with_workload_identity_if_necessary(args)
|
|
127
242
|
)
|
|
128
243
|
if update_cluster_command_code != 0:
|
|
129
244
|
xpk_exit(update_cluster_command_code)
|
|
130
245
|
|
|
131
|
-
|
|
132
|
-
if args.enable_gcsfuse_csi_driver:
|
|
133
|
-
update_cluster_command_code = (
|
|
134
|
-
update_cluster_with_gcsfuse_driver_if_necessary(args)
|
|
135
|
-
)
|
|
136
|
-
if update_cluster_command_code != 0:
|
|
137
|
-
xpk_exit(update_cluster_command_code)
|
|
138
|
-
|
|
139
|
-
if args.enable_gcpfilestore_csi_driver:
|
|
140
|
-
update_cluster_command_code = (
|
|
141
|
-
update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
142
|
-
)
|
|
143
|
-
if update_cluster_command_code != 0:
|
|
144
|
-
xpk_exit(update_cluster_command_code)
|
|
246
|
+
get_cluster_credentials(args)
|
|
145
247
|
|
|
146
|
-
|
|
248
|
+
k8s_client = setup_k8s_env(args)
|
|
147
249
|
|
|
148
|
-
|
|
250
|
+
install_storage_crd(k8s_client)
|
|
251
|
+
install_storage_csis(args)
|
|
149
252
|
|
|
150
253
|
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
151
254
|
tensorboard_config = {}
|
|
@@ -155,13 +258,12 @@ def cluster_create(args) -> None:
|
|
|
155
258
|
if not tensorboard_config:
|
|
156
259
|
xpk_exit(1)
|
|
157
260
|
|
|
158
|
-
if system.
|
|
261
|
+
if system.device_type == H100_DEVICE_TYPE:
|
|
159
262
|
xpk_print('Setting up Network for cluster')
|
|
160
|
-
set_up_cluster_network_code =
|
|
263
|
+
set_up_cluster_network_code = set_up_cluster_network_for_a3(args)
|
|
161
264
|
if set_up_cluster_network_code != 0:
|
|
162
265
|
xpk_exit(set_up_cluster_network_code)
|
|
163
266
|
|
|
164
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
165
267
|
xpk_print('Creating Network Config for cluster')
|
|
166
268
|
create_cluster_network_config_code = create_cluster_network_config(args)
|
|
167
269
|
if create_cluster_network_config_code != 0:
|
|
@@ -207,45 +309,16 @@ def cluster_create(args) -> None:
|
|
|
207
309
|
if set_jobset_on_cluster_code != 0:
|
|
208
310
|
xpk_exit(set_jobset_on_cluster_code)
|
|
209
311
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
xpk_exit(install_kueue_on_cluster_code)
|
|
312
|
+
set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
|
|
313
|
+
if set_pathways_job_on_cluster_code != 0:
|
|
314
|
+
xpk_exit(set_pathways_job_on_cluster_code)
|
|
214
315
|
|
|
215
|
-
|
|
216
|
-
err_code = verify_kjob_installed(args)
|
|
217
|
-
if err_code > 0:
|
|
218
|
-
xpk_exit(err_code)
|
|
316
|
+
install_kueue(args, system, autoprovisioning_config)
|
|
219
317
|
|
|
220
|
-
|
|
221
|
-
err_code = apply_kjob_crds(args)
|
|
222
|
-
if err_code > 0:
|
|
223
|
-
xpk_exit(err_code)
|
|
224
|
-
|
|
225
|
-
err_code = prepare_kjob(args)
|
|
226
|
-
if err_code > 0:
|
|
227
|
-
xpk_exit(err_code)
|
|
228
|
-
|
|
229
|
-
k8s_client = setup_k8s_env(args)
|
|
230
|
-
install_storage_crd(k8s_client)
|
|
231
|
-
|
|
232
|
-
xpk_print('Wait for Kueue to be fully available')
|
|
233
|
-
wait_for_kueue_available_code = wait_for_kueue_available(args)
|
|
234
|
-
if wait_for_kueue_available_code != 0:
|
|
235
|
-
xpk_exit(wait_for_kueue_available_code)
|
|
236
|
-
|
|
237
|
-
xpk_print('Install Kueue Custom Resources')
|
|
238
|
-
enable_kueue_credentials_code = install_kueue_crs(
|
|
239
|
-
args, system, autoprovisioning_config
|
|
240
|
-
)
|
|
241
|
-
if enable_kueue_credentials_code != 0:
|
|
242
|
-
xpk_exit(enable_kueue_credentials_code)
|
|
318
|
+
install_kjob(args)
|
|
243
319
|
|
|
244
320
|
if system.accelerator_type == AcceleratorType['GPU']:
|
|
245
|
-
|
|
246
|
-
install_nccl_code = install_nccl_on_cluster(args, system)
|
|
247
|
-
if install_nccl_code != 0:
|
|
248
|
-
xpk_exit(install_nccl_code)
|
|
321
|
+
prepare_gpus(args, system)
|
|
249
322
|
|
|
250
323
|
if args.enable_ray_cluster:
|
|
251
324
|
return_code = install_ray_cluster(args, system)
|
|
@@ -253,6 +326,12 @@ def cluster_create(args) -> None:
|
|
|
253
326
|
xpk_print('Installation of RayCluster failed.')
|
|
254
327
|
xpk_exit(return_code)
|
|
255
328
|
|
|
329
|
+
if hasattr(args, 'enable_mtc') and args.enable_mtc:
|
|
330
|
+
return_code = install_mtc_on_cluster(args, system)
|
|
331
|
+
if return_code != 0:
|
|
332
|
+
xpk_print('Installation of MTC failed.')
|
|
333
|
+
xpk_exit(return_code)
|
|
334
|
+
|
|
256
335
|
xpk_print('GKE commands done! Resources are created.')
|
|
257
336
|
xpk_print(
|
|
258
337
|
'See your GKE Cluster here:'
|
|
@@ -757,6 +836,7 @@ def run_gke_cluster_create_command(
|
|
|
757
836
|
f' --num-nodes {args.default_pool_cpu_num_nodes}'
|
|
758
837
|
f' {args.custom_cluster_arguments}'
|
|
759
838
|
f' {rapid_release_cmd}'
|
|
839
|
+
' --enable-dns-access'
|
|
760
840
|
)
|
|
761
841
|
|
|
762
842
|
enable_ip_alias = False
|
|
@@ -783,11 +863,7 @@ def run_gke_cluster_create_command(
|
|
|
783
863
|
if args.enable_ray_cluster:
|
|
784
864
|
command += ' --addons RayOperator'
|
|
785
865
|
|
|
786
|
-
if
|
|
787
|
-
args.enable_workload_identity
|
|
788
|
-
or args.enable_gcsfuse_csi_driver
|
|
789
|
-
or args.enable_gcpfilestore_csi_driver
|
|
790
|
-
):
|
|
866
|
+
if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
|
|
791
867
|
command += f' --workload-pool={args.project}.svc.id.goog'
|
|
792
868
|
|
|
793
869
|
addons = []
|
|
@@ -797,6 +873,15 @@ def run_gke_cluster_create_command(
|
|
|
797
873
|
if args.enable_gcpfilestore_csi_driver:
|
|
798
874
|
addons.append('GcpFilestoreCsiDriver')
|
|
799
875
|
|
|
876
|
+
if args.enable_parallelstore_csi_driver:
|
|
877
|
+
addons.append('ParallelstoreCsiDriver')
|
|
878
|
+
|
|
879
|
+
if args.enable_pd_csi_driver:
|
|
880
|
+
addons.append('GcePersistentDiskCsiDriver')
|
|
881
|
+
|
|
882
|
+
if hasattr(args, 'enable_mtc') and args.enable_mtc:
|
|
883
|
+
addons.append('HighScaleCheckpointing')
|
|
884
|
+
|
|
800
885
|
if len(addons) > 0:
|
|
801
886
|
addons_str = ','.join(addons)
|
|
802
887
|
command += f' --addons={addons_str}'
|
|
@@ -806,3 +891,87 @@ def run_gke_cluster_create_command(
|
|
|
806
891
|
xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
|
|
807
892
|
return 1
|
|
808
893
|
return 0
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
def install_storage_csis(args):
|
|
897
|
+
if args.enable_gcsfuse_csi_driver:
|
|
898
|
+
update_cluster_command_code = (
|
|
899
|
+
update_cluster_with_gcsfuse_driver_if_necessary(args)
|
|
900
|
+
)
|
|
901
|
+
if update_cluster_command_code != 0:
|
|
902
|
+
xpk_exit(update_cluster_command_code)
|
|
903
|
+
|
|
904
|
+
if args.enable_gcpfilestore_csi_driver:
|
|
905
|
+
update_cluster_command_code = (
|
|
906
|
+
update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
907
|
+
)
|
|
908
|
+
if update_cluster_command_code != 0:
|
|
909
|
+
xpk_exit(update_cluster_command_code)
|
|
910
|
+
|
|
911
|
+
if args.enable_parallelstore_csi_driver:
|
|
912
|
+
update_cluster_command_code = (
|
|
913
|
+
update_cluster_with_parallelstore_driver_if_necessary(args)
|
|
914
|
+
)
|
|
915
|
+
if update_cluster_command_code != 0:
|
|
916
|
+
xpk_exit(update_cluster_command_code)
|
|
917
|
+
|
|
918
|
+
if args.enable_pd_csi_driver:
|
|
919
|
+
update_cluster_command_code = update_cluster_with_pd_driver_if_necessary(
|
|
920
|
+
args
|
|
921
|
+
)
|
|
922
|
+
if update_cluster_command_code != 0:
|
|
923
|
+
xpk_exit(update_cluster_command_code)
|
|
924
|
+
|
|
925
|
+
|
|
926
|
+
def install_kjob(args):
|
|
927
|
+
xpk_print('Verifying kjob installation')
|
|
928
|
+
err_code = verify_kjob_installed(args)
|
|
929
|
+
if err_code > 0:
|
|
930
|
+
xpk_exit(err_code)
|
|
931
|
+
|
|
932
|
+
xpk_print('Applying kjob CDRs')
|
|
933
|
+
err_code = apply_kjob_crds(args)
|
|
934
|
+
if err_code > 0:
|
|
935
|
+
xpk_exit(err_code)
|
|
936
|
+
|
|
937
|
+
err_code = prepare_kjob(args)
|
|
938
|
+
if err_code > 0:
|
|
939
|
+
xpk_exit(err_code)
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
|
|
943
|
+
xpk_print('Enabling Kueue on the cluster')
|
|
944
|
+
install_kueue_on_cluster_code = install_kueue_on_cluster(args)
|
|
945
|
+
if install_kueue_on_cluster_code != 0:
|
|
946
|
+
xpk_exit(install_kueue_on_cluster_code)
|
|
947
|
+
|
|
948
|
+
xpk_print('Wait for Kueue to be fully available')
|
|
949
|
+
wait_for_kueue_available_code = wait_for_kueue_available(args)
|
|
950
|
+
if wait_for_kueue_available_code != 0:
|
|
951
|
+
xpk_exit(wait_for_kueue_available_code)
|
|
952
|
+
|
|
953
|
+
xpk_print('Install Kueue Custom Resources')
|
|
954
|
+
enable_kueue_credentials_code = install_kueue_crs(
|
|
955
|
+
args, system, autoprovisioning_config
|
|
956
|
+
)
|
|
957
|
+
if enable_kueue_credentials_code != 0:
|
|
958
|
+
xpk_exit(enable_kueue_credentials_code)
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
def prepare_gpus(args, system: SystemCharacteristics):
|
|
962
|
+
xpk_print('Installing NCCL Plugin for cluster')
|
|
963
|
+
install_nccl_code = install_nccl_on_cluster(args, system)
|
|
964
|
+
if install_nccl_code != 0:
|
|
965
|
+
xpk_exit(install_nccl_code)
|
|
966
|
+
|
|
967
|
+
if system.device_type == H100_DEVICE_TYPE:
|
|
968
|
+
xpk_print('Installing NRI device injector for cluster')
|
|
969
|
+
install_nri_code = install_nri_on_cluster(args)
|
|
970
|
+
if install_nri_code != 0:
|
|
971
|
+
xpk_exit(install_nri_code)
|
|
972
|
+
|
|
973
|
+
if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
|
|
974
|
+
xpk_print('Disabling MGLRU')
|
|
975
|
+
err_code = disable_mglru_on_cluster(args)
|
|
976
|
+
if err_code > 0:
|
|
977
|
+
xpk_exit(err_code)
|
xpk/commands/cluster_gcluster.py
CHANGED
|
@@ -16,26 +16,27 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import os
|
|
18
18
|
|
|
19
|
-
from ..core.remote_state.remote_state_client import RemoteStateClient
|
|
20
|
-
from ..core.remote_state.fuse_remote_state import FuseStateClient
|
|
21
19
|
from ..core.blueprint.blueprint_generator import (
|
|
22
20
|
BlueprintGenerator,
|
|
23
21
|
BlueprintGeneratorOutput,
|
|
24
22
|
a3mega_device_type,
|
|
25
23
|
a3ultra_device_type,
|
|
24
|
+
a4_device_type,
|
|
26
25
|
supported_device_types,
|
|
27
26
|
)
|
|
28
|
-
from ..core.commands import run_command_for_value
|
|
29
27
|
from ..core.capacity import get_capacity_type
|
|
28
|
+
from ..core.cluster import get_cluster_credentials
|
|
29
|
+
from ..core.commands import run_command_for_value
|
|
30
30
|
from ..core.docker_manager import DockerManager
|
|
31
31
|
from ..core.gcloud_context import zone_to_region
|
|
32
32
|
from ..core.gcluster_manager import GclusterManager
|
|
33
|
+
from ..core.kjob import apply_kjob_crds, prepare_kjob
|
|
34
|
+
from ..core.remote_state.fuse_remote_state import FuseStateClient
|
|
35
|
+
from ..core.remote_state.remote_state_client import RemoteStateClient
|
|
33
36
|
from ..utils.console import xpk_exit, xpk_print
|
|
34
37
|
from ..utils.file import ensure_directory_exists
|
|
35
38
|
from ..utils.network import all_IPs_cidr
|
|
36
39
|
from ..utils.objects import hash_string
|
|
37
|
-
from ..core.cluster import get_cluster_credentials
|
|
38
|
-
from ..core.kjob import apply_kjob_crds, prepare_kjob
|
|
39
40
|
|
|
40
41
|
blueprints_path = os.path.abspath('xpkclusters/blueprints')
|
|
41
42
|
gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
|
|
@@ -266,4 +267,20 @@ def generate_blueprint(
|
|
|
266
267
|
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
|
|
267
268
|
gcs_bucket=args.cluster_state_gcs_bucket,
|
|
268
269
|
)
|
|
270
|
+
if args.device_type == a4_device_type:
|
|
271
|
+
num_nodes = args.num_nodes if not args.num_nodes is None else 2
|
|
272
|
+
return bpg.generate_a4_blueprint(
|
|
273
|
+
blueprint_name=blueprint_name,
|
|
274
|
+
prefix=prefix,
|
|
275
|
+
cluster_name=args.cluster,
|
|
276
|
+
region=zone_to_region(args.zone),
|
|
277
|
+
project_id=args.project,
|
|
278
|
+
zone=args.zone,
|
|
279
|
+
auth_cidr=all_IPs_cidr,
|
|
280
|
+
num_nodes=num_nodes,
|
|
281
|
+
reservation=args.reservation if args.reservation else None,
|
|
282
|
+
capacity_type=capacity_type,
|
|
283
|
+
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
|
|
284
|
+
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
|
|
285
|
+
)
|
|
269
286
|
return None
|
xpk/commands/common.py
CHANGED
|
@@ -15,8 +15,10 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from ..core.commands import run_command_with_updates_retry
|
|
18
|
+
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
19
|
+
from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
|
|
18
20
|
from ..core.gcloud_context import zone_to_region
|
|
19
|
-
from ..utils.console import xpk_print
|
|
21
|
+
from ..utils.console import xpk_print, xpk_exit
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
def set_cluster_command(args) -> int:
|
|
@@ -31,6 +33,7 @@ def set_cluster_command(args) -> int:
|
|
|
31
33
|
command = (
|
|
32
34
|
'gcloud container clusters get-credentials'
|
|
33
35
|
f' {args.cluster} --region={zone_to_region(args.zone)}'
|
|
36
|
+
' --dns-endpoint'
|
|
34
37
|
f' --project={args.project} &&'
|
|
35
38
|
' kubectl config view && kubectl config set-context --current'
|
|
36
39
|
' --namespace=default'
|
|
@@ -42,3 +45,32 @@ def set_cluster_command(args) -> int:
|
|
|
42
45
|
if return_code != 0:
|
|
43
46
|
xpk_print(f'{task} returned ERROR {return_code}')
|
|
44
47
|
return return_code
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def is_TAS_possible(args) -> bool:
|
|
51
|
+
"""Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
args: user provided arguments for running the command.
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
True if possible and False otherwise.
|
|
58
|
+
"""
|
|
59
|
+
system_characteristics = get_cluster_system_characteristics(args)
|
|
60
|
+
capacity_type = get_cluster_capacity_type(args)
|
|
61
|
+
|
|
62
|
+
if system_characteristics is None:
|
|
63
|
+
xpk_print('system_characteristics data was not found in configmaps.')
|
|
64
|
+
xpk_exit(1)
|
|
65
|
+
|
|
66
|
+
if capacity_type is None:
|
|
67
|
+
xpk_print('capacity_type data was not found in configmaps.')
|
|
68
|
+
xpk_exit(1)
|
|
69
|
+
|
|
70
|
+
if (
|
|
71
|
+
system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
|
|
72
|
+
and capacity_type == CapacityType.SPOT
|
|
73
|
+
):
|
|
74
|
+
return False
|
|
75
|
+
|
|
76
|
+
return True
|
xpk/commands/info.py
CHANGED
|
@@ -20,10 +20,10 @@ from argparse import Namespace
|
|
|
20
20
|
from tabulate import tabulate
|
|
21
21
|
|
|
22
22
|
from ..core.commands import run_command_for_value
|
|
23
|
+
from ..core.cluster import get_cluster_credentials
|
|
23
24
|
from ..core.gcloud_context import add_zone_and_project
|
|
24
25
|
from ..core.kueue import verify_kueuectl
|
|
25
26
|
from ..utils.console import xpk_exit, xpk_print
|
|
26
|
-
from .common import set_cluster_command
|
|
27
27
|
|
|
28
28
|
table_fmt = 'plain'
|
|
29
29
|
|
|
@@ -37,9 +37,7 @@ def info(args: Namespace) -> None:
|
|
|
37
37
|
None
|
|
38
38
|
"""
|
|
39
39
|
add_zone_and_project(args)
|
|
40
|
-
|
|
41
|
-
if set_cluster_command_code != 0:
|
|
42
|
-
xpk_exit(set_cluster_command_code)
|
|
40
|
+
get_cluster_credentials(args)
|
|
43
41
|
|
|
44
42
|
verify_kueuectl(args)
|
|
45
43
|
lq, cq = bool(args.localqueue), bool(args.clusterqueue)
|
xpk/commands/job.py
CHANGED
|
@@ -20,10 +20,10 @@ import sys
|
|
|
20
20
|
from ruamel.yaml import YAML
|
|
21
21
|
|
|
22
22
|
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
23
|
+
from ..core.cluster import get_cluster_credentials
|
|
23
24
|
from ..core.gcloud_context import add_zone_and_project
|
|
24
25
|
from ..core.kjob import AppProfileDefaults
|
|
25
26
|
from ..utils.console import xpk_exit, xpk_print
|
|
26
|
-
from .common import set_cluster_command
|
|
27
27
|
from .kind import set_local_cluster_command
|
|
28
28
|
|
|
29
29
|
|
|
@@ -143,14 +143,14 @@ def job_list(args) -> None:
|
|
|
143
143
|
"""
|
|
144
144
|
if not args.kind_cluster:
|
|
145
145
|
add_zone_and_project(args)
|
|
146
|
-
|
|
146
|
+
get_cluster_credentials(args)
|
|
147
147
|
msg = f'Listing jobs for project {args.project} and zone {args.zone}:'
|
|
148
148
|
else:
|
|
149
149
|
set_cluster_command_code = set_local_cluster_command(args)
|
|
150
150
|
msg = 'Listing jobs:'
|
|
151
|
+
if set_cluster_command_code != 0:
|
|
152
|
+
xpk_exit(set_cluster_command_code)
|
|
151
153
|
|
|
152
|
-
if set_cluster_command_code != 0:
|
|
153
|
-
xpk_exit(set_cluster_command_code)
|
|
154
154
|
xpk_print(msg, flush=True)
|
|
155
155
|
|
|
156
156
|
return_code = run_slurm_job_list_command(args)
|
|
@@ -178,12 +178,11 @@ def job_cancel(args) -> None:
|
|
|
178
178
|
xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
|
|
179
179
|
if not args.kind_cluster:
|
|
180
180
|
add_zone_and_project(args)
|
|
181
|
-
|
|
181
|
+
get_cluster_credentials(args)
|
|
182
182
|
else:
|
|
183
183
|
set_cluster_command_code = set_local_cluster_command(args)
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
xpk_exit(set_cluster_command_code)
|
|
184
|
+
if set_cluster_command_code != 0:
|
|
185
|
+
xpk_exit(set_cluster_command_code)
|
|
187
186
|
|
|
188
187
|
return_code = run_slurm_job_delete_command(args)
|
|
189
188
|
xpk_exit(return_code)
|