xpk 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +8 -8
- xpk/commands/cluster.py +19 -19
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +7 -3
- xpk/commands/info.py +12 -12
- xpk/commands/inspector.py +1 -1
- xpk/commands/job.py +42 -12
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +6 -3
- xpk/commands/workload.py +28 -15
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +3 -1
- xpk/core/cluster.py +14 -8
- xpk/core/cluster_private.py +8 -2
- xpk/core/commands.py +13 -10
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_image.py +14 -5
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +10 -5
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +7 -3
- xpk/core/kueue.py +28 -8
- xpk/core/nap.py +5 -5
- xpk/core/network.py +1 -1
- xpk/core/nodepool.py +8 -3
- xpk/core/nodepool_test.py +82 -0
- xpk/core/pathways.py +6 -2
- xpk/core/ray.py +1 -1
- xpk/core/resources.py +18 -14
- xpk/core/scheduling.py +4 -0
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +12 -10
- xpk/parser/cluster.py +110 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- xpk/utils/execution_context.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/network.py +4 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/METADATA +4 -1
- xpk-0.13.0.dist-info/RECORD +101 -0
- xpk-0.11.0.dist-info/RECORD +0 -95
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
xpk/commands/batch.py
CHANGED
|
@@ -31,6 +31,7 @@ from ..core.kjob import (
|
|
|
31
31
|
)
|
|
32
32
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
33
33
|
from ..utils.console import xpk_exit, xpk_print
|
|
34
|
+
from ..utils.execution_context import is_dry_run
|
|
34
35
|
from .kind import set_local_cluster_command
|
|
35
36
|
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
36
37
|
|
|
@@ -51,18 +52,16 @@ def batch(args: Namespace) -> None:
|
|
|
51
52
|
if set_cluster_command_code != 0:
|
|
52
53
|
xpk_exit(set_cluster_command_code)
|
|
53
54
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
55
|
+
if not is_dry_run():
|
|
56
|
+
err_code = prepare_kjob(args)
|
|
57
|
+
if err_code > 0:
|
|
58
|
+
xpk_exit(err_code)
|
|
59
|
+
setup_k8s_service_accounts()
|
|
58
60
|
|
|
59
61
|
submit_job(args)
|
|
60
62
|
|
|
61
63
|
|
|
62
64
|
def submit_job(args: Namespace) -> None:
|
|
63
|
-
|
|
64
|
-
setup_k8s_service_accounts()
|
|
65
|
-
|
|
66
65
|
cmd = (
|
|
67
66
|
'kubectl kjob create slurm'
|
|
68
67
|
f' --profile {AppProfileDefaults.NAME.value}'
|
|
@@ -73,7 +72,8 @@ def submit_job(args: Namespace) -> None:
|
|
|
73
72
|
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
74
73
|
cmd = add_TAS_annotations_to_command(args, cmd)
|
|
75
74
|
|
|
76
|
-
|
|
75
|
+
annotations = [] if is_dry_run() else get_storage_annotations(args)
|
|
76
|
+
for annotation in annotations:
|
|
77
77
|
cmd += f' --pod-template-annotation {annotation}'
|
|
78
78
|
|
|
79
79
|
if args.ignore_unknown_flags:
|
xpk/commands/cluster.py
CHANGED
|
@@ -76,6 +76,7 @@ from ..core.vertex import create_vertex_tensorboard
|
|
|
76
76
|
from ..core.workload import get_workload_list
|
|
77
77
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
78
78
|
from ..utils.file import write_tmp_file
|
|
79
|
+
from ..utils.execution_context import is_dry_run
|
|
79
80
|
from . import cluster_gcluster
|
|
80
81
|
from .common import set_cluster_command
|
|
81
82
|
import shutil
|
|
@@ -92,7 +93,7 @@ def cluster_adapt(args) -> None:
|
|
|
92
93
|
|
|
93
94
|
system, return_code = get_system_characteristics(args)
|
|
94
95
|
|
|
95
|
-
if return_code > 0:
|
|
96
|
+
if return_code > 0 or system is None:
|
|
96
97
|
xpk_print('Fetching system characteristics failed!')
|
|
97
98
|
xpk_exit(return_code)
|
|
98
99
|
|
|
@@ -128,9 +129,10 @@ def cluster_adapt(args) -> None:
|
|
|
128
129
|
|
|
129
130
|
get_cluster_credentials(args)
|
|
130
131
|
|
|
131
|
-
|
|
132
|
+
if not is_dry_run():
|
|
133
|
+
k8s_client = setup_k8s_env(args)
|
|
134
|
+
install_storage_crd(k8s_client)
|
|
132
135
|
|
|
133
|
-
install_storage_crd(k8s_client)
|
|
134
136
|
install_storage_csis(args)
|
|
135
137
|
|
|
136
138
|
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
@@ -141,8 +143,6 @@ def cluster_adapt(args) -> None:
|
|
|
141
143
|
if not tensorboard_config:
|
|
142
144
|
xpk_exit(1)
|
|
143
145
|
|
|
144
|
-
# Provision node pools dynamically based on incoming workloads:
|
|
145
|
-
# Currently autoprovisioning is not supported with Pathways.
|
|
146
146
|
autoprovisioning_config = None
|
|
147
147
|
if args.enable_autoprovisioning:
|
|
148
148
|
xpk_print('Enabling Autoprovisioning')
|
|
@@ -201,7 +201,7 @@ def cluster_create(args) -> None:
|
|
|
201
201
|
"""
|
|
202
202
|
system, return_code = get_system_characteristics(args)
|
|
203
203
|
|
|
204
|
-
if return_code > 0:
|
|
204
|
+
if return_code > 0 or system is None:
|
|
205
205
|
xpk_print('Fetching system characteristics failed!')
|
|
206
206
|
xpk_exit(return_code)
|
|
207
207
|
|
|
@@ -217,13 +217,13 @@ def cluster_create(args) -> None:
|
|
|
217
217
|
xpk_exit(0)
|
|
218
218
|
|
|
219
219
|
return_code, gke_server_config = get_gke_server_config(args)
|
|
220
|
-
if return_code != 0:
|
|
220
|
+
if return_code != 0 or gke_server_config is None:
|
|
221
221
|
xpk_exit(return_code)
|
|
222
222
|
|
|
223
223
|
return_code, gke_control_plane_version = get_gke_control_plane_version(
|
|
224
224
|
args, gke_server_config
|
|
225
225
|
)
|
|
226
|
-
if return_code != 0:
|
|
226
|
+
if return_code != 0 or gke_control_plane_version is None:
|
|
227
227
|
xpk_exit(return_code)
|
|
228
228
|
|
|
229
229
|
create_cluster_command_code = create_cluster_if_necessary(
|
|
@@ -253,9 +253,10 @@ def cluster_create(args) -> None:
|
|
|
253
253
|
if update_coredns_command_code != 0:
|
|
254
254
|
xpk_exit(update_cluster_command_code)
|
|
255
255
|
|
|
256
|
-
|
|
256
|
+
if not is_dry_run():
|
|
257
|
+
k8s_client = setup_k8s_env(args)
|
|
258
|
+
install_storage_crd(k8s_client)
|
|
257
259
|
|
|
258
|
-
install_storage_crd(k8s_client)
|
|
259
260
|
install_storage_csis(args)
|
|
260
261
|
|
|
261
262
|
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
@@ -294,7 +295,7 @@ def cluster_create(args) -> None:
|
|
|
294
295
|
# Provision node pools dynamically based on incoming workloads:
|
|
295
296
|
# Currently autoprovisioning is not supported with Pathways.
|
|
296
297
|
autoprovisioning_config = None
|
|
297
|
-
if
|
|
298
|
+
if args.enable_autoprovisioning:
|
|
298
299
|
xpk_print('Enabling Autoprovisioning')
|
|
299
300
|
autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
|
|
300
301
|
args, system
|
|
@@ -398,7 +399,7 @@ def cluster_cacheimage(args) -> None:
|
|
|
398
399
|
get_cluster_credentials(args)
|
|
399
400
|
system, return_code = get_system_characteristics(args)
|
|
400
401
|
|
|
401
|
-
if return_code > 0:
|
|
402
|
+
if return_code > 0 or system is None:
|
|
402
403
|
xpk_print('Fetching system characteristics failed!')
|
|
403
404
|
xpk_exit(return_code)
|
|
404
405
|
|
|
@@ -411,10 +412,8 @@ def cluster_cacheimage(args) -> None:
|
|
|
411
412
|
nodeSelectorKey=node_selector_key,
|
|
412
413
|
)
|
|
413
414
|
tmp = write_tmp_file(yml_string)
|
|
414
|
-
command_apply = f'kubectl apply -f {str(tmp
|
|
415
|
-
command_delete = (
|
|
416
|
-
f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true'
|
|
417
|
-
)
|
|
415
|
+
command_apply = f'kubectl apply -f {str(tmp)}'
|
|
416
|
+
command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'
|
|
418
417
|
|
|
419
418
|
return_code = run_command_with_updates(
|
|
420
419
|
command_delete, 'Deleting Cached Image', args
|
|
@@ -808,6 +807,7 @@ def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
|
|
|
808
807
|
|
|
809
808
|
def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
|
|
810
809
|
"""Check for the existence of a specific Deployment in a given namespace."""
|
|
810
|
+
# TODO: rewrite this to be more obvious, check if it is correct
|
|
811
811
|
command = (
|
|
812
812
|
f'kubectl get deployment {deployment_name} -n'
|
|
813
813
|
f' {namespace} --ignore-not-found'
|
|
@@ -815,11 +815,11 @@ def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
|
|
|
815
815
|
result = run_command_with_updates(
|
|
816
816
|
command, 'Waiting for kubeDNS to be checked.', args
|
|
817
817
|
)
|
|
818
|
-
return result
|
|
818
|
+
return result != 0
|
|
819
819
|
|
|
820
820
|
|
|
821
821
|
def verify_coredns_readiness(
|
|
822
|
-
args, timeout: int =
|
|
822
|
+
args, timeout: int = 240, namespace: str = 'kube-system'
|
|
823
823
|
):
|
|
824
824
|
"""Verifies CoreDNS readiness using kubectl wait commands."""
|
|
825
825
|
xpk_print('Now verifying CoreDNS readiness...')
|
|
@@ -874,7 +874,7 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
|
|
|
874
874
|
xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
|
|
875
875
|
|
|
876
876
|
|
|
877
|
-
def update_coredns(args):
|
|
877
|
+
def update_coredns(args) -> int:
|
|
878
878
|
"""Updates and deploys CoreDNS within a cluster.
|
|
879
879
|
|
|
880
880
|
Args:
|
xpk/commands/cluster_gcluster.py
CHANGED
xpk/commands/common.py
CHANGED
|
@@ -18,6 +18,7 @@ from ..core.commands import run_command_with_updates_retry
|
|
|
18
18
|
from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
|
|
19
19
|
from ..core.gcloud_context import zone_to_region
|
|
20
20
|
from ..utils.console import xpk_print, xpk_exit
|
|
21
|
+
from ..utils.execution_context import is_dry_run
|
|
21
22
|
from ..core.system_characteristics import (
|
|
22
23
|
SystemCharacteristics,
|
|
23
24
|
)
|
|
@@ -50,8 +51,8 @@ def set_cluster_command(args) -> int:
|
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
def is_TAS_possible(
|
|
53
|
-
system_characteristics: SystemCharacteristics,
|
|
54
|
-
capacity_type: CapacityType,
|
|
54
|
+
system_characteristics: SystemCharacteristics | None,
|
|
55
|
+
capacity_type: CapacityType | None,
|
|
55
56
|
flex: bool,
|
|
56
57
|
) -> bool:
|
|
57
58
|
"""Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
|
|
@@ -63,6 +64,9 @@ def is_TAS_possible(
|
|
|
63
64
|
True if possible and False otherwise.
|
|
64
65
|
"""
|
|
65
66
|
|
|
67
|
+
if is_dry_run():
|
|
68
|
+
return True
|
|
69
|
+
|
|
66
70
|
if system_characteristics is None:
|
|
67
71
|
xpk_print('system_characteristics data was not found in configmaps.')
|
|
68
72
|
xpk_exit(1)
|
|
@@ -71,7 +75,7 @@ def is_TAS_possible(
|
|
|
71
75
|
xpk_print('capacity_type data was not found in configmaps.')
|
|
72
76
|
xpk_exit(1)
|
|
73
77
|
|
|
74
|
-
if flex:
|
|
78
|
+
if not flex:
|
|
75
79
|
return False
|
|
76
80
|
|
|
77
81
|
if (
|
xpk/commands/info.py
CHANGED
|
@@ -51,19 +51,19 @@ def info(args: Namespace) -> None:
|
|
|
51
51
|
cqs = run_kueuectl_list_clusterqueue(args)
|
|
52
52
|
quotas = get_nominal_quotas(cqs)
|
|
53
53
|
|
|
54
|
-
if lq:
|
|
54
|
+
if lq and lqs is not None:
|
|
55
55
|
print_formatted_lqs(lqs, quotas)
|
|
56
56
|
|
|
57
57
|
if cq:
|
|
58
58
|
print_formatted_cqs(cqs, quotas)
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def get_nominal_quotas(cqs:
|
|
61
|
+
def get_nominal_quotas(cqs: str) -> dict[str, dict[str, str]]:
|
|
62
62
|
"""Get quotas from clusterqueues.
|
|
63
63
|
This function retrieves how much of resource in each flavor is assigned to cluster queue.
|
|
64
64
|
It parses flavors of passed cluster queues.
|
|
65
65
|
Args:
|
|
66
|
-
- cqs - list of cluster queues.
|
|
66
|
+
- cqs - string containing a list of cluster queues in JSON format.
|
|
67
67
|
Returns:
|
|
68
68
|
- dictionary of cluster queues resources quotas in format:
|
|
69
69
|
{cq_name:{"flavorName:resourceName":quota}}
|
|
@@ -75,7 +75,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
|
|
|
75
75
|
xpk_print(cqs)
|
|
76
76
|
xpk_exit(1)
|
|
77
77
|
|
|
78
|
-
quotas = {}
|
|
78
|
+
quotas: dict[str, dict] = {}
|
|
79
79
|
for cq in cq_list:
|
|
80
80
|
spec = cq['spec']
|
|
81
81
|
cq_name = cq['metadata']['name']
|
|
@@ -89,7 +89,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
|
|
|
89
89
|
return quotas
|
|
90
90
|
|
|
91
91
|
|
|
92
|
-
def print_formatted_cqs(cqs:
|
|
92
|
+
def print_formatted_cqs(cqs: str, nominalQuotas) -> None:
|
|
93
93
|
try:
|
|
94
94
|
cq_list = json.loads(cqs)['items']
|
|
95
95
|
except ValueError:
|
|
@@ -105,7 +105,7 @@ def print_formatted_cqs(cqs: list[dict], nominalQuotas) -> None:
|
|
|
105
105
|
)
|
|
106
106
|
|
|
107
107
|
|
|
108
|
-
def print_formatted_lqs(lqs:
|
|
108
|
+
def print_formatted_lqs(lqs: str, nominalQuotas) -> None:
|
|
109
109
|
try:
|
|
110
110
|
lq_list = json.loads(lqs)['items']
|
|
111
111
|
except ValueError:
|
|
@@ -143,18 +143,18 @@ def parse_queue_lists(
|
|
|
143
143
|
|
|
144
144
|
|
|
145
145
|
def get_flavors_resources_reservations(
|
|
146
|
-
cq_name: str, flavors_res:
|
|
146
|
+
cq_name: str, flavors_res: dict
|
|
147
147
|
) -> dict[str, dict[str, str]]:
|
|
148
148
|
"""Get usage of flavors resources.
|
|
149
149
|
This function parser flavorsReservation section of clusterQueue of LocalQueue.
|
|
150
150
|
Args:
|
|
151
151
|
- cq_name - name of ClusterQueue to which flavors belong.
|
|
152
|
-
- flavors_res -
|
|
152
|
+
- flavors_res - dict of reservations made by flavors
|
|
153
153
|
Returns:
|
|
154
154
|
Dict containing usage of each resource in flavor for each flavor in cluster or local queue.
|
|
155
155
|
Dict format: {cq_name: {{flavor:resource}:reservation}}
|
|
156
156
|
"""
|
|
157
|
-
reservations = {}
|
|
157
|
+
reservations: dict[str, dict] = {}
|
|
158
158
|
reservations[cq_name] = {}
|
|
159
159
|
for flavor_name, flavor_resources_reservation_list in flavors_res.items():
|
|
160
160
|
for resource in flavor_resources_reservation_list:
|
|
@@ -167,15 +167,15 @@ def get_flavors_resources_reservations(
|
|
|
167
167
|
|
|
168
168
|
def get_flavors_usage(
|
|
169
169
|
q_entry: dict, res_field: str, flavor_resource_quotas: dict
|
|
170
|
-
) ->
|
|
170
|
+
) -> dict[str, str]:
|
|
171
171
|
"""Parse q_entry to retrieve list of each resource usage in flavour.
|
|
172
172
|
Args:
|
|
173
173
|
q_entry - single entry into either LocalQueue or ClusterQueue structured as json
|
|
174
174
|
flavor_resource_quotas - nominalQuota of flavors resource usage for each clusterqueue
|
|
175
175
|
Returns:
|
|
176
|
-
|
|
176
|
+
Dict where for each (key, value):
|
|
177
177
|
- key is flavorName:resourceName
|
|
178
|
-
-
|
|
178
|
+
- value is string formatted as 'flavorResourceReservation/flavorResourceQuota'
|
|
179
179
|
"""
|
|
180
180
|
status = q_entry['status']
|
|
181
181
|
flavors_res = status[res_field]
|
xpk/commands/inspector.py
CHANGED
|
@@ -346,7 +346,7 @@ def inspector(args) -> None:
|
|
|
346
346
|
)
|
|
347
347
|
|
|
348
348
|
# Summarize inspector:
|
|
349
|
-
xpk_print(f'Find xpk inspector output file: {inspector_file
|
|
349
|
+
xpk_print(f'Find xpk inspector output file: {inspector_file}')
|
|
350
350
|
|
|
351
351
|
if final_return_code != 0:
|
|
352
352
|
xpk_print(
|
xpk/commands/job.py
CHANGED
|
@@ -18,6 +18,7 @@ import re
|
|
|
18
18
|
import sys
|
|
19
19
|
|
|
20
20
|
from ruamel.yaml import YAML
|
|
21
|
+
from typing import cast
|
|
21
22
|
|
|
22
23
|
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
23
24
|
from ..core.cluster import get_cluster_credentials
|
|
@@ -27,6 +28,28 @@ from ..utils.console import xpk_exit, xpk_print
|
|
|
27
28
|
from .kind import set_local_cluster_command
|
|
28
29
|
|
|
29
30
|
|
|
31
|
+
JOBS_DRY_RUN_YAML = """
|
|
32
|
+
items:
|
|
33
|
+
- apiVersion: slurm.k8s.io/v1alpha1
|
|
34
|
+
kind: SlurmJob
|
|
35
|
+
metadata:
|
|
36
|
+
annotations:
|
|
37
|
+
kjobctl.x-k8s.io/script: echo hello
|
|
38
|
+
creationTimestamp: '2024-04-29T12:00:00Z'
|
|
39
|
+
labels:
|
|
40
|
+
kjobctl.x-k8s.io/app-profile: default
|
|
41
|
+
name: golden-job
|
|
42
|
+
namespace: default
|
|
43
|
+
spec:
|
|
44
|
+
script: echo hello
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
PODS_DRY_RUN_RESULT = """
|
|
48
|
+
foo-pod 2/2 Running 0 2d
|
|
49
|
+
bar-pod 1/1 Evicted 0 1d
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
|
|
30
53
|
def job_info(args):
|
|
31
54
|
"""Run commands obtaining information about a job given by name.
|
|
32
55
|
|
|
@@ -51,7 +74,10 @@ def job_info(args):
|
|
|
51
74
|
f' metadata.name=={job_name}'
|
|
52
75
|
)
|
|
53
76
|
job_code, job_text = run_command_for_value(
|
|
54
|
-
job_command,
|
|
77
|
+
job_command,
|
|
78
|
+
'Getting job info',
|
|
79
|
+
args,
|
|
80
|
+
dry_run_return_val=JOBS_DRY_RUN_YAML,
|
|
55
81
|
)
|
|
56
82
|
if job_code != 0:
|
|
57
83
|
xpk_print(f'Job info request returned ERROR {job_code}')
|
|
@@ -59,7 +85,10 @@ def job_info(args):
|
|
|
59
85
|
|
|
60
86
|
pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
|
|
61
87
|
pods_code, pods_text = run_command_for_value(
|
|
62
|
-
pods_command,
|
|
88
|
+
pods_command,
|
|
89
|
+
'Getting pods list',
|
|
90
|
+
args,
|
|
91
|
+
dry_run_return_val=PODS_DRY_RUN_RESULT,
|
|
63
92
|
)
|
|
64
93
|
if pods_code != 0:
|
|
65
94
|
xpk_print(f'Pods list request returned ERROR {pods_code}')
|
|
@@ -84,7 +113,7 @@ def job_info(args):
|
|
|
84
113
|
|
|
85
114
|
|
|
86
115
|
def get_profile(job_yaml: dict) -> str:
|
|
87
|
-
containers = (
|
|
116
|
+
containers: list[dict] = (
|
|
88
117
|
job_yaml.get('spec', {})
|
|
89
118
|
.get('template', {})
|
|
90
119
|
.get('spec', {})
|
|
@@ -96,13 +125,13 @@ def get_profile(job_yaml: dict) -> str:
|
|
|
96
125
|
|
|
97
126
|
|
|
98
127
|
def get_mounts(job_yaml: dict) -> list[dict]:
|
|
99
|
-
containers = (
|
|
128
|
+
containers: list[dict] = (
|
|
100
129
|
job_yaml.get('spec', {})
|
|
101
130
|
.get('template', {})
|
|
102
131
|
.get('spec', {})
|
|
103
132
|
.get('containers', [])
|
|
104
133
|
)
|
|
105
|
-
mounts = next(iter(containers), {}).get('volumeMounts', [])
|
|
134
|
+
mounts: list[dict] = next(iter(containers), {}).get('volumeMounts', [])
|
|
106
135
|
return mounts
|
|
107
136
|
|
|
108
137
|
|
|
@@ -112,23 +141,24 @@ def get_kjob_env_vars(job_desc_text: str) -> list[tuple[str, str]]:
|
|
|
112
141
|
return search_res
|
|
113
142
|
|
|
114
143
|
|
|
115
|
-
def get_pods(pods_text: str) -> list[str]:
|
|
144
|
+
def get_pods(pods_text: str) -> list[dict[str, str]]:
|
|
116
145
|
pods_lines = pods_text.strip().split('\n')
|
|
117
|
-
|
|
146
|
+
pods_lines_tokenized = [line.split() for line in pods_lines]
|
|
118
147
|
return [
|
|
119
148
|
{
|
|
120
|
-
'Name':
|
|
121
|
-
'Status':
|
|
149
|
+
'Name': tokens[0],
|
|
150
|
+
'Status': tokens[2],
|
|
122
151
|
}
|
|
123
|
-
for
|
|
152
|
+
for tokens in pods_lines_tokenized
|
|
124
153
|
]
|
|
125
154
|
|
|
126
155
|
|
|
127
156
|
def get_script_name(job_yaml: dict) -> str | None:
|
|
128
|
-
return (
|
|
157
|
+
return cast(
|
|
158
|
+
str | None,
|
|
129
159
|
job_yaml.get('metadata', {})
|
|
130
160
|
.get('annotations', {})
|
|
131
|
-
.get('kjobctl.x-k8s.io/script', '')
|
|
161
|
+
.get('kjobctl.x-k8s.io/script', ''),
|
|
132
162
|
)
|
|
133
163
|
|
|
134
164
|
|
xpk/commands/kjob_common.py
CHANGED
|
@@ -33,6 +33,7 @@ from ..core.resources import get_cluster_capacity_type, get_cluster_system_chara
|
|
|
33
33
|
def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
34
34
|
gpu_type = get_gpu_type_from_cluster(args)
|
|
35
35
|
|
|
36
|
+
annotations: tuple
|
|
36
37
|
if gpu_type == H100_MEGA_DEVICE_TYPE:
|
|
37
38
|
annotations = get_a3mega_pod_template_annotations(args)
|
|
38
39
|
elif gpu_type == H200_DEVICE_TYPE:
|
|
@@ -40,7 +41,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
|
40
41
|
elif gpu_type == B200_DEVICE_TYPE:
|
|
41
42
|
annotations = get_a4_pod_template_annotations(args)
|
|
42
43
|
else:
|
|
43
|
-
annotations =
|
|
44
|
+
annotations = tuple()
|
|
44
45
|
|
|
45
46
|
flags = [
|
|
46
47
|
f" --pod-template-annotation {annotation} " for annotation in annotations
|
xpk/commands/storage.py
CHANGED
|
@@ -58,6 +58,7 @@ from ..core.storage import (
|
|
|
58
58
|
)
|
|
59
59
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
60
60
|
from ..utils.kubectl import apply_kubectl_manifest
|
|
61
|
+
from ..utils.execution_context import is_dry_run
|
|
61
62
|
|
|
62
63
|
|
|
63
64
|
def storage_create(args: Namespace) -> None:
|
|
@@ -141,7 +142,7 @@ def storage_delete(args: Namespace) -> None:
|
|
|
141
142
|
|
|
142
143
|
def storage_attach(args: Namespace) -> None:
|
|
143
144
|
add_zone_and_project(args)
|
|
144
|
-
manifest = [{}]
|
|
145
|
+
manifest: list[dict] = [{}]
|
|
145
146
|
if args.type == GCP_FILESTORE_TYPE:
|
|
146
147
|
if args.instance is None:
|
|
147
148
|
args.instance = args.name
|
|
@@ -243,8 +244,10 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
|
|
|
243
244
|
|
|
244
245
|
|
|
245
246
|
def storage_list(args: Namespace) -> None:
|
|
246
|
-
|
|
247
|
-
|
|
247
|
+
storages = []
|
|
248
|
+
if not is_dry_run():
|
|
249
|
+
k8s_api_client = setup_k8s_env(args)
|
|
250
|
+
storages = list_storages(k8s_api_client)
|
|
248
251
|
print_storages_for_cluster(storages)
|
|
249
252
|
|
|
250
253
|
|
xpk/commands/workload.py
CHANGED
|
@@ -84,6 +84,7 @@ from ..core.system_characteristics import (
|
|
|
84
84
|
from ..core.vertex import create_vertex_experiment
|
|
85
85
|
from ..core.workload import (
|
|
86
86
|
check_if_workload_exists,
|
|
87
|
+
get_jobsets_list_gcp_link,
|
|
87
88
|
get_workload_list,
|
|
88
89
|
wait_for_job_completion,
|
|
89
90
|
zone_to_region,
|
|
@@ -96,6 +97,7 @@ from ..core.workload_decorators import (
|
|
|
96
97
|
)
|
|
97
98
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
98
99
|
from ..utils.file import write_tmp_file
|
|
100
|
+
from ..utils.execution_context import is_dry_run
|
|
99
101
|
from . import cluster_gcluster
|
|
100
102
|
from .common import is_TAS_possible
|
|
101
103
|
|
|
@@ -226,7 +228,8 @@ spec:
|
|
|
226
228
|
metadata:
|
|
227
229
|
labels:
|
|
228
230
|
xpk.google.com/workload: {args.workload}
|
|
229
|
-
annotations:
|
|
231
|
+
annotations:
|
|
232
|
+
{annotations}
|
|
230
233
|
spec:
|
|
231
234
|
priorityClassName: {args.priority}
|
|
232
235
|
restartPolicy: Never
|
|
@@ -304,8 +307,10 @@ def workload_create(args) -> None:
|
|
|
304
307
|
Returns:
|
|
305
308
|
0 if successful and 1 otherwise.
|
|
306
309
|
"""
|
|
307
|
-
k8s_api_client =
|
|
308
|
-
|
|
310
|
+
k8s_api_client = None
|
|
311
|
+
if not is_dry_run():
|
|
312
|
+
k8s_api_client = setup_k8s_env(args)
|
|
313
|
+
setup_k8s_service_accounts()
|
|
309
314
|
|
|
310
315
|
workload_exists = check_if_workload_exists(args)
|
|
311
316
|
|
|
@@ -319,7 +324,7 @@ def workload_create(args) -> None:
|
|
|
319
324
|
xpk_print('Starting workload create', flush=True)
|
|
320
325
|
system, return_code = get_system_characteristics(args)
|
|
321
326
|
|
|
322
|
-
if return_code > 0:
|
|
327
|
+
if return_code > 0 or system is None:
|
|
323
328
|
xpk_print('Fetching system characteristics failed!')
|
|
324
329
|
xpk_exit(return_code)
|
|
325
330
|
|
|
@@ -345,7 +350,7 @@ def workload_create(args) -> None:
|
|
|
345
350
|
):
|
|
346
351
|
xpk_print(
|
|
347
352
|
'Warning: Cluster has been created using XPK version:'
|
|
348
|
-
f' {
|
|
353
|
+
f' {cluster_xpk_version} but the XPK version you are'
|
|
349
354
|
f' using to schedule workload is: {XPK_CURRENT_VERSION}. Some features'
|
|
350
355
|
' might not be available for this cluster. We recommend to'
|
|
351
356
|
' upgrade/downgrade your XPK version or cluster by running `xpk'
|
|
@@ -354,7 +359,7 @@ def workload_create(args) -> None:
|
|
|
354
359
|
|
|
355
360
|
debugging_dashboard_id = None
|
|
356
361
|
|
|
357
|
-
tensorboard_config = {}
|
|
362
|
+
tensorboard_config: dict | None = {}
|
|
358
363
|
if VERTEX_TENSORBOARD_FEATURE_FLAG and args.use_vertex_tensorboard:
|
|
359
364
|
tensorboard_config = create_vertex_experiment(args)
|
|
360
365
|
# exit if failed to create Experiment in Vertex AI
|
|
@@ -381,8 +386,10 @@ def workload_create(args) -> None:
|
|
|
381
386
|
all_storages = []
|
|
382
387
|
# Currently storage customization is not supported for Pathways workloads. b/408468941
|
|
383
388
|
if not args.use_pathways:
|
|
384
|
-
storages: list[Storage] =
|
|
385
|
-
|
|
389
|
+
storages: list[Storage] = (
|
|
390
|
+
[]
|
|
391
|
+
if k8s_api_client is None
|
|
392
|
+
else get_storages_to_mount(k8s_api_client, args.storage)
|
|
386
393
|
)
|
|
387
394
|
gcs_fuse_storages = list(
|
|
388
395
|
filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
|
|
@@ -450,8 +457,8 @@ def workload_create(args) -> None:
|
|
|
450
457
|
- action: FailJobSet
|
|
451
458
|
onJobFailureReasons:
|
|
452
459
|
- PodFailurePolicy"""
|
|
453
|
-
|
|
454
|
-
restart_on_exit_codes = ','.join(map(str,
|
|
460
|
+
restart_on_exit_codes_list = get_restart_exit_codes(args)
|
|
461
|
+
restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes_list))
|
|
455
462
|
pod_failure_policy = f"""
|
|
456
463
|
podFailurePolicy:
|
|
457
464
|
rules:
|
|
@@ -567,14 +574,14 @@ def workload_create(args) -> None:
|
|
|
567
574
|
pod_failure_policy=pod_failure_policy,
|
|
568
575
|
)
|
|
569
576
|
tmp = write_tmp_file(yml_string)
|
|
570
|
-
command = f'kubectl apply -f {str(tmp
|
|
577
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
571
578
|
return_code = run_command_with_updates(command, 'Creating Workload', args)
|
|
572
579
|
|
|
573
580
|
if return_code != 0:
|
|
574
581
|
xpk_print(f'Create Workload request returned ERROR {return_code}')
|
|
575
582
|
xpk_exit(return_code)
|
|
576
583
|
|
|
577
|
-
if not args.use_pathways:
|
|
584
|
+
if not args.use_pathways and not is_dry_run():
|
|
578
585
|
add_bucket_iam_members(args, storages)
|
|
579
586
|
|
|
580
587
|
# Get GKE outlier dashboard for TPU
|
|
@@ -723,7 +730,11 @@ def workload_delete(args) -> None:
|
|
|
723
730
|
)
|
|
724
731
|
else:
|
|
725
732
|
return_code = run_commands(
|
|
726
|
-
commands,
|
|
733
|
+
commands,
|
|
734
|
+
'Delete Workload',
|
|
735
|
+
task_names,
|
|
736
|
+
batch=100,
|
|
737
|
+
dry_run=args.dry_run,
|
|
727
738
|
)
|
|
728
739
|
|
|
729
740
|
if return_code != 0:
|
|
@@ -741,8 +752,6 @@ def workload_list(args) -> None:
|
|
|
741
752
|
Returns:
|
|
742
753
|
0 if successful and 1 otherwise.
|
|
743
754
|
"""
|
|
744
|
-
xpk_print(args)
|
|
745
|
-
|
|
746
755
|
xpk_print('Starting workload list', flush=True)
|
|
747
756
|
add_zone_and_project(args)
|
|
748
757
|
get_cluster_credentials(args)
|
|
@@ -760,4 +769,8 @@ def workload_list(args) -> None:
|
|
|
760
769
|
xpk_print(f'List Job request returned ERROR {return_code}')
|
|
761
770
|
xpk_exit(return_code)
|
|
762
771
|
xpk_print(f'Workload List Output:\n{return_value}')
|
|
772
|
+
|
|
773
|
+
workload_list_gcp_link = get_jobsets_list_gcp_link(project=args.project)
|
|
774
|
+
xpk_print(f'See your workloads in Cloud Console: {workload_list_gcp_link}')
|
|
775
|
+
|
|
763
776
|
xpk_exit(0)
|
|
@@ -34,7 +34,7 @@ from ..system_characteristics import get_system_characteristics_by_device_type
|
|
|
34
34
|
from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
|
|
35
35
|
from ..kueue import KUEUE_VERSION
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
yaml_parser = yaml.YAML()
|
|
38
38
|
|
|
39
39
|
a3high_device_type = H100_DEVICE_TYPE
|
|
40
40
|
a3mega_device_type = H100_MEGA_DEVICE_TYPE
|
|
@@ -52,7 +52,7 @@ blueprint_dependencies_dir = {
|
|
|
52
52
|
}
|
|
53
53
|
|
|
54
54
|
cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
|
|
55
|
-
cluster_toolkit_version = "v1.
|
|
55
|
+
cluster_toolkit_version = "v1.62.2"
|
|
56
56
|
|
|
57
57
|
|
|
58
58
|
class BlueprintGeneratorOutput:
|
|
@@ -1019,7 +1019,7 @@ class BlueprintGenerator:
|
|
|
1019
1019
|
) -> str:
|
|
1020
1020
|
blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
|
|
1021
1021
|
with open(blueprint_path, "w+", encoding="utf-8") as blueprint_file:
|
|
1022
|
-
|
|
1022
|
+
yaml_parser.dump(xpk_blueprint, blueprint_file)
|
|
1023
1023
|
return blueprint_path
|
|
1024
1024
|
|
|
1025
1025
|
def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
|
|
@@ -1033,7 +1033,7 @@ class BlueprintGenerator:
|
|
|
1033
1033
|
ensure_directory_exists(storage_path_with_prefix)
|
|
1034
1034
|
return storage_path_with_prefix
|
|
1035
1035
|
|
|
1036
|
-
def blueprint_exists(self, blueprint_name, prefix: str = ""):
|
|
1036
|
+
def blueprint_exists(self, blueprint_name, prefix: str = "") -> bool:
|
|
1037
1037
|
blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
|
|
1038
1038
|
return os.path.exists(blueprint_path)
|
|
1039
1039
|
|
|
@@ -1061,6 +1061,6 @@ class BlueprintGenerator:
|
|
|
1061
1061
|
}
|
|
1062
1062
|
|
|
1063
1063
|
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1064
|
+
yaml_parser.register_class(Blueprint)
|
|
1065
|
+
yaml_parser.register_class(DeploymentGroup)
|
|
1066
|
+
yaml_parser.register_class(DeploymentModule)
|