xpk 0.12.0__tar.gz → 0.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xpk-0.12.0/src/xpk.egg-info → xpk-0.13.0}/PKG-INFO +1 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/batch.py +8 -8
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/cluster.py +9 -8
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/common.py +4 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/inspector.py +1 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/job.py +30 -2
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/storage.py +5 -2
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/workload.py +16 -9
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/cluster.py +5 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/cluster_private.py +3 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/commands.py +10 -7
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/config.py +1 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_image.py +14 -5
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_resources.py +9 -4
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/jobset.py +1 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/kjob.py +5 -2
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/kueue.py +22 -6
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/nap.py +1 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/network.py +1 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/nodepool.py +8 -3
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/pathways.py +6 -2
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/ray.py +1 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/resources.py +17 -7
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/scheduling.py +4 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/main.py +4 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/cluster.py +43 -0
- xpk-0.13.0/src/xpk/utils/execution_context.py +28 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/file.py +25 -10
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/network.py +4 -0
- {xpk-0.12.0 → xpk-0.13.0/src/xpk.egg-info}/PKG-INFO +1 -1
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/SOURCES.txt +1 -0
- {xpk-0.12.0 → xpk-0.13.0}/LICENSE +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/README.md +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/pyproject.toml +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/setup.cfg +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/api/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/api/storage_crd.yaml +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/cluster_gcluster.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/config.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/info.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/kind.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/kjob_common.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/run.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/shell.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/commands/version.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/blueprint/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/blueprint/blueprint_definitions.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/blueprint/blueprint_generator.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/blueprint/blueprint_test.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/capacity.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/config_test.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_container.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/docker_manager.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/filestore.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/gcloud_context.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/gcluster_manager.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/gcsfuse.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/monitoring.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/mtc.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/nodepool_test.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/remote_state/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/remote_state/fuse_remote_state.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/remote_state/remote_state_client.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/storage.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/system_characteristics.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/vertex.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/rdma_decorator.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/storage_decorator.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/tcpx_decorator.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/tcpx_decorator_test.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_decorators/tcpxo_decorator.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/core/workload_test.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/batch.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/common.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/config.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/core.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/info.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/inspector.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/job.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/kind.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/run.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/shell.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/storage.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/validators.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/version.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/parser/workload.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/templates/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/templates/storage.yaml +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/__init__.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/console.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/gcs_utils.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/kubectl.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/objects.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/templates.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/validation.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk/utils/yaml.py +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/dependency_links.txt +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/entry_points.txt +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/requires.txt +0 -0
- {xpk-0.12.0 → xpk-0.13.0}/src/xpk.egg-info/top_level.txt +0 -0
|
@@ -31,6 +31,7 @@ from ..core.kjob import (
|
|
|
31
31
|
)
|
|
32
32
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
33
33
|
from ..utils.console import xpk_exit, xpk_print
|
|
34
|
+
from ..utils.execution_context import is_dry_run
|
|
34
35
|
from .kind import set_local_cluster_command
|
|
35
36
|
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
36
37
|
|
|
@@ -51,18 +52,16 @@ def batch(args: Namespace) -> None:
|
|
|
51
52
|
if set_cluster_command_code != 0:
|
|
52
53
|
xpk_exit(set_cluster_command_code)
|
|
53
54
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
55
|
+
if not is_dry_run():
|
|
56
|
+
err_code = prepare_kjob(args)
|
|
57
|
+
if err_code > 0:
|
|
58
|
+
xpk_exit(err_code)
|
|
59
|
+
setup_k8s_service_accounts()
|
|
58
60
|
|
|
59
61
|
submit_job(args)
|
|
60
62
|
|
|
61
63
|
|
|
62
64
|
def submit_job(args: Namespace) -> None:
|
|
63
|
-
|
|
64
|
-
setup_k8s_service_accounts()
|
|
65
|
-
|
|
66
65
|
cmd = (
|
|
67
66
|
'kubectl kjob create slurm'
|
|
68
67
|
f' --profile {AppProfileDefaults.NAME.value}'
|
|
@@ -73,7 +72,8 @@ def submit_job(args: Namespace) -> None:
|
|
|
73
72
|
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
74
73
|
cmd = add_TAS_annotations_to_command(args, cmd)
|
|
75
74
|
|
|
76
|
-
|
|
75
|
+
annotations = [] if is_dry_run() else get_storage_annotations(args)
|
|
76
|
+
for annotation in annotations:
|
|
77
77
|
cmd += f' --pod-template-annotation {annotation}'
|
|
78
78
|
|
|
79
79
|
if args.ignore_unknown_flags:
|
|
@@ -76,6 +76,7 @@ from ..core.vertex import create_vertex_tensorboard
|
|
|
76
76
|
from ..core.workload import get_workload_list
|
|
77
77
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
78
78
|
from ..utils.file import write_tmp_file
|
|
79
|
+
from ..utils.execution_context import is_dry_run
|
|
79
80
|
from . import cluster_gcluster
|
|
80
81
|
from .common import set_cluster_command
|
|
81
82
|
import shutil
|
|
@@ -128,9 +129,10 @@ def cluster_adapt(args) -> None:
|
|
|
128
129
|
|
|
129
130
|
get_cluster_credentials(args)
|
|
130
131
|
|
|
131
|
-
|
|
132
|
+
if not is_dry_run():
|
|
133
|
+
k8s_client = setup_k8s_env(args)
|
|
134
|
+
install_storage_crd(k8s_client)
|
|
132
135
|
|
|
133
|
-
install_storage_crd(k8s_client)
|
|
134
136
|
install_storage_csis(args)
|
|
135
137
|
|
|
136
138
|
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
@@ -251,9 +253,10 @@ def cluster_create(args) -> None:
|
|
|
251
253
|
if update_coredns_command_code != 0:
|
|
252
254
|
xpk_exit(update_cluster_command_code)
|
|
253
255
|
|
|
254
|
-
|
|
256
|
+
if not is_dry_run():
|
|
257
|
+
k8s_client = setup_k8s_env(args)
|
|
258
|
+
install_storage_crd(k8s_client)
|
|
255
259
|
|
|
256
|
-
install_storage_crd(k8s_client)
|
|
257
260
|
install_storage_csis(args)
|
|
258
261
|
|
|
259
262
|
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
@@ -409,10 +412,8 @@ def cluster_cacheimage(args) -> None:
|
|
|
409
412
|
nodeSelectorKey=node_selector_key,
|
|
410
413
|
)
|
|
411
414
|
tmp = write_tmp_file(yml_string)
|
|
412
|
-
command_apply = f'kubectl apply -f {str(tmp.file.name)}'
|
|
413
|
-
command_delete = (
|
|
414
|
-
f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true'
|
|
415
|
-
)
|
|
415
|
+
command_apply = f'kubectl apply -f {str(tmp)}'
|
|
416
|
+
command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'
|
|
416
417
|
|
|
417
418
|
return_code = run_command_with_updates(
|
|
418
419
|
command_delete, 'Deleting Cached Image', args
|
|
@@ -18,6 +18,7 @@ from ..core.commands import run_command_with_updates_retry
|
|
|
18
18
|
from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
|
|
19
19
|
from ..core.gcloud_context import zone_to_region
|
|
20
20
|
from ..utils.console import xpk_print, xpk_exit
|
|
21
|
+
from ..utils.execution_context import is_dry_run
|
|
21
22
|
from ..core.system_characteristics import (
|
|
22
23
|
SystemCharacteristics,
|
|
23
24
|
)
|
|
@@ -63,6 +64,9 @@ def is_TAS_possible(
|
|
|
63
64
|
True if possible and False otherwise.
|
|
64
65
|
"""
|
|
65
66
|
|
|
67
|
+
if is_dry_run():
|
|
68
|
+
return True
|
|
69
|
+
|
|
66
70
|
if system_characteristics is None:
|
|
67
71
|
xpk_print('system_characteristics data was not found in configmaps.')
|
|
68
72
|
xpk_exit(1)
|
|
@@ -346,7 +346,7 @@ def inspector(args) -> None:
|
|
|
346
346
|
)
|
|
347
347
|
|
|
348
348
|
# Summarize inspector:
|
|
349
|
-
xpk_print(f'Find xpk inspector output file: {inspector_file
|
|
349
|
+
xpk_print(f'Find xpk inspector output file: {inspector_file}')
|
|
350
350
|
|
|
351
351
|
if final_return_code != 0:
|
|
352
352
|
xpk_print(
|
|
@@ -28,6 +28,28 @@ from ..utils.console import xpk_exit, xpk_print
|
|
|
28
28
|
from .kind import set_local_cluster_command
|
|
29
29
|
|
|
30
30
|
|
|
31
|
+
JOBS_DRY_RUN_YAML = """
|
|
32
|
+
items:
|
|
33
|
+
- apiVersion: slurm.k8s.io/v1alpha1
|
|
34
|
+
kind: SlurmJob
|
|
35
|
+
metadata:
|
|
36
|
+
annotations:
|
|
37
|
+
kjobctl.x-k8s.io/script: echo hello
|
|
38
|
+
creationTimestamp: '2024-04-29T12:00:00Z'
|
|
39
|
+
labels:
|
|
40
|
+
kjobctl.x-k8s.io/app-profile: default
|
|
41
|
+
name: golden-job
|
|
42
|
+
namespace: default
|
|
43
|
+
spec:
|
|
44
|
+
script: echo hello
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
PODS_DRY_RUN_RESULT = """
|
|
48
|
+
foo-pod 2/2 Running 0 2d
|
|
49
|
+
bar-pod 1/1 Evicted 0 1d
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
|
|
31
53
|
def job_info(args):
|
|
32
54
|
"""Run commands obtaining information about a job given by name.
|
|
33
55
|
|
|
@@ -52,7 +74,10 @@ def job_info(args):
|
|
|
52
74
|
f' metadata.name=={job_name}'
|
|
53
75
|
)
|
|
54
76
|
job_code, job_text = run_command_for_value(
|
|
55
|
-
job_command,
|
|
77
|
+
job_command,
|
|
78
|
+
'Getting job info',
|
|
79
|
+
args,
|
|
80
|
+
dry_run_return_val=JOBS_DRY_RUN_YAML,
|
|
56
81
|
)
|
|
57
82
|
if job_code != 0:
|
|
58
83
|
xpk_print(f'Job info request returned ERROR {job_code}')
|
|
@@ -60,7 +85,10 @@ def job_info(args):
|
|
|
60
85
|
|
|
61
86
|
pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
|
|
62
87
|
pods_code, pods_text = run_command_for_value(
|
|
63
|
-
pods_command,
|
|
88
|
+
pods_command,
|
|
89
|
+
'Getting pods list',
|
|
90
|
+
args,
|
|
91
|
+
dry_run_return_val=PODS_DRY_RUN_RESULT,
|
|
64
92
|
)
|
|
65
93
|
if pods_code != 0:
|
|
66
94
|
xpk_print(f'Pods list request returned ERROR {pods_code}')
|
|
@@ -58,6 +58,7 @@ from ..core.storage import (
|
|
|
58
58
|
)
|
|
59
59
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
60
60
|
from ..utils.kubectl import apply_kubectl_manifest
|
|
61
|
+
from ..utils.execution_context import is_dry_run
|
|
61
62
|
|
|
62
63
|
|
|
63
64
|
def storage_create(args: Namespace) -> None:
|
|
@@ -243,8 +244,10 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
|
|
|
243
244
|
|
|
244
245
|
|
|
245
246
|
def storage_list(args: Namespace) -> None:
|
|
246
|
-
|
|
247
|
-
|
|
247
|
+
storages = []
|
|
248
|
+
if not is_dry_run():
|
|
249
|
+
k8s_api_client = setup_k8s_env(args)
|
|
250
|
+
storages = list_storages(k8s_api_client)
|
|
248
251
|
print_storages_for_cluster(storages)
|
|
249
252
|
|
|
250
253
|
|
|
@@ -97,6 +97,7 @@ from ..core.workload_decorators import (
|
|
|
97
97
|
)
|
|
98
98
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
99
99
|
from ..utils.file import write_tmp_file
|
|
100
|
+
from ..utils.execution_context import is_dry_run
|
|
100
101
|
from . import cluster_gcluster
|
|
101
102
|
from .common import is_TAS_possible
|
|
102
103
|
|
|
@@ -306,8 +307,10 @@ def workload_create(args) -> None:
|
|
|
306
307
|
Returns:
|
|
307
308
|
0 if successful and 1 otherwise.
|
|
308
309
|
"""
|
|
309
|
-
k8s_api_client =
|
|
310
|
-
|
|
310
|
+
k8s_api_client = None
|
|
311
|
+
if not is_dry_run():
|
|
312
|
+
k8s_api_client = setup_k8s_env(args)
|
|
313
|
+
setup_k8s_service_accounts()
|
|
311
314
|
|
|
312
315
|
workload_exists = check_if_workload_exists(args)
|
|
313
316
|
|
|
@@ -383,8 +386,10 @@ def workload_create(args) -> None:
|
|
|
383
386
|
all_storages = []
|
|
384
387
|
# Currently storage customization is not supported for Pathways workloads. b/408468941
|
|
385
388
|
if not args.use_pathways:
|
|
386
|
-
storages: list[Storage] =
|
|
387
|
-
|
|
389
|
+
storages: list[Storage] = (
|
|
390
|
+
[]
|
|
391
|
+
if k8s_api_client is None
|
|
392
|
+
else get_storages_to_mount(k8s_api_client, args.storage)
|
|
388
393
|
)
|
|
389
394
|
gcs_fuse_storages = list(
|
|
390
395
|
filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
|
|
@@ -569,14 +574,14 @@ def workload_create(args) -> None:
|
|
|
569
574
|
pod_failure_policy=pod_failure_policy,
|
|
570
575
|
)
|
|
571
576
|
tmp = write_tmp_file(yml_string)
|
|
572
|
-
command = f'kubectl apply -f {str(tmp
|
|
577
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
573
578
|
return_code = run_command_with_updates(command, 'Creating Workload', args)
|
|
574
579
|
|
|
575
580
|
if return_code != 0:
|
|
576
581
|
xpk_print(f'Create Workload request returned ERROR {return_code}')
|
|
577
582
|
xpk_exit(return_code)
|
|
578
583
|
|
|
579
|
-
if not args.use_pathways:
|
|
584
|
+
if not args.use_pathways and not is_dry_run():
|
|
580
585
|
add_bucket_iam_members(args, storages)
|
|
581
586
|
|
|
582
587
|
# Get GKE outlier dashboard for TPU
|
|
@@ -725,7 +730,11 @@ def workload_delete(args) -> None:
|
|
|
725
730
|
)
|
|
726
731
|
else:
|
|
727
732
|
return_code = run_commands(
|
|
728
|
-
commands,
|
|
733
|
+
commands,
|
|
734
|
+
'Delete Workload',
|
|
735
|
+
task_names,
|
|
736
|
+
batch=100,
|
|
737
|
+
dry_run=args.dry_run,
|
|
729
738
|
)
|
|
730
739
|
|
|
731
740
|
if return_code != 0:
|
|
@@ -743,8 +752,6 @@ def workload_list(args) -> None:
|
|
|
743
752
|
Returns:
|
|
744
753
|
0 if successful and 1 otherwise.
|
|
745
754
|
"""
|
|
746
|
-
xpk_print(args)
|
|
747
|
-
|
|
748
755
|
xpk_print('Starting workload list', flush=True)
|
|
749
756
|
add_zone_and_project(args)
|
|
750
757
|
get_cluster_credentials(args)
|
|
@@ -442,7 +442,11 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
|
|
|
442
442
|
if not getattr(args, 'kind_cluster', False):
|
|
443
443
|
add_zone_and_project(args)
|
|
444
444
|
get_cluster_credentials(args)
|
|
445
|
-
args.project_number = project_id_to_project_number(args.project)
|
|
445
|
+
args.project_number = (
|
|
446
|
+
project_id_to_project_number(args.project)
|
|
447
|
+
if not args.dry_run
|
|
448
|
+
else abs(hash(args.project) % (10**12)) # 12 digit hash
|
|
449
|
+
)
|
|
446
450
|
|
|
447
451
|
config.load_kube_config()
|
|
448
452
|
return k8s_client.ApiClient()
|
|
@@ -19,6 +19,7 @@ from ..utils.network import (
|
|
|
19
19
|
add_current_machine_to_networks,
|
|
20
20
|
is_current_machine_in_any_network,
|
|
21
21
|
)
|
|
22
|
+
from ..utils.execution_context import is_dry_run
|
|
22
23
|
from ..utils.objects import is_text_true
|
|
23
24
|
from .commands import run_command_for_value, run_command_with_updates
|
|
24
25
|
from .gcloud_context import zone_to_region
|
|
@@ -37,7 +38,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int:
|
|
|
37
38
|
if not args.private and args.authorized_networks is None:
|
|
38
39
|
xpk_print('Cluster is public and no need to authorize networks.')
|
|
39
40
|
return 0
|
|
40
|
-
|
|
41
|
+
elif not is_dry_run():
|
|
41
42
|
xpk_print(
|
|
42
43
|
'Cannot convert an existing public cluster to private. The arguments'
|
|
43
44
|
' --private and --authorized-networks are not acceptable for public'
|
|
@@ -164,6 +165,7 @@ def get_cluster_authorized_networks(args) -> list[str]:
|
|
|
164
165
|
command,
|
|
165
166
|
'Fetching the list of authorized network from cluster describe.',
|
|
166
167
|
args,
|
|
168
|
+
dry_run_return_val='127.0.0.1/32',
|
|
167
169
|
)
|
|
168
170
|
|
|
169
171
|
if return_code != 0:
|
|
@@ -78,14 +78,13 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
|
|
|
78
78
|
The max return code and a list of all the return codes.
|
|
79
79
|
"""
|
|
80
80
|
|
|
81
|
+
files = [open(f, 'w', encoding='utf-8') for f in output_logs]
|
|
81
82
|
children = []
|
|
82
83
|
start_time = datetime.datetime.now()
|
|
83
|
-
for
|
|
84
|
+
for command, file in zip(commands, files):
|
|
84
85
|
children.append(
|
|
85
86
|
# subprocess managed by list pylint: disable=consider-using-with
|
|
86
|
-
subprocess.Popen(
|
|
87
|
-
command, stdout=output_logs[i], stderr=output_logs[i], shell=True
|
|
88
|
-
)
|
|
87
|
+
subprocess.Popen(command, stdout=file, stderr=file, shell=True)
|
|
89
88
|
)
|
|
90
89
|
|
|
91
90
|
while True:
|
|
@@ -99,7 +98,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
|
|
|
99
98
|
slow_worker_text = per_command_name[slow_worker_index]
|
|
100
99
|
slow_str = (
|
|
101
100
|
f', task {slow_worker_text} still working, logfile'
|
|
102
|
-
f' {output_logs[slow_worker_index]
|
|
101
|
+
f' {output_logs[slow_worker_index]}'
|
|
103
102
|
)
|
|
104
103
|
else:
|
|
105
104
|
slow_str = ''
|
|
@@ -116,7 +115,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
|
|
|
116
115
|
)
|
|
117
116
|
xpk_print(
|
|
118
117
|
f'Failure is {per_command_name[failing_index]}'
|
|
119
|
-
f' and logfile {output_logs[failing_index]
|
|
118
|
+
f' and logfile {output_logs[failing_index]}'
|
|
120
119
|
)
|
|
121
120
|
for child in children:
|
|
122
121
|
child.terminate()
|
|
@@ -126,6 +125,10 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
|
|
|
126
125
|
break
|
|
127
126
|
|
|
128
127
|
time.sleep(1)
|
|
128
|
+
|
|
129
|
+
for file in files:
|
|
130
|
+
file.close()
|
|
131
|
+
|
|
129
132
|
return max_returncode, returncodes
|
|
130
133
|
|
|
131
134
|
|
|
@@ -351,6 +354,6 @@ def run_command_with_full_controls(
|
|
|
351
354
|
|
|
352
355
|
def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int:
|
|
353
356
|
tmp = write_tmp_file(yml_string)
|
|
354
|
-
command = f'kubectl apply -f {str(tmp
|
|
357
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
355
358
|
err_code = run_command_with_updates(command, task, args)
|
|
356
359
|
return err_code
|
|
@@ -22,7 +22,7 @@ from ..utils import file
|
|
|
22
22
|
from ..utils.console import xpk_print
|
|
23
23
|
|
|
24
24
|
# This is the version for XPK PyPI package
|
|
25
|
-
__version__ = 'v0.12.0'
|
|
25
|
+
__version__ = 'v0.13.0'
|
|
26
26
|
XPK_CURRENT_VERSION = __version__
|
|
27
27
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
28
28
|
|
|
@@ -21,6 +21,7 @@ import string
|
|
|
21
21
|
|
|
22
22
|
from ..utils.console import xpk_exit, xpk_print
|
|
23
23
|
from ..utils.file import write_tmp_file
|
|
24
|
+
from ..utils.execution_context import is_dry_run
|
|
24
25
|
from .commands import run_command_with_updates
|
|
25
26
|
|
|
26
27
|
DEFAULT_DOCKER_IMAGE = 'python:3.10'
|
|
@@ -75,7 +76,9 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
|
|
|
75
76
|
"""
|
|
76
77
|
|
|
77
78
|
# Pick a name for the docker image.
|
|
78
|
-
docker_image_prefix = os.getenv('USER', 'unknown')
|
|
79
|
+
docker_image_prefix = (
|
|
80
|
+
'dry-run' if is_dry_run() else os.getenv('USER', 'unknown')
|
|
81
|
+
)
|
|
79
82
|
docker_name = f'{docker_image_prefix}-runner'
|
|
80
83
|
|
|
81
84
|
script_dir_dockerfile = """FROM {base_docker_image}
|
|
@@ -94,7 +97,7 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
|
|
|
94
97
|
)
|
|
95
98
|
tmp = write_tmp_file(docker_file)
|
|
96
99
|
docker_build_command = (
|
|
97
|
-
f'docker buildx build --platform={PLATFORM} -f {str(tmp
|
|
100
|
+
f'docker buildx build --platform={PLATFORM} -f {str(tmp)} -t'
|
|
98
101
|
f' {docker_name} {args.script_dir}'
|
|
99
102
|
)
|
|
100
103
|
xpk_print(f'Building {args.script_dir} into docker image.')
|
|
@@ -114,10 +117,16 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
|
|
|
114
117
|
|
|
115
118
|
# Pick a randomly generated `tag_length` character docker tag.
|
|
116
119
|
tag_length = 4
|
|
117
|
-
tag_random_prefix =
|
|
118
|
-
|
|
120
|
+
tag_random_prefix = (
|
|
121
|
+
'prefix'
|
|
122
|
+
if is_dry_run()
|
|
123
|
+
else ''.join(random.choices(string.ascii_lowercase, k=tag_length))
|
|
124
|
+
)
|
|
125
|
+
tag_datetime = (
|
|
126
|
+
'current'
|
|
127
|
+
if is_dry_run()
|
|
128
|
+
else datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
|
|
119
129
|
)
|
|
120
|
-
tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
|
|
121
130
|
tag_name = f'{tag_random_prefix}-{tag_datetime}'
|
|
122
131
|
cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}'
|
|
123
132
|
xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}')
|
|
@@ -20,6 +20,7 @@ from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
|
20
20
|
from .cluster import setup_k8s_env
|
|
21
21
|
from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount
|
|
22
22
|
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
23
|
+
from ..utils.execution_context import is_dry_run
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
def get_main_container_resources(
|
|
@@ -272,8 +273,10 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
|
|
|
272
273
|
- name: shared-data
|
|
273
274
|
"""
|
|
274
275
|
|
|
275
|
-
storages: list[Storage] =
|
|
276
|
-
|
|
276
|
+
storages: list[Storage] = (
|
|
277
|
+
[]
|
|
278
|
+
if is_dry_run()
|
|
279
|
+
else get_storages_to_mount(setup_k8s_env(args), args.storage)
|
|
277
280
|
)
|
|
278
281
|
for storage in storages:
|
|
279
282
|
if storage.type in {
|
|
@@ -325,8 +328,10 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
|
|
|
325
328
|
elif system.accelerator_type == AcceleratorType['GPU']:
|
|
326
329
|
volume_mount_yaml = ''
|
|
327
330
|
|
|
328
|
-
storages: list[Storage] =
|
|
329
|
-
|
|
331
|
+
storages: list[Storage] = (
|
|
332
|
+
[]
|
|
333
|
+
if is_dry_run()
|
|
334
|
+
else get_storages_to_mount(setup_k8s_env(args), args.storage)
|
|
330
335
|
)
|
|
331
336
|
for storage in storages:
|
|
332
337
|
if storage.type in {
|
|
@@ -134,7 +134,7 @@ def update_jobset_resources_if_necessary(args):
|
|
|
134
134
|
memory_limit_size=new_memory_limit,
|
|
135
135
|
)
|
|
136
136
|
tmp = write_tmp_file(yml_string)
|
|
137
|
-
command = f'kubectl apply -f {str(tmp
|
|
137
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
138
138
|
|
|
139
139
|
task = 'Updating jobset Controller Manager resources'
|
|
140
140
|
return_code = run_command_with_updates_retry(command, task, args)
|
|
@@ -23,6 +23,7 @@ from kubernetes.client import ApiClient
|
|
|
23
23
|
from kubernetes.client.rest import ApiException
|
|
24
24
|
|
|
25
25
|
from ..utils import templates
|
|
26
|
+
from ..utils.execution_context import is_dry_run
|
|
26
27
|
from ..utils.console import xpk_exit, xpk_print
|
|
27
28
|
from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
28
29
|
from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
|
|
@@ -368,8 +369,10 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int:
|
|
|
368
369
|
def prepare_kjob(args: Namespace) -> int:
|
|
369
370
|
system = get_cluster_system_characteristics(args)
|
|
370
371
|
|
|
371
|
-
|
|
372
|
-
|
|
372
|
+
storages = []
|
|
373
|
+
if not is_dry_run():
|
|
374
|
+
k8s_api_client = setup_k8s_env(args)
|
|
375
|
+
storages = get_auto_mount_storages(k8s_api_client)
|
|
373
376
|
|
|
374
377
|
service_account = ""
|
|
375
378
|
if len(storages) > 0:
|
|
@@ -436,6 +436,8 @@ def install_kueue_crs(
|
|
|
436
436
|
cluster_hardware_name=cluster_hardware_name,
|
|
437
437
|
resource_type=resource_type,
|
|
438
438
|
total_chips=total_chips,
|
|
439
|
+
cpu_limit=args.cpu_limit,
|
|
440
|
+
memory_limit=args.memory_limit,
|
|
439
441
|
)
|
|
440
442
|
topology_label = ''
|
|
441
443
|
if system.device_type in [
|
|
@@ -474,7 +476,7 @@ def install_kueue_crs(
|
|
|
474
476
|
yml_string = topology_yaml + yml_string
|
|
475
477
|
|
|
476
478
|
tmp = write_tmp_file(yml_string)
|
|
477
|
-
command = f'kubectl apply -f {str(tmp
|
|
479
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
478
480
|
|
|
479
481
|
task = 'Applying Kueue Custom Resources'
|
|
480
482
|
return_code = run_command_with_updates_retry(command, task, args)
|
|
@@ -484,7 +486,7 @@ def install_kueue_crs(
|
|
|
484
486
|
|
|
485
487
|
|
|
486
488
|
def get_kueue_covered_resources_config(
|
|
487
|
-
cluster_hardware_name, resource_type, total_chips
|
|
489
|
+
cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit
|
|
488
490
|
) -> str:
|
|
489
491
|
"""Gets Kueue covered resources configuration.
|
|
490
492
|
|
|
@@ -497,17 +499,31 @@ def get_kueue_covered_resources_config(
|
|
|
497
499
|
A string of Kueue covered resources configuration.
|
|
498
500
|
"""
|
|
499
501
|
config_format = """
|
|
500
|
-
- coveredResources:
|
|
502
|
+
- coveredResources: {resource_types}
|
|
501
503
|
flavors:
|
|
502
504
|
- name: {cluster_hardware_name}
|
|
503
505
|
resources:
|
|
504
506
|
- name: "{resource_type}"
|
|
505
|
-
nominalQuota: {total_chips}
|
|
506
|
-
|
|
507
|
+
nominalQuota: {total_chips}"""
|
|
508
|
+
resource_types = [resource_type]
|
|
509
|
+
if cpu_limit:
|
|
510
|
+
config_format = config_format + """
|
|
511
|
+
- name: "cpu"
|
|
512
|
+
nominalQuota: {cpu_limit}"""
|
|
513
|
+
resource_types.append('cpu')
|
|
514
|
+
if memory_limit:
|
|
515
|
+
config_format = config_format + """
|
|
516
|
+
- name: "memory"
|
|
517
|
+
nominalQuota: {memory_limit}"""
|
|
518
|
+
resource_types.append('memory')
|
|
519
|
+
|
|
507
520
|
config_string = config_format.format(
|
|
508
521
|
cluster_hardware_name=cluster_hardware_name,
|
|
522
|
+
resource_types=resource_types,
|
|
509
523
|
resource_type=resource_type,
|
|
510
524
|
total_chips=total_chips,
|
|
525
|
+
cpu_limit=cpu_limit,
|
|
526
|
+
memory_limit=memory_limit,
|
|
511
527
|
)
|
|
512
528
|
return config_string
|
|
513
529
|
|
|
@@ -536,7 +552,7 @@ def update_kueue_resources_if_necessary(args):
|
|
|
536
552
|
memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION
|
|
537
553
|
)
|
|
538
554
|
tmp = write_tmp_file(yml_string)
|
|
539
|
-
command = f'kubectl apply -f {str(tmp
|
|
555
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
540
556
|
|
|
541
557
|
task = 'Updating Kueue Controller Manager resources'
|
|
542
558
|
return_code = run_command_with_updates_retry(command, task, args)
|
|
@@ -250,7 +250,7 @@ def create_autoprovisioning_config(
|
|
|
250
250
|
zones=f'- {args.zone}',
|
|
251
251
|
)
|
|
252
252
|
autoprovisioning_config = AutoprovisioningConfig(
|
|
253
|
-
config_filename=write_tmp_file(yml_string)
|
|
253
|
+
config_filename=write_tmp_file(yml_string),
|
|
254
254
|
minimum_chips=minimum,
|
|
255
255
|
maximum_chips=maximum,
|
|
256
256
|
)
|
|
@@ -221,7 +221,7 @@ def create_cluster_network_config(args) -> int:
|
|
|
221
221
|
"""
|
|
222
222
|
yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
|
|
223
223
|
tmp = write_tmp_file(yml_string)
|
|
224
|
-
command = f'kubectl apply -f {str(tmp
|
|
224
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
225
225
|
|
|
226
226
|
return_code = run_command_with_updates(
|
|
227
227
|
command, 'GKE Cluster Create Network Config', args
|
|
@@ -265,7 +265,9 @@ def run_gke_node_pool_create_command(
|
|
|
265
265
|
)
|
|
266
266
|
configmap_yml = {}
|
|
267
267
|
configmap_yml[resources_configmap_name] = resources_yml
|
|
268
|
-
return_code = create_or_update_cluster_configmap(
|
|
268
|
+
return_code = create_or_update_cluster_configmap(
|
|
269
|
+
configmap_yml, args.dry_run
|
|
270
|
+
)
|
|
269
271
|
if return_code != 0:
|
|
270
272
|
return 1
|
|
271
273
|
|
|
@@ -461,7 +463,7 @@ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]:
|
|
|
461
463
|
f' --region={zone_to_region(args.zone)} --format="value(locations)"'
|
|
462
464
|
)
|
|
463
465
|
return_code, nodepool_zone = run_command_for_value(
|
|
464
|
-
command, 'Get Node Pool Zone', args
|
|
466
|
+
command, 'Get Node Pool Zone', args, dry_run_return_val=args.zone
|
|
465
467
|
)
|
|
466
468
|
if return_code != 0:
|
|
467
469
|
xpk_print(f'Get Node Pool Zone returned ERROR {return_code}')
|
|
@@ -570,7 +572,10 @@ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
|
|
|
570
572
|
for i, command in enumerate(commands):
|
|
571
573
|
xpk_print(f'To complete {task_names[i]} we are executing {command}')
|
|
572
574
|
max_return_code = run_commands(
|
|
573
|
-
commands,
|
|
575
|
+
commands,
|
|
576
|
+
'Update GKE node pools to default RAPID GKE version',
|
|
577
|
+
task_names,
|
|
578
|
+
dry_run=args.dry_run,
|
|
574
579
|
)
|
|
575
580
|
if max_return_code != 0:
|
|
576
581
|
xpk_print(
|
|
@@ -19,6 +19,7 @@ from ..core.docker_container import get_user_workload_container
|
|
|
19
19
|
from ..core.gcloud_context import zone_to_region
|
|
20
20
|
from ..core.nodepool import get_all_nodepools_programmatic
|
|
21
21
|
from ..utils.console import xpk_exit, xpk_print
|
|
22
|
+
from ..utils.execution_context import is_dry_run
|
|
22
23
|
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
23
24
|
|
|
24
25
|
|
|
@@ -79,7 +80,10 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
|
|
|
79
80
|
# Ensure the cluster and CPU nodepools were created with create-pathways
|
|
80
81
|
all_node_pools = get_all_nodepools_programmatic(args)
|
|
81
82
|
desired_pw_cpu_node_pools = {'cpu-np'}
|
|
82
|
-
if
|
|
83
|
+
if (
|
|
84
|
+
not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0]))
|
|
85
|
+
and not is_dry_run()
|
|
86
|
+
):
|
|
83
87
|
xpk_print(
|
|
84
88
|
'Cluster needs to be created with `xpk create-pathways` to run'
|
|
85
89
|
' Pathways workloads.'
|
|
@@ -322,7 +326,7 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
|
|
|
322
326
|
return_code = run_command_with_updates(commands[0], 'Delete Workload', args)
|
|
323
327
|
else:
|
|
324
328
|
return_code = run_commands(
|
|
325
|
-
commands, 'Delete Workload', task_names, batch=100
|
|
329
|
+
commands, 'Delete Workload', task_names, batch=100, dry_run=args.dry_run
|
|
326
330
|
)
|
|
327
331
|
|
|
328
332
|
if return_code != 0:
|
|
@@ -132,7 +132,7 @@ def install_ray_cluster(args, system) -> int:
|
|
|
132
132
|
)
|
|
133
133
|
|
|
134
134
|
tmp = write_tmp_file(yml_string)
|
|
135
|
-
command = f'kubectl apply -f {str(tmp
|
|
135
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
136
136
|
task = 'Applying RayCluster'
|
|
137
137
|
retry_attempts = 1
|
|
138
138
|
return_code = run_command_with_updates_retry(
|