xpk 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +8 -8
- xpk/commands/cluster.py +9 -8
- xpk/commands/common.py +4 -0
- xpk/commands/inspector.py +1 -1
- xpk/commands/job.py +30 -2
- xpk/commands/storage.py +5 -2
- xpk/commands/workload.py +16 -9
- xpk/core/cluster.py +5 -1
- xpk/core/cluster_private.py +3 -1
- xpk/core/commands.py +10 -7
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +14 -5
- xpk/core/docker_resources.py +9 -4
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +5 -2
- xpk/core/kueue.py +22 -6
- xpk/core/nap.py +1 -1
- xpk/core/network.py +1 -1
- xpk/core/nodepool.py +8 -3
- xpk/core/pathways.py +6 -2
- xpk/core/ray.py +1 -1
- xpk/core/resources.py +17 -7
- xpk/core/scheduling.py +4 -0
- xpk/main.py +4 -1
- xpk/parser/cluster.py +43 -0
- xpk/utils/execution_context.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/network.py +4 -0
- {xpk-0.12.0.dist-info → xpk-0.13.0.dist-info}/METADATA +1 -1
- {xpk-0.12.0.dist-info → xpk-0.13.0.dist-info}/RECORD +34 -33
- {xpk-0.12.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
- {xpk-0.12.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.12.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.12.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
xpk/commands/batch.py
CHANGED
|
@@ -31,6 +31,7 @@ from ..core.kjob import (
|
|
|
31
31
|
)
|
|
32
32
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
33
33
|
from ..utils.console import xpk_exit, xpk_print
|
|
34
|
+
from ..utils.execution_context import is_dry_run
|
|
34
35
|
from .kind import set_local_cluster_command
|
|
35
36
|
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
36
37
|
|
|
@@ -51,18 +52,16 @@ def batch(args: Namespace) -> None:
|
|
|
51
52
|
if set_cluster_command_code != 0:
|
|
52
53
|
xpk_exit(set_cluster_command_code)
|
|
53
54
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
55
|
+
if not is_dry_run():
|
|
56
|
+
err_code = prepare_kjob(args)
|
|
57
|
+
if err_code > 0:
|
|
58
|
+
xpk_exit(err_code)
|
|
59
|
+
setup_k8s_service_accounts()
|
|
58
60
|
|
|
59
61
|
submit_job(args)
|
|
60
62
|
|
|
61
63
|
|
|
62
64
|
def submit_job(args: Namespace) -> None:
|
|
63
|
-
|
|
64
|
-
setup_k8s_service_accounts()
|
|
65
|
-
|
|
66
65
|
cmd = (
|
|
67
66
|
'kubectl kjob create slurm'
|
|
68
67
|
f' --profile {AppProfileDefaults.NAME.value}'
|
|
@@ -73,7 +72,8 @@ def submit_job(args: Namespace) -> None:
|
|
|
73
72
|
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
74
73
|
cmd = add_TAS_annotations_to_command(args, cmd)
|
|
75
74
|
|
|
76
|
-
|
|
75
|
+
annotations = [] if is_dry_run() else get_storage_annotations(args)
|
|
76
|
+
for annotation in annotations:
|
|
77
77
|
cmd += f' --pod-template-annotation {annotation}'
|
|
78
78
|
|
|
79
79
|
if args.ignore_unknown_flags:
|
xpk/commands/cluster.py
CHANGED
|
@@ -76,6 +76,7 @@ from ..core.vertex import create_vertex_tensorboard
|
|
|
76
76
|
from ..core.workload import get_workload_list
|
|
77
77
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
78
78
|
from ..utils.file import write_tmp_file
|
|
79
|
+
from ..utils.execution_context import is_dry_run
|
|
79
80
|
from . import cluster_gcluster
|
|
80
81
|
from .common import set_cluster_command
|
|
81
82
|
import shutil
|
|
@@ -128,9 +129,10 @@ def cluster_adapt(args) -> None:
|
|
|
128
129
|
|
|
129
130
|
get_cluster_credentials(args)
|
|
130
131
|
|
|
131
|
-
|
|
132
|
+
if not is_dry_run():
|
|
133
|
+
k8s_client = setup_k8s_env(args)
|
|
134
|
+
install_storage_crd(k8s_client)
|
|
132
135
|
|
|
133
|
-
install_storage_crd(k8s_client)
|
|
134
136
|
install_storage_csis(args)
|
|
135
137
|
|
|
136
138
|
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
@@ -251,9 +253,10 @@ def cluster_create(args) -> None:
|
|
|
251
253
|
if update_coredns_command_code != 0:
|
|
252
254
|
xpk_exit(update_cluster_command_code)
|
|
253
255
|
|
|
254
|
-
|
|
256
|
+
if not is_dry_run():
|
|
257
|
+
k8s_client = setup_k8s_env(args)
|
|
258
|
+
install_storage_crd(k8s_client)
|
|
255
259
|
|
|
256
|
-
install_storage_crd(k8s_client)
|
|
257
260
|
install_storage_csis(args)
|
|
258
261
|
|
|
259
262
|
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
@@ -409,10 +412,8 @@ def cluster_cacheimage(args) -> None:
|
|
|
409
412
|
nodeSelectorKey=node_selector_key,
|
|
410
413
|
)
|
|
411
414
|
tmp = write_tmp_file(yml_string)
|
|
412
|
-
command_apply = f'kubectl apply -f {str(tmp.file.name)}'
|
|
413
|
-
command_delete = (
|
|
414
|
-
f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true'
|
|
415
|
-
)
|
|
415
|
+
command_apply = f'kubectl apply -f {str(tmp)}'
|
|
416
|
+
command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'
|
|
416
417
|
|
|
417
418
|
return_code = run_command_with_updates(
|
|
418
419
|
command_delete, 'Deleting Cached Image', args
|
xpk/commands/common.py
CHANGED
|
@@ -18,6 +18,7 @@ from ..core.commands import run_command_with_updates_retry
|
|
|
18
18
|
from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
|
|
19
19
|
from ..core.gcloud_context import zone_to_region
|
|
20
20
|
from ..utils.console import xpk_print, xpk_exit
|
|
21
|
+
from ..utils.execution_context import is_dry_run
|
|
21
22
|
from ..core.system_characteristics import (
|
|
22
23
|
SystemCharacteristics,
|
|
23
24
|
)
|
|
@@ -63,6 +64,9 @@ def is_TAS_possible(
|
|
|
63
64
|
True if possible and False otherwise.
|
|
64
65
|
"""
|
|
65
66
|
|
|
67
|
+
if is_dry_run():
|
|
68
|
+
return True
|
|
69
|
+
|
|
66
70
|
if system_characteristics is None:
|
|
67
71
|
xpk_print('system_characteristics data was not found in configmaps.')
|
|
68
72
|
xpk_exit(1)
|
xpk/commands/inspector.py
CHANGED
|
@@ -346,7 +346,7 @@ def inspector(args) -> None:
|
|
|
346
346
|
)
|
|
347
347
|
|
|
348
348
|
# Summarize inspector:
|
|
349
|
-
xpk_print(f'Find xpk inspector output file: {inspector_file
|
|
349
|
+
xpk_print(f'Find xpk inspector output file: {inspector_file}')
|
|
350
350
|
|
|
351
351
|
if final_return_code != 0:
|
|
352
352
|
xpk_print(
|
xpk/commands/job.py
CHANGED
|
@@ -28,6 +28,28 @@ from ..utils.console import xpk_exit, xpk_print
|
|
|
28
28
|
from .kind import set_local_cluster_command
|
|
29
29
|
|
|
30
30
|
|
|
31
|
+
JOBS_DRY_RUN_YAML = """
|
|
32
|
+
items:
|
|
33
|
+
- apiVersion: slurm.k8s.io/v1alpha1
|
|
34
|
+
kind: SlurmJob
|
|
35
|
+
metadata:
|
|
36
|
+
annotations:
|
|
37
|
+
kjobctl.x-k8s.io/script: echo hello
|
|
38
|
+
creationTimestamp: '2024-04-29T12:00:00Z'
|
|
39
|
+
labels:
|
|
40
|
+
kjobctl.x-k8s.io/app-profile: default
|
|
41
|
+
name: golden-job
|
|
42
|
+
namespace: default
|
|
43
|
+
spec:
|
|
44
|
+
script: echo hello
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
PODS_DRY_RUN_RESULT = """
|
|
48
|
+
foo-pod 2/2 Running 0 2d
|
|
49
|
+
bar-pod 1/1 Evicted 0 1d
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
|
|
31
53
|
def job_info(args):
|
|
32
54
|
"""Run commands obtaining information about a job given by name.
|
|
33
55
|
|
|
@@ -52,7 +74,10 @@ def job_info(args):
|
|
|
52
74
|
f' metadata.name=={job_name}'
|
|
53
75
|
)
|
|
54
76
|
job_code, job_text = run_command_for_value(
|
|
55
|
-
job_command,
|
|
77
|
+
job_command,
|
|
78
|
+
'Getting job info',
|
|
79
|
+
args,
|
|
80
|
+
dry_run_return_val=JOBS_DRY_RUN_YAML,
|
|
56
81
|
)
|
|
57
82
|
if job_code != 0:
|
|
58
83
|
xpk_print(f'Job info request returned ERROR {job_code}')
|
|
@@ -60,7 +85,10 @@ def job_info(args):
|
|
|
60
85
|
|
|
61
86
|
pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
|
|
62
87
|
pods_code, pods_text = run_command_for_value(
|
|
63
|
-
pods_command,
|
|
88
|
+
pods_command,
|
|
89
|
+
'Getting pods list',
|
|
90
|
+
args,
|
|
91
|
+
dry_run_return_val=PODS_DRY_RUN_RESULT,
|
|
64
92
|
)
|
|
65
93
|
if pods_code != 0:
|
|
66
94
|
xpk_print(f'Pods list request returned ERROR {pods_code}')
|
xpk/commands/storage.py
CHANGED
|
@@ -58,6 +58,7 @@ from ..core.storage import (
|
|
|
58
58
|
)
|
|
59
59
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
60
60
|
from ..utils.kubectl import apply_kubectl_manifest
|
|
61
|
+
from ..utils.execution_context import is_dry_run
|
|
61
62
|
|
|
62
63
|
|
|
63
64
|
def storage_create(args: Namespace) -> None:
|
|
@@ -243,8 +244,10 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
|
|
|
243
244
|
|
|
244
245
|
|
|
245
246
|
def storage_list(args: Namespace) -> None:
|
|
246
|
-
|
|
247
|
-
|
|
247
|
+
storages = []
|
|
248
|
+
if not is_dry_run():
|
|
249
|
+
k8s_api_client = setup_k8s_env(args)
|
|
250
|
+
storages = list_storages(k8s_api_client)
|
|
248
251
|
print_storages_for_cluster(storages)
|
|
249
252
|
|
|
250
253
|
|
xpk/commands/workload.py
CHANGED
|
@@ -97,6 +97,7 @@ from ..core.workload_decorators import (
|
|
|
97
97
|
)
|
|
98
98
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
99
99
|
from ..utils.file import write_tmp_file
|
|
100
|
+
from ..utils.execution_context import is_dry_run
|
|
100
101
|
from . import cluster_gcluster
|
|
101
102
|
from .common import is_TAS_possible
|
|
102
103
|
|
|
@@ -306,8 +307,10 @@ def workload_create(args) -> None:
|
|
|
306
307
|
Returns:
|
|
307
308
|
0 if successful and 1 otherwise.
|
|
308
309
|
"""
|
|
309
|
-
k8s_api_client =
|
|
310
|
-
|
|
310
|
+
k8s_api_client = None
|
|
311
|
+
if not is_dry_run():
|
|
312
|
+
k8s_api_client = setup_k8s_env(args)
|
|
313
|
+
setup_k8s_service_accounts()
|
|
311
314
|
|
|
312
315
|
workload_exists = check_if_workload_exists(args)
|
|
313
316
|
|
|
@@ -383,8 +386,10 @@ def workload_create(args) -> None:
|
|
|
383
386
|
all_storages = []
|
|
384
387
|
# Currently storage customization is not supported for Pathways workloads. b/408468941
|
|
385
388
|
if not args.use_pathways:
|
|
386
|
-
storages: list[Storage] =
|
|
387
|
-
|
|
389
|
+
storages: list[Storage] = (
|
|
390
|
+
[]
|
|
391
|
+
if k8s_api_client is None
|
|
392
|
+
else get_storages_to_mount(k8s_api_client, args.storage)
|
|
388
393
|
)
|
|
389
394
|
gcs_fuse_storages = list(
|
|
390
395
|
filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
|
|
@@ -569,14 +574,14 @@ def workload_create(args) -> None:
|
|
|
569
574
|
pod_failure_policy=pod_failure_policy,
|
|
570
575
|
)
|
|
571
576
|
tmp = write_tmp_file(yml_string)
|
|
572
|
-
command = f'kubectl apply -f {str(tmp.file.name)}'
|
|
577
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
573
578
|
return_code = run_command_with_updates(command, 'Creating Workload', args)
|
|
574
579
|
|
|
575
580
|
if return_code != 0:
|
|
576
581
|
xpk_print(f'Create Workload request returned ERROR {return_code}')
|
|
577
582
|
xpk_exit(return_code)
|
|
578
583
|
|
|
579
|
-
if not args.use_pathways:
|
|
584
|
+
if not args.use_pathways and not is_dry_run():
|
|
580
585
|
add_bucket_iam_members(args, storages)
|
|
581
586
|
|
|
582
587
|
# Get GKE outlier dashboard for TPU
|
|
@@ -725,7 +730,11 @@ def workload_delete(args) -> None:
|
|
|
725
730
|
)
|
|
726
731
|
else:
|
|
727
732
|
return_code = run_commands(
|
|
728
|
-
commands,
|
|
733
|
+
commands,
|
|
734
|
+
'Delete Workload',
|
|
735
|
+
task_names,
|
|
736
|
+
batch=100,
|
|
737
|
+
dry_run=args.dry_run,
|
|
729
738
|
)
|
|
730
739
|
|
|
731
740
|
if return_code != 0:
|
|
@@ -743,8 +752,6 @@ def workload_list(args) -> None:
|
|
|
743
752
|
Returns:
|
|
744
753
|
0 if successful and 1 otherwise.
|
|
745
754
|
"""
|
|
746
|
-
xpk_print(args)
|
|
747
|
-
|
|
748
755
|
xpk_print('Starting workload list', flush=True)
|
|
749
756
|
add_zone_and_project(args)
|
|
750
757
|
get_cluster_credentials(args)
|
xpk/core/cluster.py
CHANGED
|
@@ -442,7 +442,11 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
|
|
|
442
442
|
if not getattr(args, 'kind_cluster', False):
|
|
443
443
|
add_zone_and_project(args)
|
|
444
444
|
get_cluster_credentials(args)
|
|
445
|
-
args.project_number =
|
|
445
|
+
args.project_number = (
|
|
446
|
+
project_id_to_project_number(args.project)
|
|
447
|
+
if not args.dry_run
|
|
448
|
+
else abs(hash(args.project) % (10**12)) # 12 digit hash
|
|
449
|
+
)
|
|
446
450
|
|
|
447
451
|
config.load_kube_config()
|
|
448
452
|
return k8s_client.ApiClient()
|
xpk/core/cluster_private.py
CHANGED
|
@@ -19,6 +19,7 @@ from ..utils.network import (
|
|
|
19
19
|
add_current_machine_to_networks,
|
|
20
20
|
is_current_machine_in_any_network,
|
|
21
21
|
)
|
|
22
|
+
from ..utils.execution_context import is_dry_run
|
|
22
23
|
from ..utils.objects import is_text_true
|
|
23
24
|
from .commands import run_command_for_value, run_command_with_updates
|
|
24
25
|
from .gcloud_context import zone_to_region
|
|
@@ -37,7 +38,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int:
|
|
|
37
38
|
if not args.private and args.authorized_networks is None:
|
|
38
39
|
xpk_print('Cluster is public and no need to authorize networks.')
|
|
39
40
|
return 0
|
|
40
|
-
|
|
41
|
+
elif not is_dry_run():
|
|
41
42
|
xpk_print(
|
|
42
43
|
'Cannot convert an existing public cluster to private. The arguments'
|
|
43
44
|
' --private and --authorized-networks are not acceptable for public'
|
|
@@ -164,6 +165,7 @@ def get_cluster_authorized_networks(args) -> list[str]:
|
|
|
164
165
|
command,
|
|
165
166
|
'Fetching the list of authorized network from cluster describe.',
|
|
166
167
|
args,
|
|
168
|
+
dry_run_return_val='127.0.0.1/32',
|
|
167
169
|
)
|
|
168
170
|
|
|
169
171
|
if return_code != 0:
|
xpk/core/commands.py
CHANGED
|
@@ -78,14 +78,13 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
|
|
|
78
78
|
The max return code and a list of all the return codes.
|
|
79
79
|
"""
|
|
80
80
|
|
|
81
|
+
files = [open(f, 'w', encoding='utf-8') for f in output_logs]
|
|
81
82
|
children = []
|
|
82
83
|
start_time = datetime.datetime.now()
|
|
83
|
-
for
|
|
84
|
+
for command, file in zip(commands, files):
|
|
84
85
|
children.append(
|
|
85
86
|
# subprocess managed by list pylint: disable=consider-using-with
|
|
86
|
-
subprocess.Popen(
|
|
87
|
-
command, stdout=output_logs[i], stderr=output_logs[i], shell=True
|
|
88
|
-
)
|
|
87
|
+
subprocess.Popen(command, stdout=file, stderr=file, shell=True)
|
|
89
88
|
)
|
|
90
89
|
|
|
91
90
|
while True:
|
|
@@ -99,7 +98,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
|
|
|
99
98
|
slow_worker_text = per_command_name[slow_worker_index]
|
|
100
99
|
slow_str = (
|
|
101
100
|
f', task {slow_worker_text} still working, logfile'
|
|
102
|
-
f' {output_logs[slow_worker_index]
|
|
101
|
+
f' {output_logs[slow_worker_index]}'
|
|
103
102
|
)
|
|
104
103
|
else:
|
|
105
104
|
slow_str = ''
|
|
@@ -116,7 +115,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
|
|
|
116
115
|
)
|
|
117
116
|
xpk_print(
|
|
118
117
|
f'Failure is {per_command_name[failing_index]}'
|
|
119
|
-
f' and logfile {output_logs[failing_index]
|
|
118
|
+
f' and logfile {output_logs[failing_index]}'
|
|
120
119
|
)
|
|
121
120
|
for child in children:
|
|
122
121
|
child.terminate()
|
|
@@ -126,6 +125,10 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
|
|
|
126
125
|
break
|
|
127
126
|
|
|
128
127
|
time.sleep(1)
|
|
128
|
+
|
|
129
|
+
for file in files:
|
|
130
|
+
file.close()
|
|
131
|
+
|
|
129
132
|
return max_returncode, returncodes
|
|
130
133
|
|
|
131
134
|
|
|
@@ -351,6 +354,6 @@ def run_command_with_full_controls(
|
|
|
351
354
|
|
|
352
355
|
def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int:
|
|
353
356
|
tmp = write_tmp_file(yml_string)
|
|
354
|
-
command = f'kubectl apply -f {str(tmp.file.name)}'
|
|
357
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
355
358
|
err_code = run_command_with_updates(command, task, args)
|
|
356
359
|
return err_code
|
xpk/core/config.py
CHANGED
|
@@ -22,7 +22,7 @@ from ..utils import file
|
|
|
22
22
|
from ..utils.console import xpk_print
|
|
23
23
|
|
|
24
24
|
# This is the version for XPK PyPI package
|
|
25
|
-
__version__ = 'v0.12.0'
|
|
25
|
+
__version__ = 'v0.13.0'
|
|
26
26
|
XPK_CURRENT_VERSION = __version__
|
|
27
27
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
28
28
|
|
xpk/core/docker_image.py
CHANGED
|
@@ -21,6 +21,7 @@ import string
|
|
|
21
21
|
|
|
22
22
|
from ..utils.console import xpk_exit, xpk_print
|
|
23
23
|
from ..utils.file import write_tmp_file
|
|
24
|
+
from ..utils.execution_context import is_dry_run
|
|
24
25
|
from .commands import run_command_with_updates
|
|
25
26
|
|
|
26
27
|
DEFAULT_DOCKER_IMAGE = 'python:3.10'
|
|
@@ -75,7 +76,9 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
|
|
|
75
76
|
"""
|
|
76
77
|
|
|
77
78
|
# Pick a name for the docker image.
|
|
78
|
-
docker_image_prefix =
|
|
79
|
+
docker_image_prefix = (
|
|
80
|
+
'dry-run' if is_dry_run() else os.getenv('USER', 'unknown')
|
|
81
|
+
)
|
|
79
82
|
docker_name = f'{docker_image_prefix}-runner'
|
|
80
83
|
|
|
81
84
|
script_dir_dockerfile = """FROM {base_docker_image}
|
|
@@ -94,7 +97,7 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
|
|
|
94
97
|
)
|
|
95
98
|
tmp = write_tmp_file(docker_file)
|
|
96
99
|
docker_build_command = (
|
|
97
|
-
f'docker buildx build --platform={PLATFORM} -f {str(tmp.file.name)} -t'
|
|
100
|
+
f'docker buildx build --platform={PLATFORM} -f {str(tmp)} -t'
|
|
98
101
|
f' {docker_name} {args.script_dir}'
|
|
99
102
|
)
|
|
100
103
|
xpk_print(f'Building {args.script_dir} into docker image.')
|
|
@@ -114,10 +117,16 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
|
|
|
114
117
|
|
|
115
118
|
# Pick a randomly generated `tag_length` character docker tag.
|
|
116
119
|
tag_length = 4
|
|
117
|
-
tag_random_prefix =
|
|
118
|
-
|
|
120
|
+
tag_random_prefix = (
|
|
121
|
+
'prefix'
|
|
122
|
+
if is_dry_run()
|
|
123
|
+
else ''.join(random.choices(string.ascii_lowercase, k=tag_length))
|
|
124
|
+
)
|
|
125
|
+
tag_datetime = (
|
|
126
|
+
'current'
|
|
127
|
+
if is_dry_run()
|
|
128
|
+
else datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
|
|
119
129
|
)
|
|
120
|
-
tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
|
|
121
130
|
tag_name = f'{tag_random_prefix}-{tag_datetime}'
|
|
122
131
|
cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}'
|
|
123
132
|
xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}')
|
xpk/core/docker_resources.py
CHANGED
|
@@ -20,6 +20,7 @@ from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
|
20
20
|
from .cluster import setup_k8s_env
|
|
21
21
|
from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount
|
|
22
22
|
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
23
|
+
from ..utils.execution_context import is_dry_run
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
def get_main_container_resources(
|
|
@@ -272,8 +273,10 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
|
|
|
272
273
|
- name: shared-data
|
|
273
274
|
"""
|
|
274
275
|
|
|
275
|
-
storages: list[Storage] =
|
|
276
|
-
|
|
276
|
+
storages: list[Storage] = (
|
|
277
|
+
[]
|
|
278
|
+
if is_dry_run()
|
|
279
|
+
else get_storages_to_mount(setup_k8s_env(args), args.storage)
|
|
277
280
|
)
|
|
278
281
|
for storage in storages:
|
|
279
282
|
if storage.type in {
|
|
@@ -325,8 +328,10 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
|
|
|
325
328
|
elif system.accelerator_type == AcceleratorType['GPU']:
|
|
326
329
|
volume_mount_yaml = ''
|
|
327
330
|
|
|
328
|
-
storages: list[Storage] =
|
|
329
|
-
|
|
331
|
+
storages: list[Storage] = (
|
|
332
|
+
[]
|
|
333
|
+
if is_dry_run()
|
|
334
|
+
else get_storages_to_mount(setup_k8s_env(args), args.storage)
|
|
330
335
|
)
|
|
331
336
|
for storage in storages:
|
|
332
337
|
if storage.type in {
|
xpk/core/jobset.py
CHANGED
|
@@ -134,7 +134,7 @@ def update_jobset_resources_if_necessary(args):
|
|
|
134
134
|
memory_limit_size=new_memory_limit,
|
|
135
135
|
)
|
|
136
136
|
tmp = write_tmp_file(yml_string)
|
|
137
|
-
command = f'kubectl apply -f {str(tmp.file.name)}'
|
|
137
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
138
138
|
|
|
139
139
|
task = 'Updating jobset Controller Manager resources'
|
|
140
140
|
return_code = run_command_with_updates_retry(command, task, args)
|
xpk/core/kjob.py
CHANGED
|
@@ -23,6 +23,7 @@ from kubernetes.client import ApiClient
|
|
|
23
23
|
from kubernetes.client.rest import ApiException
|
|
24
24
|
|
|
25
25
|
from ..utils import templates
|
|
26
|
+
from ..utils.execution_context import is_dry_run
|
|
26
27
|
from ..utils.console import xpk_exit, xpk_print
|
|
27
28
|
from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
28
29
|
from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
|
|
@@ -368,8 +369,10 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int:
|
|
|
368
369
|
def prepare_kjob(args: Namespace) -> int:
|
|
369
370
|
system = get_cluster_system_characteristics(args)
|
|
370
371
|
|
|
371
|
-
|
|
372
|
-
|
|
372
|
+
storages = []
|
|
373
|
+
if not is_dry_run():
|
|
374
|
+
k8s_api_client = setup_k8s_env(args)
|
|
375
|
+
storages = get_auto_mount_storages(k8s_api_client)
|
|
373
376
|
|
|
374
377
|
service_account = ""
|
|
375
378
|
if len(storages) > 0:
|
xpk/core/kueue.py
CHANGED
|
@@ -436,6 +436,8 @@ def install_kueue_crs(
|
|
|
436
436
|
cluster_hardware_name=cluster_hardware_name,
|
|
437
437
|
resource_type=resource_type,
|
|
438
438
|
total_chips=total_chips,
|
|
439
|
+
cpu_limit=args.cpu_limit,
|
|
440
|
+
memory_limit=args.memory_limit,
|
|
439
441
|
)
|
|
440
442
|
topology_label = ''
|
|
441
443
|
if system.device_type in [
|
|
@@ -474,7 +476,7 @@ def install_kueue_crs(
|
|
|
474
476
|
yml_string = topology_yaml + yml_string
|
|
475
477
|
|
|
476
478
|
tmp = write_tmp_file(yml_string)
|
|
477
|
-
command = f'kubectl apply -f {str(tmp.file.name)}'
|
|
479
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
478
480
|
|
|
479
481
|
task = 'Applying Kueue Custom Resources'
|
|
480
482
|
return_code = run_command_with_updates_retry(command, task, args)
|
|
@@ -484,7 +486,7 @@ def install_kueue_crs(
|
|
|
484
486
|
|
|
485
487
|
|
|
486
488
|
def get_kueue_covered_resources_config(
|
|
487
|
-
cluster_hardware_name, resource_type, total_chips
|
|
489
|
+
cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit
|
|
488
490
|
) -> str:
|
|
489
491
|
"""Gets Kueue covered resources configuration.
|
|
490
492
|
|
|
@@ -497,17 +499,31 @@ def get_kueue_covered_resources_config(
|
|
|
497
499
|
A string of Kueue covered resources configuration.
|
|
498
500
|
"""
|
|
499
501
|
config_format = """
|
|
500
|
-
- coveredResources:
|
|
502
|
+
- coveredResources: {resource_types}
|
|
501
503
|
flavors:
|
|
502
504
|
- name: {cluster_hardware_name}
|
|
503
505
|
resources:
|
|
504
506
|
- name: "{resource_type}"
|
|
505
|
-
nominalQuota: {total_chips}
|
|
506
|
-
|
|
507
|
+
nominalQuota: {total_chips}"""
|
|
508
|
+
resource_types = [resource_type]
|
|
509
|
+
if cpu_limit:
|
|
510
|
+
config_format = config_format + """
|
|
511
|
+
- name: "cpu"
|
|
512
|
+
nominalQuota: {cpu_limit}"""
|
|
513
|
+
resource_types.append('cpu')
|
|
514
|
+
if memory_limit:
|
|
515
|
+
config_format = config_format + """
|
|
516
|
+
- name: "memory"
|
|
517
|
+
nominalQuota: {memory_limit}"""
|
|
518
|
+
resource_types.append('memory')
|
|
519
|
+
|
|
507
520
|
config_string = config_format.format(
|
|
508
521
|
cluster_hardware_name=cluster_hardware_name,
|
|
522
|
+
resource_types=resource_types,
|
|
509
523
|
resource_type=resource_type,
|
|
510
524
|
total_chips=total_chips,
|
|
525
|
+
cpu_limit=cpu_limit,
|
|
526
|
+
memory_limit=memory_limit,
|
|
511
527
|
)
|
|
512
528
|
return config_string
|
|
513
529
|
|
|
@@ -536,7 +552,7 @@ def update_kueue_resources_if_necessary(args):
|
|
|
536
552
|
memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION
|
|
537
553
|
)
|
|
538
554
|
tmp = write_tmp_file(yml_string)
|
|
539
|
-
command = f'kubectl apply -f {str(tmp.file.name)}'
|
|
555
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
540
556
|
|
|
541
557
|
task = 'Updating Kueue Controller Manager resources'
|
|
542
558
|
return_code = run_command_with_updates_retry(command, task, args)
|
xpk/core/nap.py
CHANGED
|
@@ -250,7 +250,7 @@ def create_autoprovisioning_config(
|
|
|
250
250
|
zones=f'- {args.zone}',
|
|
251
251
|
)
|
|
252
252
|
autoprovisioning_config = AutoprovisioningConfig(
|
|
253
|
-
config_filename=write_tmp_file(yml_string)
|
|
253
|
+
config_filename=write_tmp_file(yml_string),
|
|
254
254
|
minimum_chips=minimum,
|
|
255
255
|
maximum_chips=maximum,
|
|
256
256
|
)
|
xpk/core/network.py
CHANGED
|
@@ -221,7 +221,7 @@ def create_cluster_network_config(args) -> int:
|
|
|
221
221
|
"""
|
|
222
222
|
yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
|
|
223
223
|
tmp = write_tmp_file(yml_string)
|
|
224
|
-
command = f'kubectl apply -f {str(tmp.file.name)}'
|
|
224
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
225
225
|
|
|
226
226
|
return_code = run_command_with_updates(
|
|
227
227
|
command, 'GKE Cluster Create Network Config', args
|
xpk/core/nodepool.py
CHANGED
|
@@ -265,7 +265,9 @@ def run_gke_node_pool_create_command(
|
|
|
265
265
|
)
|
|
266
266
|
configmap_yml = {}
|
|
267
267
|
configmap_yml[resources_configmap_name] = resources_yml
|
|
268
|
-
return_code = create_or_update_cluster_configmap(
|
|
268
|
+
return_code = create_or_update_cluster_configmap(
|
|
269
|
+
configmap_yml, args.dry_run
|
|
270
|
+
)
|
|
269
271
|
if return_code != 0:
|
|
270
272
|
return 1
|
|
271
273
|
|
|
@@ -461,7 +463,7 @@ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]:
|
|
|
461
463
|
f' --region={zone_to_region(args.zone)} --format="value(locations)"'
|
|
462
464
|
)
|
|
463
465
|
return_code, nodepool_zone = run_command_for_value(
|
|
464
|
-
command, 'Get Node Pool Zone', args
|
|
466
|
+
command, 'Get Node Pool Zone', args, dry_run_return_val=args.zone
|
|
465
467
|
)
|
|
466
468
|
if return_code != 0:
|
|
467
469
|
xpk_print(f'Get Node Pool Zone returned ERROR {return_code}')
|
|
@@ -570,7 +572,10 @@ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
|
|
|
570
572
|
for i, command in enumerate(commands):
|
|
571
573
|
xpk_print(f'To complete {task_names[i]} we are executing {command}')
|
|
572
574
|
max_return_code = run_commands(
|
|
573
|
-
commands,
|
|
575
|
+
commands,
|
|
576
|
+
'Update GKE node pools to default RAPID GKE version',
|
|
577
|
+
task_names,
|
|
578
|
+
dry_run=args.dry_run,
|
|
574
579
|
)
|
|
575
580
|
if max_return_code != 0:
|
|
576
581
|
xpk_print(
|
xpk/core/pathways.py
CHANGED
|
@@ -19,6 +19,7 @@ from ..core.docker_container import get_user_workload_container
|
|
|
19
19
|
from ..core.gcloud_context import zone_to_region
|
|
20
20
|
from ..core.nodepool import get_all_nodepools_programmatic
|
|
21
21
|
from ..utils.console import xpk_exit, xpk_print
|
|
22
|
+
from ..utils.execution_context import is_dry_run
|
|
22
23
|
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
23
24
|
|
|
24
25
|
|
|
@@ -79,7 +80,10 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
|
|
|
79
80
|
# Ensure the cluster and CPU nodepools were created with create-pathways
|
|
80
81
|
all_node_pools = get_all_nodepools_programmatic(args)
|
|
81
82
|
desired_pw_cpu_node_pools = {'cpu-np'}
|
|
82
|
-
if
|
|
83
|
+
if (
|
|
84
|
+
not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0]))
|
|
85
|
+
and not is_dry_run()
|
|
86
|
+
):
|
|
83
87
|
xpk_print(
|
|
84
88
|
'Cluster needs to be created with `xpk create-pathways` to run'
|
|
85
89
|
' Pathways workloads.'
|
|
@@ -322,7 +326,7 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
|
|
|
322
326
|
return_code = run_command_with_updates(commands[0], 'Delete Workload', args)
|
|
323
327
|
else:
|
|
324
328
|
return_code = run_commands(
|
|
325
|
-
commands, 'Delete Workload', task_names, batch=100
|
|
329
|
+
commands, 'Delete Workload', task_names, batch=100, dry_run=args.dry_run
|
|
326
330
|
)
|
|
327
331
|
|
|
328
332
|
if return_code != 0:
|
xpk/core/ray.py
CHANGED
|
@@ -132,7 +132,7 @@ def install_ray_cluster(args, system) -> int:
|
|
|
132
132
|
)
|
|
133
133
|
|
|
134
134
|
tmp = write_tmp_file(yml_string)
|
|
135
|
-
command = f'kubectl apply -f {str(tmp.file.name)}'
|
|
135
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
136
136
|
task = 'Applying RayCluster'
|
|
137
137
|
retry_attempts = 1
|
|
138
138
|
return_code = run_command_with_updates_retry(
|
xpk/core/resources.py
CHANGED
|
@@ -66,7 +66,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
|
|
|
66
66
|
)
|
|
67
67
|
|
|
68
68
|
return_code, return_value = run_command_for_value(
|
|
69
|
-
command,
|
|
69
|
+
command,
|
|
70
|
+
'GKE Cluster Get ConfigMap',
|
|
71
|
+
args,
|
|
72
|
+
dry_run_return_val='map[]',
|
|
70
73
|
)
|
|
71
74
|
if return_code != 0:
|
|
72
75
|
xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
|
|
@@ -81,8 +84,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
|
|
|
81
84
|
configs = return_value[4:-1].split(' ')
|
|
82
85
|
|
|
83
86
|
for config in configs:
|
|
84
|
-
|
|
85
|
-
|
|
87
|
+
parts = config.strip().split(':')
|
|
88
|
+
if len(parts) != 2:
|
|
89
|
+
continue
|
|
90
|
+
config_map[parts[0]] = parts[1]
|
|
86
91
|
return config_map
|
|
87
92
|
|
|
88
93
|
|
|
@@ -150,10 +155,12 @@ def create_cluster_configmaps(
|
|
|
150
155
|
args=args, name=metadata_configmap_name, data=metadata
|
|
151
156
|
)
|
|
152
157
|
configmap_yml[metadata_configmap_name] = metadata_yml
|
|
153
|
-
return create_or_update_cluster_configmap(configmap_yml)
|
|
158
|
+
return create_or_update_cluster_configmap(configmap_yml, args.dry_run)
|
|
154
159
|
|
|
155
160
|
|
|
156
|
-
def create_or_update_cluster_configmap(
|
|
161
|
+
def create_or_update_cluster_configmap(
|
|
162
|
+
configmap_yml: dict, dry_run: bool
|
|
163
|
+
) -> int:
|
|
157
164
|
"""
|
|
158
165
|
Args:
|
|
159
166
|
configmap_yml: dict containing ConfigMap name and yml string.
|
|
@@ -165,13 +172,16 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
|
|
|
165
172
|
task_names = []
|
|
166
173
|
for configmap_name, yml_string in configmap_yml.items():
|
|
167
174
|
tmp = write_tmp_file(yml_string)
|
|
168
|
-
command = f'kubectl apply -f {str(tmp.file.name)}'
|
|
175
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
169
176
|
commands.append(command)
|
|
170
177
|
task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
|
|
171
178
|
task_names.append(task_name)
|
|
172
179
|
|
|
173
180
|
return_code = run_commands(
|
|
174
|
-
commands,
|
|
181
|
+
commands,
|
|
182
|
+
'GKE Cluster CreateOrUpdate ConfigMap(s)',
|
|
183
|
+
task_names,
|
|
184
|
+
dry_run=dry_run,
|
|
175
185
|
)
|
|
176
186
|
if return_code != 0:
|
|
177
187
|
xpk_print(
|
xpk/core/scheduling.py
CHANGED
|
@@ -15,6 +15,7 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from ..utils.console import xpk_print
|
|
18
|
+
from ..utils.execution_context import is_dry_run
|
|
18
19
|
from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
|
|
19
20
|
from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
|
|
20
21
|
from .system_characteristics import (
|
|
@@ -45,6 +46,9 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
|
|
|
45
46
|
)
|
|
46
47
|
return True
|
|
47
48
|
|
|
49
|
+
if is_dry_run():
|
|
50
|
+
return True
|
|
51
|
+
|
|
48
52
|
# Check for gke accelerator type:
|
|
49
53
|
missing_gke_accelerator_type = False
|
|
50
54
|
if not cluster_config_map.get(system.gke_accelerator):
|
xpk/main.py
CHANGED
|
@@ -37,6 +37,7 @@ import sys
|
|
|
37
37
|
from .parser.core import set_parser
|
|
38
38
|
from .utils.console import xpk_print
|
|
39
39
|
from .utils.validation import validate_dependencies
|
|
40
|
+
from .utils.execution_context import set_dry_run
|
|
40
41
|
################### Compatibility Check ###################
|
|
41
42
|
# Check that the user runs the below version or greater.
|
|
42
43
|
|
|
@@ -63,9 +64,11 @@ def main() -> None:
|
|
|
63
64
|
set_parser(parser=parser)
|
|
64
65
|
|
|
65
66
|
xpk_print('Starting xpk', flush=True)
|
|
66
|
-
validate_dependencies()
|
|
67
67
|
main_args = parser.parse_args()
|
|
68
68
|
main_args.enable_ray_cluster = False
|
|
69
|
+
set_dry_run('dry_run' in main_args and main_args.dry_run)
|
|
70
|
+
if not main_args.dry_run:
|
|
71
|
+
validate_dependencies()
|
|
69
72
|
main_args.func(main_args)
|
|
70
73
|
xpk_print('XPK Done.', flush=True)
|
|
71
74
|
|
xpk/parser/cluster.py
CHANGED
|
@@ -174,6 +174,13 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
174
174
|
'Arguments for configuring MTC in cluster create.',
|
|
175
175
|
)
|
|
176
176
|
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
177
|
+
|
|
178
|
+
cluster_create_resource_limits = cluster_create_parser.add_argument_group(
|
|
179
|
+
'Optional Resource Limits Arguments',
|
|
180
|
+
'Arguments for configuring resource limits in cluster create.',
|
|
181
|
+
)
|
|
182
|
+
add_resource_limits(cluster_create_resource_limits)
|
|
183
|
+
|
|
177
184
|
cluster_create_parser.set_defaults(func=cluster_create)
|
|
178
185
|
|
|
179
186
|
|
|
@@ -245,6 +252,15 @@ def set_cluster_create_pathways_parser(
|
|
|
245
252
|
)
|
|
246
253
|
)
|
|
247
254
|
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
255
|
+
|
|
256
|
+
cluster_create_resource_limits = (
|
|
257
|
+
cluster_create_pathways_parser.add_argument_group(
|
|
258
|
+
'Optional Resource Limits Arguments',
|
|
259
|
+
'Arguments for configuring resource limits in cluster create.',
|
|
260
|
+
)
|
|
261
|
+
)
|
|
262
|
+
add_resource_limits(cluster_create_resource_limits)
|
|
263
|
+
|
|
248
264
|
cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
|
|
249
265
|
|
|
250
266
|
|
|
@@ -320,6 +336,13 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
|
320
336
|
'Arguments for configuring MTC in cluster create.',
|
|
321
337
|
)
|
|
322
338
|
add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
|
|
339
|
+
|
|
340
|
+
cluster_create_resource_limits = cluster_create_ray_parser.add_argument_group(
|
|
341
|
+
'Optional Resource Limits Arguments',
|
|
342
|
+
'Arguments for configuring resource limits in cluster create.',
|
|
343
|
+
)
|
|
344
|
+
add_resource_limits(cluster_create_resource_limits)
|
|
345
|
+
|
|
323
346
|
cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster)
|
|
324
347
|
|
|
325
348
|
|
|
@@ -887,3 +910,23 @@ def add_shared_cluster_create_mtc_arguments(
|
|
|
887
910
|
' checkpointing. By default, it is set to "google.com/tpu".'
|
|
888
911
|
),
|
|
889
912
|
)
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
def add_resource_limits(parser_or_group: ParserOrArgumentGroup):
|
|
916
|
+
"""Add resource limits arguments in cluster create.
|
|
917
|
+
|
|
918
|
+
Args:
|
|
919
|
+
List of cluster create resource limits arguments parsers or group
|
|
920
|
+
"""
|
|
921
|
+
parser_or_group.add_argument(
|
|
922
|
+
'--memory-limit',
|
|
923
|
+
type=str,
|
|
924
|
+
default=None,
|
|
925
|
+
help='The memory limit for the Kueue controller manager.',
|
|
926
|
+
)
|
|
927
|
+
parser_or_group.add_argument(
|
|
928
|
+
'--cpu-limit',
|
|
929
|
+
type=int,
|
|
930
|
+
default=None,
|
|
931
|
+
help='The CPU limit for the Kueue controller manager.',
|
|
932
|
+
)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
dry_run = False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def set_dry_run(value: bool) -> None:
|
|
21
|
+
"""Sets the dry_run flag."""
|
|
22
|
+
global dry_run
|
|
23
|
+
dry_run = value
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def is_dry_run() -> bool:
|
|
27
|
+
"""Returns the current value of the dry_run flag."""
|
|
28
|
+
return dry_run
|
xpk/utils/file.py
CHANGED
|
@@ -16,10 +16,11 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import tempfile
|
|
18
18
|
import os
|
|
19
|
-
|
|
19
|
+
import hashlib
|
|
20
|
+
from .execution_context import is_dry_run
|
|
20
21
|
|
|
21
22
|
|
|
22
|
-
def make_tmp_files(per_command_name):
|
|
23
|
+
def make_tmp_files(per_command_name: list[str]) -> list[str]:
|
|
23
24
|
"""Make temporary files for each command.
|
|
24
25
|
|
|
25
26
|
Args:
|
|
@@ -28,16 +29,19 @@ def make_tmp_files(per_command_name):
|
|
|
28
29
|
Returns:
|
|
29
30
|
A list of temporary files for each command.
|
|
30
31
|
"""
|
|
32
|
+
if is_dry_run():
|
|
33
|
+
return [_hash_filename(command) for command in per_command_name]
|
|
34
|
+
|
|
31
35
|
# Supports removal of spaces from command names before converting to file name.
|
|
32
36
|
return [
|
|
33
37
|
tempfile.NamedTemporaryFile(
|
|
34
38
|
delete=False, prefix=command.replace(' ', '-') + '-'
|
|
35
|
-
)
|
|
39
|
+
).file.name
|
|
36
40
|
for command in per_command_name
|
|
37
41
|
]
|
|
38
42
|
|
|
39
43
|
|
|
40
|
-
def write_tmp_file(payload):
|
|
44
|
+
def write_tmp_file(payload: str) -> str:
|
|
41
45
|
"""Writes `payload` to a temporary file.
|
|
42
46
|
|
|
43
47
|
Args:
|
|
@@ -46,14 +50,17 @@ def write_tmp_file(payload):
|
|
|
46
50
|
Returns:
|
|
47
51
|
A file object that was written to.
|
|
48
52
|
"""
|
|
53
|
+
if is_dry_run():
|
|
54
|
+
return _hash_filename(payload)
|
|
55
|
+
|
|
49
56
|
with tempfile.NamedTemporaryFile(delete=False) as tmp:
|
|
50
57
|
with open(file=tmp.name, mode='w', encoding='utf=8') as f:
|
|
51
58
|
f.write(payload)
|
|
52
59
|
f.flush()
|
|
53
|
-
return tmp
|
|
60
|
+
return tmp.file.name
|
|
54
61
|
|
|
55
62
|
|
|
56
|
-
def append_tmp_file(payload, file):
|
|
63
|
+
def append_tmp_file(payload: str, file: str) -> str:
|
|
57
64
|
"""Appends `payload` to an already created file.
|
|
58
65
|
|
|
59
66
|
Use `write_temporary_file` to create a file.
|
|
@@ -65,18 +72,26 @@ def append_tmp_file(payload, file):
|
|
|
65
72
|
Returns:
|
|
66
73
|
A file object that was written to.
|
|
67
74
|
"""
|
|
68
|
-
|
|
75
|
+
if is_dry_run():
|
|
76
|
+
return file
|
|
77
|
+
|
|
78
|
+
with open(file=file, mode='a', encoding='utf=8') as f:
|
|
69
79
|
f.write(payload)
|
|
70
80
|
f.flush()
|
|
71
81
|
return file
|
|
72
82
|
|
|
73
83
|
|
|
74
|
-
def ensure_directory_exists(directory_path):
|
|
84
|
+
def ensure_directory_exists(directory_path: str) -> None:
|
|
75
85
|
"""Checks if a directory exists and creates it if it doesn't.
|
|
76
86
|
|
|
77
87
|
Args:
|
|
78
88
|
directory_path: The path to the directory.
|
|
79
89
|
"""
|
|
80
|
-
if not os.path.exists(directory_path):
|
|
90
|
+
if not is_dry_run() and not os.path.exists(directory_path):
|
|
81
91
|
os.makedirs(directory_path)
|
|
82
|
-
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _hash_filename(seed: str) -> str:
|
|
95
|
+
m = hashlib.sha256()
|
|
96
|
+
m.update(seed.encode('utf-8'))
|
|
97
|
+
return m.hexdigest()
|
xpk/utils/network.py
CHANGED
|
@@ -18,6 +18,7 @@ import ipaddress
|
|
|
18
18
|
import socket
|
|
19
19
|
import requests
|
|
20
20
|
from .console import xpk_print
|
|
21
|
+
from .execution_context import is_dry_run
|
|
21
22
|
|
|
22
23
|
# Retrives machine's external IP address
|
|
23
24
|
ip_resolver_url = "http://api.ipify.org"
|
|
@@ -36,6 +37,9 @@ def get_current_machine_ip(external_ip=True):
|
|
|
36
37
|
The IP address as a string.
|
|
37
38
|
"""
|
|
38
39
|
|
|
40
|
+
if is_dry_run():
|
|
41
|
+
return 0, "127.0.0.1"
|
|
42
|
+
|
|
39
43
|
try:
|
|
40
44
|
if external_ip:
|
|
41
45
|
# Get external IP address
|
|
@@ -1,51 +1,51 @@
|
|
|
1
1
|
xpk/__init__.py,sha256=7mu-VQDQMyxM5To0KOhuYe4y2TYGsEkfV7hXZmUyih4,561
|
|
2
|
-
xpk/main.py,sha256=
|
|
2
|
+
xpk/main.py,sha256=GrA6HQ4YzIHEspkP2uwDOLRYZOfBmAh8Rv9sIZcUhZg,2504
|
|
3
3
|
xpk/api/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
4
4
|
xpk/api/storage_crd.yaml,sha256=r4WFXnSJJ25EUF-t4Ljfbl-cJoSaiFiZkP8451eTub4,1260
|
|
5
5
|
xpk/commands/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
6
|
-
xpk/commands/batch.py,sha256=
|
|
7
|
-
xpk/commands/cluster.py,sha256=
|
|
6
|
+
xpk/commands/batch.py,sha256=g24nbVV-ruiFgFottJ-o4RG6CZeK1vkRJ1dOIoeLr9k,3905
|
|
7
|
+
xpk/commands/cluster.py,sha256=PzXQSgPzctA4EfeuFr8b5oyxzp03TZT3OxixNqUwXlw,40407
|
|
8
8
|
xpk/commands/cluster_gcluster.py,sha256=8jJ7nHBbkmaPtsVQ2m_GnLxkS5iNV5sSN61KL0K_uEY,10861
|
|
9
|
-
xpk/commands/common.py,sha256=
|
|
9
|
+
xpk/commands/common.py,sha256=xDpg8Y-0dtGNyx7NIhu-k4fjYSlW1X80wl_WL5fumrs,2504
|
|
10
10
|
xpk/commands/config.py,sha256=gFNkf3ibsvZmcPpkpKXe-KJmHO5IKucNwLCXNgKvaDc,836
|
|
11
11
|
xpk/commands/info.py,sha256=1orA0u5KCB6fj-smHkuFL1WCH96NGrEiDpRCgPrxUW4,7304
|
|
12
|
-
xpk/commands/inspector.py,sha256=
|
|
13
|
-
xpk/commands/job.py,sha256=
|
|
12
|
+
xpk/commands/inspector.py,sha256=q12w8V1JRd9HuBwtpi3KvTFHbCrBR0j-59CAsPREG5I,12092
|
|
13
|
+
xpk/commands/job.py,sha256=lfc7rrw-YEA4Wsz5n-mTABkVX050FPWUbwz14G82xGA,6134
|
|
14
14
|
xpk/commands/kind.py,sha256=Vl3RT47kHCR0ORX9dK37HCiYtbmXJUCIAaq-QEbIclU,7578
|
|
15
15
|
xpk/commands/kjob_common.py,sha256=dtT-R0n50byTmu2Qcni0pqKYobUAHNENBN_4pt0l-KE,1952
|
|
16
16
|
xpk/commands/run.py,sha256=5hYMG0DcdHnFWsJ5gmfX09t6ZPVItt7FFoHO_ED0_Dk,3798
|
|
17
17
|
xpk/commands/shell.py,sha256=5-sKcI2Rbk3aCojnBNtipCwgOrbIDnG4f8ah0KIayY8,4182
|
|
18
|
-
xpk/commands/storage.py,sha256=
|
|
18
|
+
xpk/commands/storage.py,sha256=Odrp3JNbVp1Ngr_Pj1ds-V02Qz3HbJiSYm9wUYscI8s,10704
|
|
19
19
|
xpk/commands/version.py,sha256=CU4mb71r66U28krnPAopC6vBpdK-IGclsy5uNaQcgRY,824
|
|
20
|
-
xpk/commands/workload.py,sha256=
|
|
20
|
+
xpk/commands/workload.py,sha256=zIRTwxz_pDUkGMUiBGvrejY-gjnCVpsJW2Ey9cQcb14,26906
|
|
21
21
|
xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
22
22
|
xpk/core/capacity.py,sha256=SQzncJSLuI4LLJ2VcnpxcRlTjiBG3e8nM0_QxG1986w,7367
|
|
23
|
-
xpk/core/cluster.py,sha256=
|
|
24
|
-
xpk/core/cluster_private.py,sha256=
|
|
25
|
-
xpk/core/commands.py,sha256=
|
|
26
|
-
xpk/core/config.py,sha256=
|
|
23
|
+
xpk/core/cluster.py,sha256=D9VGYlB8mZlGWUjgF9ri7P17RK-Q0EIWLVLHvh7f948,29057
|
|
24
|
+
xpk/core/cluster_private.py,sha256=_wzaywvp_uaszg_L0U3u8wYRYQbY0vSlf3a0aycDUxM,6812
|
|
25
|
+
xpk/core/commands.py,sha256=_kpML4DkorN2yB7gcrgpLaa4OSrivkC8tOTdaCh85VU,10805
|
|
26
|
+
xpk/core/config.py,sha256=svK1NWm7M0ttu2jyp7mzhM0iKmA7ATxLQY5sB-7FfS4,3407
|
|
27
27
|
xpk/core/config_test.py,sha256=v1qfyFRzLkYSQ7Wn4nx1N0dBSOFXidLWDfhkeHDZOVM,1847
|
|
28
28
|
xpk/core/docker_container.py,sha256=GvkCJ2S5UKn8uh3pZhRd3X7iS0-PsQpRO8l7QhywVGc,7604
|
|
29
|
-
xpk/core/docker_image.py,sha256=
|
|
29
|
+
xpk/core/docker_image.py,sha256=r7gXOqQX7wd8oHklYjPvhOi8Dbaz0NIfv4NbVfhbGkg,6805
|
|
30
30
|
xpk/core/docker_manager.py,sha256=JBFgyD6O7LKwEHJC7YuSoCDZqrFRtb-LjgWNqkfAbR0,10566
|
|
31
|
-
xpk/core/docker_resources.py,sha256=
|
|
31
|
+
xpk/core/docker_resources.py,sha256=_aKgpUjyJB2krQ1PkHrotB7K4kByLmPLbuvl_UVvuX8,12843
|
|
32
32
|
xpk/core/filestore.py,sha256=mcuUzsAPARbnrBG4fIGsEoN8NmzjaQ6k0tvIwMtjO9k,8068
|
|
33
33
|
xpk/core/gcloud_context.py,sha256=go0avmBbYx45vk_7W3iwQEphmQUx27oaL6dseyocqLI,5836
|
|
34
34
|
xpk/core/gcluster_manager.py,sha256=JFip2hInFczFP2h5AXa70IPIuTaJ475TG6GxkQjKOI8,6337
|
|
35
35
|
xpk/core/gcsfuse.py,sha256=kg5pgxdTjgiqquuGjev9fXzJPb8oiWPTK6wzCddzheQ,2125
|
|
36
|
-
xpk/core/jobset.py,sha256=
|
|
37
|
-
xpk/core/kjob.py,sha256=
|
|
38
|
-
xpk/core/kueue.py,sha256=
|
|
36
|
+
xpk/core/jobset.py,sha256=T1TPlIm0D5h5PTwnfMGUPZkwVfkd_MpnoKYRqI63ses,4113
|
|
37
|
+
xpk/core/kjob.py,sha256=vuSzPsMIuBpAXJkXTGSgCAz6WzzDl31XkQPQnW3pg1M,14806
|
|
38
|
+
xpk/core/kueue.py,sha256=K2nU-Icqi7-LcDHYIX7m0cNOz3zjuwqFzqMD35k6W7A,15753
|
|
39
39
|
xpk/core/monitoring.py,sha256=v9MvLzNfvJAVby_ehSlPe6PaO0_pf3shkXg5gd-UWm8,4338
|
|
40
40
|
xpk/core/mtc.py,sha256=pO7p3l-EzLFdTE8MdwWV8i0Zu-7epGql_kPoksVofIU,6259
|
|
41
|
-
xpk/core/nap.py,sha256=
|
|
42
|
-
xpk/core/network.py,sha256=
|
|
43
|
-
xpk/core/nodepool.py,sha256=
|
|
41
|
+
xpk/core/nap.py,sha256=H0ZDj68m8B8LHPTP41BLd8Q945eYjxFbh4SCkqf9HFM,12809
|
|
42
|
+
xpk/core/network.py,sha256=mu1J9QIQzv_fWfvnVXJYQPcgw7Od01ok2zEqCJx0dIs,10527
|
|
43
|
+
xpk/core/nodepool.py,sha256=8a5sYPFH7CSkLzTugAQ8JaXPux2RH7bY_6fTPMBMh4E,23132
|
|
44
44
|
xpk/core/nodepool_test.py,sha256=QRpmdyZTPRDE2qCibWeKQgE3Q2WCxXt1Onfv0MK4QZQ,2626
|
|
45
|
-
xpk/core/pathways.py,sha256=
|
|
46
|
-
xpk/core/ray.py,sha256=
|
|
47
|
-
xpk/core/resources.py,sha256=
|
|
48
|
-
xpk/core/scheduling.py,sha256=
|
|
45
|
+
xpk/core/pathways.py,sha256=Q76IaxffhvlTR_Rz6BxSEO0j-ytY5hPEcQaATcfGQiM,10704
|
|
46
|
+
xpk/core/ray.py,sha256=74IMjmwneoMoPu1TpILCV1F64nSu7L5ETmGp86fLgp0,6327
|
|
47
|
+
xpk/core/resources.py,sha256=wFqpcWQj1KwF6b1NRIMqMcrDTQJT00cNHMpFsHlIs_k,8230
|
|
48
|
+
xpk/core/scheduling.py,sha256=dlWMeP_3RDdNKO2qmNxzq-YyQATQ9vKyMMI7xEfEkoM,9242
|
|
49
49
|
xpk/core/storage.py,sha256=NILvVAcLNMLmp4wKx_TEKbMMF5X1oL-FrQV46PT0_ds,16902
|
|
50
50
|
xpk/core/system_characteristics.py,sha256=2mtQlUiufK98XUXo0_f1D4d06FRGdUk_VNkaBg48Fcs,15152
|
|
51
51
|
xpk/core/vertex.py,sha256=pD9UBL62xHomuqdNu7xKccfD2KCbjgohMk3AhX-CXSw,3644
|
|
@@ -66,7 +66,7 @@ xpk/core/workload_decorators/tcpx_decorator_test.py,sha256=iTBS3X_-VwA2oveNDjscd
|
|
|
66
66
|
xpk/core/workload_decorators/tcpxo_decorator.py,sha256=_nLX7tbnxhnS-xv4Jijd1JOP76V4LpNCfW3Np404Cqw,6537
|
|
67
67
|
xpk/parser/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
68
68
|
xpk/parser/batch.py,sha256=mJU-Cp1yTLje59vD-B1IiBcUeD-ZmEsoeB4xhj9cflc,1406
|
|
69
|
-
xpk/parser/cluster.py,sha256=
|
|
69
|
+
xpk/parser/cluster.py,sha256=jt6oPoFtOZjdtDKVLOKSskUZooajan8mgT9sdlbNuow,30445
|
|
70
70
|
xpk/parser/common.py,sha256=N6P6wPuptluNEddh9kpUsaWgxXGADNZLMfKT-P7QkW0,7791
|
|
71
71
|
xpk/parser/config.py,sha256=-XnWx9aFsBW4Uzo_hpOMD2ZQ0bdZLvq1ksv83_5jqSM,1633
|
|
72
72
|
xpk/parser/core.py,sha256=VRJerlS92ufoQbG1mZv7B04DAP4qGkBHa4pRXgcbAs0,4761
|
|
@@ -84,17 +84,18 @@ xpk/templates/__init__.py,sha256=7mu-VQDQMyxM5To0KOhuYe4y2TYGsEkfV7hXZmUyih4,561
|
|
|
84
84
|
xpk/templates/storage.yaml,sha256=AykdyMtDnKZF8Y_0BYxoYP03hEIzEk6iNalXAQHgAls,163
|
|
85
85
|
xpk/utils/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
86
86
|
xpk/utils/console.py,sha256=hRbvtog_VAzuxt5GfwK5GZdd5SWaa7kvWG8zo_qFRQc,1519
|
|
87
|
-
xpk/utils/
|
|
87
|
+
xpk/utils/execution_context.py,sha256=WYxm6NExBIP6iLAWaL5aV858riGJbAHn0Zs6fmKlmzE,784
|
|
88
|
+
xpk/utils/file.py,sha256=hi9v4gfwiB3JHi3tnelPbm_dlTUt47U0wvvWKQqMjiQ,2500
|
|
88
89
|
xpk/utils/gcs_utils.py,sha256=zg-XSTv4G4TFjeT2bNBm2WLdDXPrOZi0rNv_JdppNg4,4113
|
|
89
90
|
xpk/utils/kubectl.py,sha256=WKB9UhpouPN9G4n2ejRi_PgsYLI0R01gzkS1WGU6mJA,1828
|
|
90
|
-
xpk/utils/network.py,sha256=
|
|
91
|
+
xpk/utils/network.py,sha256=dGS5rxIm_zaayDElHNlzalaf09M99by5ckL_lGDl_yQ,4293
|
|
91
92
|
xpk/utils/objects.py,sha256=OwMNxB4TGX21qnJPdZo2YBMPMbQPqOtHMh19QhoRNRY,2498
|
|
92
93
|
xpk/utils/templates.py,sha256=g8zgR1MxyJmTmzM_wnvH30FmcbgQMC47UQwBtLj8B9k,807
|
|
93
94
|
xpk/utils/validation.py,sha256=bSJApIY0Lk48I4EEQP08ZUvolXt_APpYXVGJXFQ_YLA,2711
|
|
94
95
|
xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
|
|
95
|
-
xpk-0.
|
|
96
|
-
xpk-0.
|
|
97
|
-
xpk-0.
|
|
98
|
-
xpk-0.
|
|
99
|
-
xpk-0.
|
|
100
|
-
xpk-0.
|
|
96
|
+
xpk-0.13.0.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
97
|
+
xpk-0.13.0.dist-info/METADATA,sha256=EdMiwFyuULp8iGjAVi9qMtzt57IRDZD9sYk0tww-JKY,71759
|
|
98
|
+
xpk-0.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
99
|
+
xpk-0.13.0.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
|
|
100
|
+
xpk-0.13.0.dist-info/top_level.txt,sha256=aDe4N0jicmuWExx_6w0TxWQJaEuPSs9BnLU-3aF1GLo,4
|
|
101
|
+
xpk-0.13.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|