xpk 0.17.1__py3-none-any.whl → 0.17.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +0 -22
- xpk/commands/cluster_gcluster.py +1 -13
- xpk/commands/cluster_gcluster_test.py +0 -10
- xpk/commands/cluster_test.py +0 -4
- xpk/commands/kind.py +0 -21
- xpk/commands/storage.py +0 -25
- xpk/core/cluster.py +1 -3
- xpk/core/config.py +0 -15
- xpk/core/system_characteristics.py +1 -16
- xpk/core/workload_decorators/rdma_decorator.py +0 -15
- xpk/core/workload_decorators/tcpx_decorator.py +0 -8
- xpk/core/workload_decorators/tcpx_decorator_test.py +0 -78
- xpk/core/workload_decorators/tcpxo_decorator.py +0 -16
- xpk/parser/common.py +0 -151
- xpk/parser/core.py +0 -31
- xpk/utils/validation.py +0 -8
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/METADATA +1 -1
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/RECORD +22 -33
- xpk/commands/batch.py +0 -144
- xpk/commands/job.py +0 -244
- xpk/commands/kjob_common.py +0 -60
- xpk/commands/run.py +0 -140
- xpk/commands/shell.py +0 -142
- xpk/core/kjob.py +0 -473
- xpk/parser/batch.py +0 -43
- xpk/parser/job.py +0 -147
- xpk/parser/run.py +0 -47
- xpk/parser/shell.py +0 -59
- xpk/templates/volume_bundle.yaml +0 -7
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/WHEEL +0 -0
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/entry_points.txt +0 -0
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py
CHANGED
|
@@ -49,7 +49,6 @@ from ..core.gcloud_context import (
|
|
|
49
49
|
zone_to_region,
|
|
50
50
|
)
|
|
51
51
|
from ..core.jobset import update_jobset_resources_if_necessary
|
|
52
|
-
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
|
|
53
52
|
from ..core.kueue_manager import (KueueConfig, KueueManager)
|
|
54
53
|
from ..core.nap import enable_autoprovisioning_on_cluster
|
|
55
54
|
from ..core.network import (
|
|
@@ -98,7 +97,6 @@ def cluster_adapt(args) -> None:
|
|
|
98
97
|
if should_validate_dependencies(args):
|
|
99
98
|
validate_dependencies_list([
|
|
100
99
|
SystemDependency.KUBECTL,
|
|
101
|
-
SystemDependency.KJOB,
|
|
102
100
|
SystemDependency.GCLOUD,
|
|
103
101
|
])
|
|
104
102
|
args.enable_pathways = False
|
|
@@ -188,7 +186,6 @@ def cluster_adapt(args) -> None:
|
|
|
188
186
|
if install_kueue_code != 0:
|
|
189
187
|
xpk_exit(install_kueue_code)
|
|
190
188
|
|
|
191
|
-
install_kjob(args)
|
|
192
189
|
if system.accelerator_type == AcceleratorType.GPU:
|
|
193
190
|
prepare_gpus(system)
|
|
194
191
|
|
|
@@ -308,7 +305,6 @@ def cluster_create(args) -> None:
|
|
|
308
305
|
if should_validate_dependencies(args):
|
|
309
306
|
validate_dependencies_list([
|
|
310
307
|
SystemDependency.KUBECTL,
|
|
311
|
-
SystemDependency.KJOB,
|
|
312
308
|
SystemDependency.GCLOUD,
|
|
313
309
|
])
|
|
314
310
|
|
|
@@ -455,8 +451,6 @@ def cluster_create(args) -> None:
|
|
|
455
451
|
if install_kueue_code != 0:
|
|
456
452
|
xpk_exit(install_kueue_code)
|
|
457
453
|
|
|
458
|
-
install_kjob(args)
|
|
459
|
-
|
|
460
454
|
if system.accelerator_type == AcceleratorType.GPU:
|
|
461
455
|
prepare_gpus(system)
|
|
462
456
|
|
|
@@ -1343,22 +1337,6 @@ def install_storage_csis(args):
|
|
|
1343
1337
|
xpk_exit(update_cluster_command_code)
|
|
1344
1338
|
|
|
1345
1339
|
|
|
1346
|
-
def install_kjob(args):
|
|
1347
|
-
xpk_print('Verifying kjob installation')
|
|
1348
|
-
err_code = verify_kjob_installed()
|
|
1349
|
-
if err_code > 0:
|
|
1350
|
-
xpk_exit(err_code)
|
|
1351
|
-
|
|
1352
|
-
xpk_print('Applying kjob CDRs')
|
|
1353
|
-
err_code = apply_kjob_crds()
|
|
1354
|
-
if err_code > 0:
|
|
1355
|
-
xpk_exit(err_code)
|
|
1356
|
-
|
|
1357
|
-
err_code = prepare_kjob(args)
|
|
1358
|
-
if err_code > 0:
|
|
1359
|
-
xpk_exit(err_code)
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
1340
|
def _install_kueue(
|
|
1363
1341
|
args,
|
|
1364
1342
|
system: SystemCharacteristics,
|
xpk/commands/cluster_gcluster.py
CHANGED
|
@@ -38,7 +38,6 @@ from ..core.commands import run_command_for_value
|
|
|
38
38
|
from ..core.docker_manager import DockerManager
|
|
39
39
|
from ..core.gcloud_context import zone_to_region
|
|
40
40
|
from ..core.gcluster_manager import GclusterManager
|
|
41
|
-
from ..core.kjob import apply_kjob_crds, prepare_kjob
|
|
42
41
|
from ..core.remote_state.fuse_remote_state import FuseStateClient
|
|
43
42
|
from ..core.remote_state.remote_state_client import RemoteStateClient
|
|
44
43
|
from ..utils.console import xpk_exit, xpk_print
|
|
@@ -112,18 +111,7 @@ def cluster_create(
|
|
|
112
111
|
get_cluster_credentials(args)
|
|
113
112
|
|
|
114
113
|
err_code = __install_kueue(args)
|
|
115
|
-
|
|
116
|
-
xpk_exit(err_code)
|
|
117
|
-
|
|
118
|
-
err_code = apply_kjob_crds()
|
|
119
|
-
if err_code > 0:
|
|
120
|
-
xpk_exit(err_code)
|
|
121
|
-
|
|
122
|
-
err_code = prepare_kjob(args)
|
|
123
|
-
if err_code > 0:
|
|
124
|
-
xpk_exit(err_code)
|
|
125
|
-
|
|
126
|
-
xpk_exit(0)
|
|
114
|
+
xpk_exit(err_code)
|
|
127
115
|
|
|
128
116
|
|
|
129
117
|
def __install_kueue(args) -> int:
|
|
xpk/commands/cluster_gcluster_test.py
CHANGED
|
@@ -46,8 +46,6 @@ def mock_cluster_create_deps(request):
|
|
|
46
46
|
"""Mocks dependencies for cluster_create."""
|
|
47
47
|
with (
|
|
48
48
|
patch("xpk.commands.cluster_gcluster.xpk_exit") as mock_exit,
|
|
49
|
-
patch("xpk.commands.cluster_gcluster.prepare_kjob") as mock_prep_kjob,
|
|
50
|
-
patch("xpk.commands.cluster_gcluster.apply_kjob_crds") as mock_apply_kjob,
|
|
51
49
|
patch(
|
|
52
50
|
"xpk.commands.cluster_gcluster.get_cluster_credentials"
|
|
53
51
|
) as mock_get_creds,
|
|
@@ -68,8 +66,6 @@ def mock_cluster_create_deps(request):
|
|
|
68
66
|
):
|
|
69
67
|
yield {
|
|
70
68
|
"xpk_exit": mock_exit,
|
|
71
|
-
"prepare_kjob": mock_prep_kjob,
|
|
72
|
-
"apply_kjob_crds": mock_apply_kjob,
|
|
73
69
|
"get_cluster_credentials": mock_get_creds,
|
|
74
70
|
"generate_blueprint": mock_gen_bp,
|
|
75
71
|
"prepare_gcluster_manager": mock_prep_gcm,
|
|
@@ -85,9 +81,6 @@ def test_install_kueue_standard(
|
|
|
85
81
|
mock_get_total_chips, mock_args, mock_cluster_create_deps
|
|
86
82
|
):
|
|
87
83
|
"""Tests __install_kueue for a standard installation."""
|
|
88
|
-
mock_cluster_create_deps["prepare_kjob"].return_value = 0
|
|
89
|
-
mock_cluster_create_deps["apply_kjob_crds"].return_value = 0
|
|
90
|
-
|
|
91
84
|
mock_system = SystemCharacteristics(
|
|
92
85
|
topology="N/A",
|
|
93
86
|
vms_per_slice=1,
|
|
@@ -138,9 +131,6 @@ def test_install_kueue_with_autoprovisioning(
|
|
|
138
131
|
mock_enable_autoprovisioning, mock_args, mock_cluster_create_deps
|
|
139
132
|
):
|
|
140
133
|
"""Tests __install_kueue with autoprovisioning enabled."""
|
|
141
|
-
mock_cluster_create_deps["prepare_kjob"].return_value = 0
|
|
142
|
-
mock_cluster_create_deps["apply_kjob_crds"].return_value = 0
|
|
143
|
-
|
|
144
134
|
mock_args.enable_autoprovisioning = True
|
|
145
135
|
mock_system = SystemCharacteristics(
|
|
146
136
|
topology="N/A",
|
xpk/commands/cluster_test.py
CHANGED
|
@@ -56,7 +56,6 @@ class _ClusterCreateMocks:
|
|
|
56
56
|
create_cluster_configmaps: MagicMock
|
|
57
57
|
set_jobset_on_cluster: MagicMock
|
|
58
58
|
get_cluster_location: MagicMock
|
|
59
|
-
install_kjob: MagicMock
|
|
60
59
|
xpk_exit: MagicMock
|
|
61
60
|
update_jobset_resources_if_necessary: MagicMock
|
|
62
61
|
_install_kueue: MagicMock
|
|
@@ -204,9 +203,6 @@ def cluster_create_mocks(mocker) -> _ClusterCreateMocks:
|
|
|
204
203
|
'xpk.commands.cluster.get_cluster_location',
|
|
205
204
|
return_value='us-central1',
|
|
206
205
|
),
|
|
207
|
-
install_kjob=mocker.patch(
|
|
208
|
-
'xpk.commands.cluster.install_kjob', return_value=0
|
|
209
|
-
),
|
|
210
206
|
xpk_exit=mocker.patch('xpk.commands.cluster.xpk_exit'),
|
|
211
207
|
update_jobset_resources_if_necessary=mocker.patch(
|
|
212
208
|
'xpk.commands.cluster.update_jobset_resources_if_necessary',
|
xpk/commands/kind.py
CHANGED
|
@@ -20,11 +20,6 @@ from ..core.commands import (
|
|
|
20
20
|
run_command_with_updates,
|
|
21
21
|
)
|
|
22
22
|
from ..core.cluster import set_jobset_on_cluster, setup_k8s_env
|
|
23
|
-
from ..core.kjob import (
|
|
24
|
-
verify_kjob_installed,
|
|
25
|
-
prepare_kjob,
|
|
26
|
-
apply_kjob_crds,
|
|
27
|
-
)
|
|
28
23
|
from ..core.scheduling import get_total_chips_requested_from_args
|
|
29
24
|
from ..core.storage import install_storage_crd
|
|
30
25
|
from ..core.system_characteristics import (
|
|
@@ -48,7 +43,6 @@ def cluster_create(args) -> None:
|
|
|
48
43
|
if should_validate_dependencies(args):
|
|
49
44
|
validate_dependencies_list([
|
|
50
45
|
SystemDependency.KUBECTL,
|
|
51
|
-
SystemDependency.KJOB,
|
|
52
46
|
SystemDependency.GCLOUD,
|
|
53
47
|
])
|
|
54
48
|
xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
|
|
@@ -69,21 +63,6 @@ def cluster_create(args) -> None:
|
|
|
69
63
|
if set_jobset_on_cluster_code != 0:
|
|
70
64
|
xpk_exit(set_jobset_on_cluster_code)
|
|
71
65
|
|
|
72
|
-
xpk_print('Verifying kjob installation')
|
|
73
|
-
err_code = verify_kjob_installed()
|
|
74
|
-
if err_code > 0:
|
|
75
|
-
xpk_exit(err_code)
|
|
76
|
-
|
|
77
|
-
xpk_print('Applying kjob CDRs')
|
|
78
|
-
err_code = apply_kjob_crds()
|
|
79
|
-
if err_code > 0:
|
|
80
|
-
xpk_exit(err_code)
|
|
81
|
-
|
|
82
|
-
args.kind_cluster = True
|
|
83
|
-
err_code = prepare_kjob(args)
|
|
84
|
-
if err_code > 0:
|
|
85
|
-
xpk_exit(err_code)
|
|
86
|
-
|
|
87
66
|
k8s_client = setup_k8s_env(args)
|
|
88
67
|
install_storage_crd(k8s_client)
|
|
89
68
|
|
xpk/commands/storage.py
CHANGED
|
@@ -23,7 +23,6 @@ from kubernetes.client.rest import ApiException
|
|
|
23
23
|
|
|
24
24
|
from ..core import gcsfuse
|
|
25
25
|
from ..core.cluster import (
|
|
26
|
-
DEFAULT_NAMESPACE,
|
|
27
26
|
add_zone_and_project,
|
|
28
27
|
get_cluster_network,
|
|
29
28
|
setup_k8s_env,
|
|
@@ -35,12 +34,6 @@ from ..core.cluster import (
|
|
|
35
34
|
update_cluster_with_workload_identity_if_necessary,
|
|
36
35
|
)
|
|
37
36
|
from ..core.filestore import FilestoreClient, get_storage_class_name
|
|
38
|
-
from ..core.kjob import (
|
|
39
|
-
KJOB_API_GROUP_NAME,
|
|
40
|
-
KJOB_API_GROUP_VERSION,
|
|
41
|
-
KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
42
|
-
create_volume_bundle_instance,
|
|
43
|
-
)
|
|
44
37
|
from ..core.storage import (
|
|
45
38
|
GCP_FILESTORE_TYPE,
|
|
46
39
|
GCS_FUSE_TYPE,
|
|
@@ -98,9 +91,6 @@ def storage_create(args: Namespace) -> None:
|
|
|
98
91
|
|
|
99
92
|
k8s_api_client = setup_k8s_env(args)
|
|
100
93
|
create_storage_crds(k8s_api_client, args, manifest)
|
|
101
|
-
create_volume_bundle_instance(
|
|
102
|
-
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
103
|
-
)
|
|
104
94
|
# Not required for Filestore. Will be uncommented when adding GCSFuse create
|
|
105
95
|
# return_code = update_cluster_with_workload_identity_if_necessary(args)
|
|
106
96
|
# if return_code > 0:
|
|
@@ -214,9 +204,6 @@ def storage_attach(args: Namespace) -> None:
|
|
|
214
204
|
|
|
215
205
|
k8s_api_client = setup_k8s_env(args)
|
|
216
206
|
create_storage_crds(k8s_api_client, args, manifest)
|
|
217
|
-
create_volume_bundle_instance(
|
|
218
|
-
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
219
|
-
)
|
|
220
207
|
|
|
221
208
|
enable_csi_drivers_if_necessary(args)
|
|
222
209
|
|
|
@@ -332,18 +319,6 @@ def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
|
|
|
332
319
|
"Storage Class",
|
|
333
320
|
)
|
|
334
321
|
|
|
335
|
-
delete_resource(
|
|
336
|
-
lambda name: api_instance.delete_namespaced_custom_object(
|
|
337
|
-
namespace=DEFAULT_NAMESPACE,
|
|
338
|
-
name=name,
|
|
339
|
-
group=KJOB_API_GROUP_NAME,
|
|
340
|
-
version=KJOB_API_GROUP_VERSION,
|
|
341
|
-
plural=KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
342
|
-
),
|
|
343
|
-
storage.name,
|
|
344
|
-
"VolumeBundle",
|
|
345
|
-
)
|
|
346
|
-
|
|
347
322
|
delete_resource(
|
|
348
323
|
lambda name: api_instance.delete_cluster_custom_object(
|
|
349
324
|
name=name,
|
xpk/core/cluster.py
CHANGED
|
@@ -717,10 +717,8 @@ def get_cluster_credentials(args) -> int:
|
|
|
717
717
|
location=location,
|
|
718
718
|
dns_endpoint=True,
|
|
719
719
|
)
|
|
720
|
-
if return_code != 0:
|
|
721
|
-
return return_code
|
|
722
720
|
|
|
723
|
-
if not _are_credentials_valid():
|
|
721
|
+
if return_code != 0 or not _are_credentials_valid():
|
|
724
722
|
xpk_print('Detected error. Retrying without --dns-endpoint flag...')
|
|
725
723
|
return_code = _get_credentials(
|
|
726
724
|
project=args.project,
|
xpk/core/config.py
CHANGED
|
@@ -53,14 +53,6 @@ PROJECT_KEY = 'project-id'
|
|
|
53
53
|
CLIENT_ID_KEY = 'client-id'
|
|
54
54
|
SEND_TELEMETRY_KEY = 'send-telemetry'
|
|
55
55
|
ZONE_KEY = 'zone'
|
|
56
|
-
KJOB_BATCH_IMAGE = 'batch-image'
|
|
57
|
-
KJOB_BATCH_WORKING_DIRECTORY = 'batch-working-directory'
|
|
58
|
-
KJOB_SHELL_IMAGE = 'shell-image'
|
|
59
|
-
KJOB_SHELL_INTERACTIVE_COMMAND = 'shell-interactive-command'
|
|
60
|
-
KJOB_SHELL_WORKING_DIRECTORY = 'shell-working-directory'
|
|
61
|
-
CONFIGS_KEY = 'configs'
|
|
62
|
-
GKE_ENDPOINT_KEY = 'gke-endpoint'
|
|
63
|
-
DEPENDENCIES_KEY = 'deps-verified-version'
|
|
64
56
|
|
|
65
57
|
DEFAULT_KEYS = [
|
|
66
58
|
CFG_BUCKET_KEY,
|
|
@@ -69,13 +61,6 @@ DEFAULT_KEYS = [
|
|
|
69
61
|
CLIENT_ID_KEY,
|
|
70
62
|
SEND_TELEMETRY_KEY,
|
|
71
63
|
ZONE_KEY,
|
|
72
|
-
GKE_ENDPOINT_KEY,
|
|
73
|
-
DEPENDENCIES_KEY,
|
|
74
|
-
KJOB_BATCH_IMAGE,
|
|
75
|
-
KJOB_BATCH_WORKING_DIRECTORY,
|
|
76
|
-
KJOB_SHELL_IMAGE,
|
|
77
|
-
KJOB_SHELL_INTERACTIVE_COMMAND,
|
|
78
|
-
KJOB_SHELL_WORKING_DIRECTORY,
|
|
79
64
|
]
|
|
80
65
|
VERTEX_TENSORBOARD_FEATURE_FLAG = XPK_CURRENT_VERSION >= '0.4.0'
|
|
81
66
|
|
|
xpk/core/system_characteristics.py
CHANGED
|
@@ -80,15 +80,6 @@ class GpuConfig:
|
|
|
80
80
|
|
|
81
81
|
requires_topology: bool
|
|
82
82
|
gpu_direct_name: Literal['fastrak', 'rdma', 'tcpx', 'tcpxo'] = 'fastrak'
|
|
83
|
-
kjob_decorator_fn: Optional[Callable[[dict], dict]] = None
|
|
84
|
-
"""A function to decorate the kjob template for GPU-specific configurations.
|
|
85
|
-
|
|
86
|
-
Args:
|
|
87
|
-
job_manifest (dict): The kjob manifest as a dictionary.
|
|
88
|
-
|
|
89
|
-
Returns:
|
|
90
|
-
dict: The modified kjob manifest as a dictionary.
|
|
91
|
-
"""
|
|
92
83
|
nccl_installer: Optional[str] = None
|
|
93
84
|
jobset_decorator_fn: Optional[Callable[[str, list[str]], str]] = None
|
|
94
85
|
"""A function to decorate the jobset for GPU-specific configurations.
|
|
@@ -106,7 +97,7 @@ class GpuConfig:
|
|
|
106
97
|
parts = []
|
|
107
98
|
for f in dataclasses.fields(self):
|
|
108
99
|
value = getattr(self, f.name)
|
|
109
|
-
if f.name in ('kjob_decorator_fn', 'jobset_decorator_fn') and value:
|
|
100
|
+
if f.name in ('jobset_decorator_fn') and value:
|
|
110
101
|
parts.append(f'{f.name}=<function {value.__name__}>')
|
|
111
102
|
else:
|
|
112
103
|
parts.append(f'{f.name}={repr(value)}')
|
|
@@ -420,7 +411,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
420
411
|
gpu_config=GpuConfig(
|
|
421
412
|
requires_topology=True,
|
|
422
413
|
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
423
|
-
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
424
414
|
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
425
415
|
gpu_direct_name='rdma',
|
|
426
416
|
),
|
|
@@ -439,7 +429,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
439
429
|
gpu_config=GpuConfig(
|
|
440
430
|
requires_topology=True,
|
|
441
431
|
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
442
|
-
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
443
432
|
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
444
433
|
gpu_direct_name='rdma',
|
|
445
434
|
),
|
|
@@ -458,7 +447,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
458
447
|
gpu_config=GpuConfig(
|
|
459
448
|
requires_topology=True,
|
|
460
449
|
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
461
|
-
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
462
450
|
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
463
451
|
gpu_direct_name='rdma',
|
|
464
452
|
),
|
|
@@ -477,7 +465,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
477
465
|
gpu_config=GpuConfig(
|
|
478
466
|
requires_topology=True,
|
|
479
467
|
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
480
|
-
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
481
468
|
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
482
469
|
gpu_direct_name='rdma',
|
|
483
470
|
),
|
|
@@ -497,7 +484,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
497
484
|
gpu_config=GpuConfig(
|
|
498
485
|
requires_topology=True,
|
|
499
486
|
nccl_installer=INSTALLER_NCCL_TCPX,
|
|
500
|
-
kjob_decorator_fn=tcpx_decorator.decorate_kjob_template,
|
|
501
487
|
jobset_decorator_fn=tcpx_decorator.decorate_jobset,
|
|
502
488
|
gpu_direct_name='tcpx',
|
|
503
489
|
),
|
|
@@ -517,7 +503,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
517
503
|
gpu_config=GpuConfig(
|
|
518
504
|
requires_topology=True,
|
|
519
505
|
nccl_installer=INSTALLER_NCCL_TCPXO,
|
|
520
|
-
kjob_decorator_fn=tcpxo_decorator.decorate_kjob_template,
|
|
521
506
|
jobset_decorator_fn=tcpxo_decorator.decorate_jobset,
|
|
522
507
|
gpu_direct_name='tcpxo',
|
|
523
508
|
),
|
|
xpk/core/workload_decorators/rdma_decorator.py
CHANGED
|
@@ -18,21 +18,6 @@ import yaml
|
|
|
18
18
|
from ...utils.yaml import literal_string
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
def decorate_kjob_template(job_manifest: dict) -> dict:
|
|
22
|
-
spec = (
|
|
23
|
-
job_manifest.setdefault('spec', {})
|
|
24
|
-
.setdefault('template', {})
|
|
25
|
-
.setdefault('spec', {})
|
|
26
|
-
)
|
|
27
|
-
spec.setdefault('tolerations', [])
|
|
28
|
-
spec.setdefault('volumes', [])
|
|
29
|
-
|
|
30
|
-
add_volumes(job_manifest)
|
|
31
|
-
add_tolerations(job_manifest)
|
|
32
|
-
update_gpu_containers(job_manifest)
|
|
33
|
-
return job_manifest
|
|
34
|
-
|
|
35
|
-
|
|
36
21
|
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
37
22
|
"""
|
|
38
23
|
Decorates a JobSet manifest with the necessary components for rdma-daemon.
|
|
xpk/core/workload_decorators/tcpx_decorator.py
CHANGED
|
@@ -22,14 +22,6 @@ from ...utils.yaml import literal_string
|
|
|
22
22
|
tcpx = 'v2.0.11'
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
def decorate_kjob_template(job_manifest: dict) -> dict:
|
|
26
|
-
add_volumes(job_manifest)
|
|
27
|
-
add_tolerations(job_manifest)
|
|
28
|
-
add_tcpx_daemon_container(job_manifest)
|
|
29
|
-
update_gpu_containers(job_manifest)
|
|
30
|
-
return job_manifest
|
|
31
|
-
|
|
32
|
-
|
|
33
25
|
def decorate_job(job_manifest: dict) -> dict:
|
|
34
26
|
add_annotations(job_manifest)
|
|
35
27
|
add_volumes(job_manifest)
|
|
xpk/core/workload_decorators/tcpx_decorator_test.py
CHANGED
|
@@ -47,24 +47,6 @@ spec:
|
|
|
47
47
|
image: my-sidecar-image
|
|
48
48
|
"""
|
|
49
49
|
|
|
50
|
-
# Minimal kjob template for testing
|
|
51
|
-
BASE_KJOB_TEMPLATE = {
|
|
52
|
-
"spec": {
|
|
53
|
-
"template": {
|
|
54
|
-
"spec": {
|
|
55
|
-
"containers": [
|
|
56
|
-
{
|
|
57
|
-
"name": "main-gpu-container",
|
|
58
|
-
"image": "my-gpu-image",
|
|
59
|
-
"resources": {"limits": {"nvidia.com/gpu": 8}},
|
|
60
|
-
},
|
|
61
|
-
{"name": "sidecar-container", "image": "my-sidecar-image"},
|
|
62
|
-
]
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
|
|
68
50
|
# Minimal job manifest for testing
|
|
69
51
|
BASE_JOB_MANIFEST = {
|
|
70
52
|
"spec": {
|
|
@@ -205,63 +187,3 @@ def test_decorate_job():
|
|
|
205
187
|
assert "devices.gke.io/container.tcpx-daemon" in annotations
|
|
206
188
|
assert "networking.gke.io/default-interface" in annotations
|
|
207
189
|
assert "networking.gke.io/interfaces" in annotations
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
def test_decorate_kjob_template():
|
|
211
|
-
"""Tests decorate_kjob_template."""
|
|
212
|
-
kjob_template = copy.deepcopy(BASE_KJOB_TEMPLATE)
|
|
213
|
-
|
|
214
|
-
decorated_manifest = tcpx_decorator.decorate_kjob_template(kjob_template)
|
|
215
|
-
|
|
216
|
-
pod_template_spec = decorated_manifest["spec"]["template"]["spec"]
|
|
217
|
-
|
|
218
|
-
# Check annotations are NOT added
|
|
219
|
-
assert "annotations" not in decorated_manifest["spec"]["template"].get(
|
|
220
|
-
"metadata", {}
|
|
221
|
-
)
|
|
222
|
-
|
|
223
|
-
# Check tolerations
|
|
224
|
-
tolerations = pod_template_spec["tolerations"]
|
|
225
|
-
assert {
|
|
226
|
-
"key": "user-workload",
|
|
227
|
-
"operator": "Equal",
|
|
228
|
-
"value": "true",
|
|
229
|
-
"effect": "NoSchedule",
|
|
230
|
-
} in tolerations
|
|
231
|
-
|
|
232
|
-
# Check volumes
|
|
233
|
-
volumes = pod_template_spec["volumes"]
|
|
234
|
-
volume_names = {v["name"] for v in volumes}
|
|
235
|
-
assert "libraries" in volume_names
|
|
236
|
-
assert "sys" in volume_names
|
|
237
|
-
assert "proc-sys" in volume_names
|
|
238
|
-
assert "tcpx-socket" in volume_names
|
|
239
|
-
assert "dshm" in volume_names
|
|
240
|
-
|
|
241
|
-
# Check init container
|
|
242
|
-
init_containers = pod_template_spec["initContainers"]
|
|
243
|
-
assert len(init_containers) == 1
|
|
244
|
-
tcpx_daemon = init_containers[0]
|
|
245
|
-
assert tcpx_daemon["name"] == "tcpx-daemon"
|
|
246
|
-
assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
|
|
247
|
-
|
|
248
|
-
# Check GPU container update
|
|
249
|
-
gpu_container = pod_template_spec["containers"][0]
|
|
250
|
-
assert gpu_container["name"] == "main-gpu-container"
|
|
251
|
-
|
|
252
|
-
# Check env
|
|
253
|
-
env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
|
|
254
|
-
assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
|
|
255
|
-
|
|
256
|
-
# Check volume mounts
|
|
257
|
-
volume_mounts = {
|
|
258
|
-
vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
|
|
259
|
-
}
|
|
260
|
-
assert volume_mounts["tcpx-socket"] == "/tmp"
|
|
261
|
-
assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
|
|
262
|
-
assert volume_mounts["dshm"] == "/dev/shm"
|
|
263
|
-
|
|
264
|
-
# Check non-GPU container is not updated
|
|
265
|
-
sidecar_container = pod_template_spec["containers"][1]
|
|
266
|
-
assert "env" not in sidecar_container
|
|
267
|
-
assert "volumeMounts" not in sidecar_container
|
|
xpk/core/workload_decorators/tcpxo_decorator.py
CHANGED
|
@@ -22,22 +22,6 @@ from ...utils.yaml import literal_string
|
|
|
22
22
|
rxdm = 'v1.0.12'
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
def decorate_kjob_template(job_manifest: dict) -> dict:
|
|
26
|
-
spec = (
|
|
27
|
-
job_manifest.setdefault('spec', {})
|
|
28
|
-
.setdefault('template', {})
|
|
29
|
-
.setdefault('spec', {})
|
|
30
|
-
)
|
|
31
|
-
spec.setdefault('tolerations', [])
|
|
32
|
-
spec.setdefault('volumes', [])
|
|
33
|
-
|
|
34
|
-
add_volumes(job_manifest)
|
|
35
|
-
add_tolerations(job_manifest)
|
|
36
|
-
add_tcpxo_daemon_container(job_manifest)
|
|
37
|
-
update_gpu_containers(job_manifest)
|
|
38
|
-
return job_manifest
|
|
39
|
-
|
|
40
|
-
|
|
41
25
|
def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
|
|
42
26
|
job_manifest.setdefault('spec', {}).setdefault('template', {}).setdefault(
|
|
43
27
|
'metadata', {}
|
xpk/parser/common.py
CHANGED
|
@@ -180,157 +180,6 @@ def add_global_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
|
180
180
|
)
|
|
181
181
|
|
|
182
182
|
|
|
183
|
-
def add_slurm_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
184
|
-
"""Add Slurm job arguments to the parser.
|
|
185
|
-
|
|
186
|
-
Args:
|
|
187
|
-
custom_parser_or_group: parser or argument group to add global arguments to.
|
|
188
|
-
"""
|
|
189
|
-
custom_parser_or_group.add_argument(
|
|
190
|
-
'--ignore-unknown-flags',
|
|
191
|
-
type=bool,
|
|
192
|
-
action=argparse.BooleanOptionalAction,
|
|
193
|
-
default=False,
|
|
194
|
-
help='Ignore all the unsupported flags in the bash script.',
|
|
195
|
-
)
|
|
196
|
-
custom_parser_or_group.add_argument(
|
|
197
|
-
'-a',
|
|
198
|
-
'--array',
|
|
199
|
-
type=str,
|
|
200
|
-
default=None,
|
|
201
|
-
help=(
|
|
202
|
-
'Submit a job array, multiple jobs to be executed with identical'
|
|
203
|
-
' parameters. The indexes specification identifies what array index'
|
|
204
|
-
' values should be used. For example, "--array=0-15" or'
|
|
205
|
-
' "--array=0,6,16-32". Multiple values may be specified using a comma'
|
|
206
|
-
' separated list and/or a range of values with a "-" separator. For'
|
|
207
|
-
' example "--array=0-15%%4" will limit the number of simultaneously'
|
|
208
|
-
' running tasks from this job array to 4. The minimum index value is'
|
|
209
|
-
' 0. The maximum index value is 2147483647.'
|
|
210
|
-
),
|
|
211
|
-
)
|
|
212
|
-
custom_parser_or_group.add_argument(
|
|
213
|
-
'-c',
|
|
214
|
-
'--cpus-per-task',
|
|
215
|
-
type=str,
|
|
216
|
-
default=None,
|
|
217
|
-
help='How much cpus a container inside a pod requires.',
|
|
218
|
-
)
|
|
219
|
-
custom_parser_or_group.add_argument(
|
|
220
|
-
'--gpus-per-task',
|
|
221
|
-
type=str,
|
|
222
|
-
default=None,
|
|
223
|
-
help='How much gpus a container inside a pod requires.',
|
|
224
|
-
)
|
|
225
|
-
custom_parser_or_group.add_argument(
|
|
226
|
-
'--mem',
|
|
227
|
-
type=str,
|
|
228
|
-
default=None,
|
|
229
|
-
help='How much memory a pod requires.',
|
|
230
|
-
)
|
|
231
|
-
custom_parser_or_group.add_argument(
|
|
232
|
-
'--mem-per-task',
|
|
233
|
-
type=str,
|
|
234
|
-
default=None,
|
|
235
|
-
help='How much memory a container requires.',
|
|
236
|
-
)
|
|
237
|
-
custom_parser_or_group.add_argument(
|
|
238
|
-
'--mem-per-cpu',
|
|
239
|
-
type=str,
|
|
240
|
-
default=None,
|
|
241
|
-
help=(
|
|
242
|
-
'How much memory a container requires, it multiplies the number '
|
|
243
|
-
'of requested cpus per task by mem-per-cpu.'
|
|
244
|
-
),
|
|
245
|
-
)
|
|
246
|
-
custom_parser_or_group.add_argument(
|
|
247
|
-
'--mem-per-gpu',
|
|
248
|
-
type=str,
|
|
249
|
-
default=None,
|
|
250
|
-
help=(
|
|
251
|
-
'How much memory a container requires, it multiplies the number '
|
|
252
|
-
'of requested gpus per task by mem-per-gpu.'
|
|
253
|
-
),
|
|
254
|
-
)
|
|
255
|
-
custom_parser_or_group.add_argument(
|
|
256
|
-
'-N',
|
|
257
|
-
'--nodes',
|
|
258
|
-
type=int,
|
|
259
|
-
default=None,
|
|
260
|
-
help='Number of pods to be used at a time.',
|
|
261
|
-
)
|
|
262
|
-
custom_parser_or_group.add_argument(
|
|
263
|
-
'-n',
|
|
264
|
-
'--ntasks',
|
|
265
|
-
type=int,
|
|
266
|
-
default=None,
|
|
267
|
-
help='Number of identical containers inside of a pod, usually 1.',
|
|
268
|
-
)
|
|
269
|
-
custom_parser_or_group.add_argument(
|
|
270
|
-
'-o',
|
|
271
|
-
'--output',
|
|
272
|
-
type=str,
|
|
273
|
-
default=None,
|
|
274
|
-
help=(
|
|
275
|
-
'Where to redirect the standard output stream of a task. If not'
|
|
276
|
-
' passed it proceeds to stdout, and is available via kubectl logs.'
|
|
277
|
-
),
|
|
278
|
-
)
|
|
279
|
-
custom_parser_or_group.add_argument(
|
|
280
|
-
'-e',
|
|
281
|
-
'--error',
|
|
282
|
-
type=str,
|
|
283
|
-
default=None,
|
|
284
|
-
help=(
|
|
285
|
-
'Where to redirect std error stream of a task. If not passed it'
|
|
286
|
-
' proceeds to stdout, and is available via kubectl logs.'
|
|
287
|
-
),
|
|
288
|
-
)
|
|
289
|
-
custom_parser_or_group.add_argument(
|
|
290
|
-
'--input',
|
|
291
|
-
type=str,
|
|
292
|
-
default=None,
|
|
293
|
-
help='What to pipe into the script.',
|
|
294
|
-
)
|
|
295
|
-
custom_parser_or_group.add_argument(
|
|
296
|
-
'-J',
|
|
297
|
-
'--job-name',
|
|
298
|
-
type=str,
|
|
299
|
-
default=None,
|
|
300
|
-
help='What is the job name.',
|
|
301
|
-
)
|
|
302
|
-
custom_parser_or_group.add_argument(
|
|
303
|
-
'-D',
|
|
304
|
-
'--chdir',
|
|
305
|
-
type=str,
|
|
306
|
-
default=None,
|
|
307
|
-
help='Change directory before executing the script.',
|
|
308
|
-
)
|
|
309
|
-
custom_parser_or_group.add_argument(
|
|
310
|
-
'-t',
|
|
311
|
-
'--time',
|
|
312
|
-
type=str,
|
|
313
|
-
default=None,
|
|
314
|
-
help=(
|
|
315
|
-
'Set a limit on the total run time of the job. '
|
|
316
|
-
'A time limit of zero requests that no time limit be imposed. '
|
|
317
|
-
'Acceptable time formats include "minutes", "minutes:seconds", '
|
|
318
|
-
'"hours:minutes:seconds", "days-hours", "days-hours:minutes" '
|
|
319
|
-
'and "days-hours:minutes:seconds".'
|
|
320
|
-
),
|
|
321
|
-
)
|
|
322
|
-
custom_parser_or_group.add_argument(
|
|
323
|
-
'--priority',
|
|
324
|
-
type=str,
|
|
325
|
-
default='medium',
|
|
326
|
-
choices=['very-low', 'low', 'medium', 'high', 'very-high'],
|
|
327
|
-
help=(
|
|
328
|
-
'A priority, one of `very-low`, `low`, `medium`, `high` or'
|
|
329
|
-
' `very-high`. Defaults to `medium`.'
|
|
330
|
-
),
|
|
331
|
-
)
|
|
332
|
-
|
|
333
|
-
|
|
334
183
|
def add_tpu_type_argument(
|
|
335
184
|
custom_parser_or_group: ParserOrArgumentGroup,
|
|
336
185
|
required: bool = False,
|