xpk 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +5 -6
- xpk/commands/cluster.py +246 -73
- xpk/commands/cluster_gcluster.py +27 -0
- xpk/commands/common.py +40 -1
- xpk/commands/kjob_common.py +13 -1
- xpk/commands/run.py +4 -5
- xpk/commands/shell.py +2 -2
- xpk/commands/storage.py +24 -6
- xpk/commands/workload.py +66 -27
- xpk/core/blueprint/blueprint_generator.py +115 -47
- xpk/core/capacity.py +66 -6
- xpk/core/cluster.py +282 -13
- xpk/core/config.py +1 -65
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +145 -72
- xpk/core/filestore.py +2 -6
- xpk/core/gcsfuse.py +22 -4
- xpk/core/jobset.py +143 -0
- xpk/core/kjob.py +21 -18
- xpk/core/kueue.py +194 -4
- xpk/core/mtc.py +195 -0
- xpk/core/network.py +23 -1
- xpk/core/nodepool.py +17 -4
- xpk/core/pathways.py +2 -3
- xpk/core/resources.py +21 -0
- xpk/core/storage.py +1 -95
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +1 -45
- xpk/core/workload_decorators/rdma_decorator.py +8 -10
- xpk/core/workload_decorators/tcpx_decorator.py +185 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
- xpk/parser/cluster.py +589 -389
- xpk/parser/storage.py +12 -3
- xpk/parser/workload.py +21 -3
- xpk/utils/kubectl.py +4 -1
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/METADATA +178 -96
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/RECORD +41 -38
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +1 -1
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
xpk/core/storage.py
CHANGED
|
@@ -46,6 +46,7 @@ STORAGE_CRD_NAME = f"{XPK_API_GROUP_NAME}.{STORAGE_CRD_PLURAL}"
|
|
|
46
46
|
GCS_FUSE_TYPE = "gcsfuse"
|
|
47
47
|
GCP_FILESTORE_TYPE = "gcpfilestore"
|
|
48
48
|
PARALLELSTORE_TYPE = "parallelstore"
|
|
49
|
+
LUSTRE_TYPE = "lustre"
|
|
49
50
|
GCE_PD_TYPE = "pd"
|
|
50
51
|
MANIFESTS_PATH = os.path.abspath("xpkclusters/storage-manifests")
|
|
51
52
|
GCS_FUSE_ANNOTATIONS = {
|
|
@@ -365,101 +366,6 @@ def get_storage_annotations(storages: list[Storage]) -> list[str]:
|
|
|
365
366
|
return annotations
|
|
366
367
|
|
|
367
368
|
|
|
368
|
-
def get_storage_volume_mounts_yaml(storages: list[Storage]) -> str:
|
|
369
|
-
"""
|
|
370
|
-
Generates the YAML representation of the volumeMounts section for the given Storages.
|
|
371
|
-
|
|
372
|
-
This function creates the YAML snippet that defines how the storage volumes
|
|
373
|
-
should be mounted within a Pod's containers.
|
|
374
|
-
|
|
375
|
-
Args:
|
|
376
|
-
storages: A list of Storage objects.
|
|
377
|
-
|
|
378
|
-
Returns:
|
|
379
|
-
A string containing the YAML representation of the volumeMounts section.
|
|
380
|
-
"""
|
|
381
|
-
yaml_str = ""
|
|
382
|
-
for storage in storages:
|
|
383
|
-
yaml_str += f"""- name: {storage.pv}
|
|
384
|
-
mountPath: {storage.mount_point}
|
|
385
|
-
readOnly: {storage.readonly}
|
|
386
|
-
"""
|
|
387
|
-
return yaml_str
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
def get_storage_volumes_yaml(storages: list[Storage]) -> str:
|
|
391
|
-
"""
|
|
392
|
-
Generates the YAML representation of the volumes section for the given Storages.
|
|
393
|
-
|
|
394
|
-
This function creates the YAML snippet that defines the volumes to be
|
|
395
|
-
mounted in a Pod, including the PersistentVolumeClaim associated with
|
|
396
|
-
each Storage.
|
|
397
|
-
|
|
398
|
-
Args:
|
|
399
|
-
storages: A list of Storage objects.
|
|
400
|
-
|
|
401
|
-
Returns:
|
|
402
|
-
A string containing the YAML representation of the volumes section.
|
|
403
|
-
"""
|
|
404
|
-
yaml_str = ""
|
|
405
|
-
for storage in storages:
|
|
406
|
-
yaml_str += f"""- name: {storage.pv}
|
|
407
|
-
persistentVolumeClaim:
|
|
408
|
-
claimName: {storage.pvc}
|
|
409
|
-
readOnly: {storage.readonly}
|
|
410
|
-
"""
|
|
411
|
-
return yaml_str
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
def get_storage_volume_mounts_for_gpu(
|
|
415
|
-
storages: list[Storage],
|
|
416
|
-
) -> list[dict]:
|
|
417
|
-
"""
|
|
418
|
-
Generates the YAML representation of the volumeMounts section for the given Storages.
|
|
419
|
-
|
|
420
|
-
This function creates the list of storage specifications that define how the storage volumes
|
|
421
|
-
should be mounted within a Pod's containers.
|
|
422
|
-
|
|
423
|
-
Args:
|
|
424
|
-
storages: A list of Storage objects.
|
|
425
|
-
|
|
426
|
-
Returns:
|
|
427
|
-
A list containing the dictionary representation of the volumeMounts section.
|
|
428
|
-
"""
|
|
429
|
-
return [
|
|
430
|
-
{
|
|
431
|
-
"name": storage.pv,
|
|
432
|
-
"mountPath": storage.mount_point,
|
|
433
|
-
"readOnly": storage.readonly,
|
|
434
|
-
}
|
|
435
|
-
for storage in storages
|
|
436
|
-
]
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
def get_storage_volumes_yaml_for_gpu(storages: list[Storage]) -> str:
|
|
440
|
-
"""
|
|
441
|
-
Generates the YAML representation of the volumes section for the given Storages.
|
|
442
|
-
|
|
443
|
-
This function creates the YAML snippet that defines the volumes to be
|
|
444
|
-
mounted in a Pod, including the PersistentVolumeClaim associated with
|
|
445
|
-
each Storage.
|
|
446
|
-
|
|
447
|
-
Args:
|
|
448
|
-
storages: A list of Storage objects.
|
|
449
|
-
|
|
450
|
-
Returns:
|
|
451
|
-
A string containing the YAML representation of the volumes section.
|
|
452
|
-
"""
|
|
453
|
-
yaml_str = ""
|
|
454
|
-
for storage in storages:
|
|
455
|
-
yaml_str += f"""- name: {storage.pv}
|
|
456
|
-
persistentVolumeClaim:
|
|
457
|
-
claimName: {storage.pvc}
|
|
458
|
-
readOnly: {storage.readonly}
|
|
459
|
-
"""
|
|
460
|
-
return yaml_str
|
|
461
|
-
|
|
462
|
-
|
|
463
369
|
def get_storage_volumes_yaml_dict(storages: list[Storage]) -> list[dict]:
|
|
464
370
|
vols = []
|
|
465
371
|
for storage in storages:
|
xpk/core/workload.py
CHANGED
|
@@ -14,18 +14,9 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
import yaml
|
|
18
|
-
|
|
19
|
-
from ..utils import templates
|
|
20
17
|
from ..utils.console import xpk_exit, xpk_print
|
|
21
|
-
from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE
|
|
22
18
|
from .commands import run_command_for_value
|
|
23
19
|
from .gcloud_context import zone_to_region
|
|
24
|
-
from .storage import Storage, get_storage_volume_mounts_for_gpu
|
|
25
|
-
from .system_characteristics import SystemCharacteristics
|
|
26
|
-
|
|
27
|
-
RXDM_CONTAINER_A3HIGH_PATH = '/../templates/rxdm_container_a3high.yaml'
|
|
28
|
-
RXDM_CONTAINER_A3MEGA_PATH = '/../templates/rxdm_container_a3mega.yaml'
|
|
29
20
|
|
|
30
21
|
|
|
31
22
|
def workload_list_awk_command(filter_key) -> str:
|
|
@@ -131,7 +122,7 @@ def get_workload_list(args) -> tuple[int, str]:
|
|
|
131
122
|
)
|
|
132
123
|
workload_list_filter_job_cmd = determine_workload_list_filter_by_job(args)
|
|
133
124
|
command = (
|
|
134
|
-
f'kubectl get workloads -o=custom-columns="{s}" '
|
|
125
|
+
f'kubectl get workloads --ignore-not-found -o=custom-columns="{s}" '
|
|
135
126
|
f'{workload_list_filter_status_cmd} {workload_list_filter_job_cmd}'
|
|
136
127
|
)
|
|
137
128
|
|
|
@@ -249,38 +240,3 @@ def wait_for_job_completion(args) -> int:
|
|
|
249
240
|
xpk_print('Your workload did not complete successfully')
|
|
250
241
|
return 125
|
|
251
242
|
return 0
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
def add_gpu_rxdm_container(
|
|
255
|
-
jobset_manifest_str: str,
|
|
256
|
-
system: SystemCharacteristics,
|
|
257
|
-
all_storages: list[Storage],
|
|
258
|
-
) -> str:
|
|
259
|
-
"""Add gpu rxdm container to jobset manifest based on user provided arguments.
|
|
260
|
-
|
|
261
|
-
Args:
|
|
262
|
-
jobset_manifest_str: the JobSet manifest as a YAML string.
|
|
263
|
-
system: system characteristics.
|
|
264
|
-
all_storages: list of all storages.
|
|
265
|
-
|
|
266
|
-
Returns:
|
|
267
|
-
str: the modified JobSet manifest as a YAML string.
|
|
268
|
-
"""
|
|
269
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
270
|
-
gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3HIGH_PATH)
|
|
271
|
-
elif system.device_type == H100_MEGA_DEVICE_TYPE:
|
|
272
|
-
gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3MEGA_PATH)
|
|
273
|
-
else:
|
|
274
|
-
return jobset_manifest_str
|
|
275
|
-
|
|
276
|
-
storage_volume_mounts = get_storage_volume_mounts_for_gpu(all_storages)
|
|
277
|
-
gpu_rxdm_container['volumeMounts'].extend(storage_volume_mounts)
|
|
278
|
-
|
|
279
|
-
manifest = yaml.safe_load(jobset_manifest_str)
|
|
280
|
-
|
|
281
|
-
for job in manifest['spec']['replicatedJobs']:
|
|
282
|
-
job['template']['spec']['template']['spec']['containers'].append(
|
|
283
|
-
gpu_rxdm_container
|
|
284
|
-
)
|
|
285
|
-
|
|
286
|
-
return yaml.dump(manifest, sort_keys=False)
|
|
@@ -68,22 +68,20 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
|
68
68
|
|
|
69
69
|
|
|
70
70
|
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
  """Build the `networking.gke.io/interfaces` annotation key and value.

  Each sub-network is mapped, by its position in the list, to an interface
  named eth0, eth1, ... and rendered as one JSON object per line inside a
  JSON array.

  Args:
    sub_networks: Network names, in interface order.

  Returns:
    A pair of (annotation key, annotation value wrapped by literal_string).
  """
  rendered = []
  for index, network in enumerate(sub_networks):
    rendered.append(f' {{"interfaceName":"eth{index}","network":"{network}"}}')
  body = ',\n'.join(rendered)
  return 'networking.gke.io/interfaces', literal_string('[\n' + body + '\n]')
|
|
81
77
|
|
|
82
78
|
|
|
83
79
|
def add_annotations(job_manifest: dict, sub_networks: list[str]):
|
|
84
80
|
"""Adds or updates annotations in the Pod template."""
|
|
85
81
|
annotations = job_manifest['spec']['template']['metadata']['annotations']
|
|
86
82
|
interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
|
|
83
|
+
if annotations is None:
|
|
84
|
+
annotations = {}
|
|
87
85
|
annotations.update({
|
|
88
86
|
'networking.gke.io/default-interface': 'eth0',
|
|
89
87
|
interfaces_key: interfaces_value,
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import yaml
|
|
18
|
+
|
|
19
|
+
from ...utils.yaml import literal_string
|
|
20
|
+
|
|
21
|
+
# Component version
|
|
22
|
+
tcpx = 'v2.0.11'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def decorate_kjob_template(job_manifest: dict) -> dict:
  """Apply the TCPX pod-level decorations to a kjob template manifest.

  Runs, in order: add_volumes, add_tolerations, add_tcpx_daemon_container and
  update_gpu_containers. Annotations are not touched by this entry point.

  Args:
    job_manifest: Job manifest dictionary; it is modified in place.

  Returns:
    The same `job_manifest` object that was passed in.
  """
  steps = (
      add_volumes,
      add_tolerations,
      add_tcpx_daemon_container,
      update_gpu_containers,
  )
  for step in steps:
    step(job_manifest)
  return job_manifest
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def decorate_job(job_manifest: dict) -> dict:
  """Apply every TCPX decoration to a Job manifest.

  Runs, in order: add_annotations, add_volumes, add_tolerations,
  add_tcpx_daemon_container and update_gpu_containers.

  Args:
    job_manifest: Job manifest dictionary; it is modified in place.

  Returns:
    The same `job_manifest` object that was passed in.
  """
  steps = (
      add_annotations,
      add_volumes,
      add_tolerations,
      add_tcpx_daemon_container,
      update_gpu_containers,
  )
  for step in steps:
    step(job_manifest)
  return job_manifest
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def decorate_jobset(jobset_manifest_str: str) -> str:
  """Decorates a JobSet manifest with the components needed for tcpx-daemon.

  The manifest is parsed with yaml.safe_load, the `template` of every entry in
  spec.replicatedJobs is passed through decorate_job, and the result is
  serialized again with key order preserved (sort_keys=False).

  Args:
    jobset_manifest_str: The JobSet manifest as a YAML string.

  Returns:
    The modified JobSet manifest as a YAML string.

  Raises:
    KeyError: If the manifest has no spec.replicatedJobs, or a replicated job
      has no `template`.
  """
  manifest = yaml.safe_load(jobset_manifest_str)

  for job in manifest['spec']['replicatedJobs']:
    # decorate_job mutates the template in place and returns that same dict,
    # so the return value does not need to be kept.
    decorate_job(job['template'])
  return yaml.dump(manifest, sort_keys=False)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_interfaces_annotation() -> dict:
  """Return the fixed `networking.gke.io/interfaces` annotation for TCPX.

  The value is a JSON array, one object per line, that maps eth0 to the
  `default` network and eth1..eth4 to vpc1..vpc4.

  Returns:
    A single-entry dict whose value is wrapped by literal_string.
  """
  networks = ['default', 'vpc1', 'vpc2', 'vpc3', 'vpc4']
  entries = ',\n'.join(
      f' {{"interfaceName":"eth{index}","network":"{network}"}}'
      for index, network in enumerate(networks)
  )
  return {'networking.gke.io/interfaces': literal_string(f'[\n{entries}\n]')}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_tcpx_deamon_annotation() -> dict:
  """Return the `devices.gke.io/container.tcpx-daemon` annotation.

  The value is a newline-terminated list of `- path: <device>` entries
  covering /dev/nvidia0 through /dev/nvidia7, /dev/nvidiactl and
  /dev/nvidia-uvm.

  Returns:
    A single-entry dict whose value is wrapped by literal_string.
  """
  device_paths = [f'/dev/nvidia{index}' for index in range(8)]
  device_paths += ['/dev/nvidiactl', '/dev/nvidia-uvm']
  listing = ''.join(f'- path: {path}\n' for path in device_paths)
  return {'devices.gke.io/container.tcpx-daemon': literal_string(listing)}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def add_annotations(job_manifest: dict):
  """Adds or updates the TCPX annotations in the Pod template.

  Any missing level of spec.template.metadata.annotations is created. The
  tcpx-daemon device annotation, the default-interface annotation (`eth0`) and
  the interfaces annotation are then written, overwriting existing values
  stored under the same keys.

  Args:
    job_manifest: Job manifest dictionary; it is modified in place.
  """
  pod_template = job_manifest.setdefault('spec', {}).setdefault('template', {})
  metadata = pod_template.setdefault('metadata', {})
  annotations: dict = metadata.setdefault('annotations', {})
  for extra in (
      get_tcpx_deamon_annotation(),
      {'networking.gke.io/default-interface': 'eth0'},
      get_interfaces_annotation(),
  ):
    annotations.update(extra)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def add_tolerations(job_manifest: dict):
  """Adds the `user-workload` toleration to the Pod spec.

  Any missing level of spec.template.spec.tolerations is created, then one
  toleration (key `user-workload`, operator `Equal`, value `true`, effect
  `NoSchedule`) is appended after any tolerations already present.

  Args:
    job_manifest: Job manifest dictionary; it is modified in place.
  """
  pod_spec = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('spec', {})
  )
  user_workload_toleration = dict(
      key='user-workload',
      operator='Equal',
      value='true',
      effect='NoSchedule',
  )
  pod_spec.setdefault('tolerations', []).append(user_workload_toleration)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def add_volumes(job_manifest: dict):
  """Adds the volumes used by tcpx-daemon and the GPU containers to the Pod spec.

  Any missing level of spec.template.spec.volumes is created. A volume is only
  appended when no volume with the same name is already present, so the
  function can run on a manifest that already defines some of them (or run
  twice) without producing duplicate volume names.

  Volumes ensured: `libraries`, `sys`, `proc-sys` (host paths), `dshm`
  (memory-backed emptyDir, 128Gi limit) and `tcpx-socket` (emptyDir).

  Args:
    job_manifest: Job manifest dictionary; it is modified in place.
  """
  volumes: list = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('spec', {})
      .setdefault('volumes', [])
  )
  required_volumes = [
      {
          'name': 'libraries',
          'hostPath': {'path': '/home/kubernetes/bin/nvidia/lib64'},
      },
      {'name': 'sys', 'hostPath': {'path': '/sys'}},
      {'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}},
      {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}},
      # The tcpx-daemon container and the GPU containers both mount a volume
      # named 'tcpx-socket'; without a matching volume the Pod spec refers to
      # a volume that does not exist.
      {'name': 'tcpx-socket', 'emptyDir': {}},
  ]
  present = {
      volume.get('name') for volume in volumes if isinstance(volume, dict)
  }
  for volume in required_volumes:
    if volume['name'] in present:
      continue
    volumes.append(volume)
    present.add(volume['name'])
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def add_tcpx_daemon_container(job_manifest):
  """Adds the tcpx-daemon container to the Pod spec.

  The container is appended to spec.template.spec.initContainers (the list is
  created when absent) and carries restartPolicy `Always`. Its image tag comes
  from the module-level `tcpx` version constant.

  Args:
    job_manifest: Job manifest dictionary; it is modified in place.

  Raises:
    KeyError: If job_manifest has no spec.template.spec.
  """
  daemon_command = [
      '/tcpgpudmarxd/build/app/tcpgpudmarxd',
      '--gpu_nic_preset',
      'a3vm',
      '--gpu_shmem_type',
      'fd',
      '--uds_path',
      '/run/tcpx',
      '--setup_param',
      '"--verbose 128 2 0 "',
  ]
  daemon_mounts = [
      {'name': volume_name, 'mountPath': mount_path}
      for volume_name, mount_path in (
          ('libraries', '/usr/local/nvidia/lib64'),
          ('tcpx-socket', '/run/tcpx'),
          ('sys', '/hostsysfs'),
          ('proc-sys', '/hostprocsysfs'),
      )
  ]
  # Key order is kept as-is: the manifest may later be dumped with
  # sort_keys=False, which preserves insertion order.
  tcpx_daemon_container = {
      'name': 'tcpx-daemon',
      'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{tcpx}',
      'imagePullPolicy': 'Always',
      'restartPolicy': 'Always',
      'command': daemon_command,
      'securityContext': {'capabilities': {'add': ['NET_ADMIN']}},
      'volumeMounts': daemon_mounts,
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
  }
  pod_spec = job_manifest['spec']['template']['spec']
  pod_spec.setdefault('initContainers', []).append(tcpx_daemon_container)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def update_gpu_containers(job_manifest):
  """Adds the TCPX env var and volume mounts to every GPU container.

  A container counts as a GPU container when its resources.limits contains
  the key `nvidia.com/gpu`. Each such container gets LD_LIBRARY_PATH appended
  to `env`, and mounts for `tcpx-socket` (/tmp), `libraries`
  (/usr/local/nvidia/lib64) and `dshm` (/dev/shm) appended to `volumeMounts`.
  Other containers are left untouched.

  Args:
    job_manifest: Job manifest dictionary; it is modified in place.

  Raises:
    KeyError: If job_manifest has no spec.template.spec.containers.
  """
  for container in job_manifest['spec']['template']['spec']['containers']:
    # A key that is present but empty in YAML (e.g. `resources:`) loads as
    # None, so `.get(key, {})` alone is not enough to guarantee a dict.
    limits = (container.get('resources') or {}).get('limits') or {}
    if 'nvidia.com/gpu' not in limits:
      continue

    # Same consideration for `env:` / `volumeMounts:` that are present but
    # null: replace them with a fresh list before appending.
    env = container.get('env')
    if env is None:
      env = container['env'] = []
    env.append({'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'})

    volume_mounts = container.get('volumeMounts')
    if volume_mounts is None:
      volume_mounts = container['volumeMounts'] = []
    volume_mounts.extend([
        {'name': 'tcpx-socket', 'mountPath': '/tmp'},
        {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'},
        {'name': 'dshm', 'mountPath': '/dev/shm'},
    ])
|
|
@@ -15,6 +15,7 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
import yaml
|
|
18
|
+
|
|
18
19
|
from ...utils.yaml import literal_string
|
|
19
20
|
|
|
20
21
|
# Component version
|
|
@@ -77,16 +78,12 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
|
77
78
|
|
|
78
79
|
|
|
79
80
|
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
  """Build the `networking.gke.io/interfaces` annotation key and value.

  Each sub-network is mapped, by its position in the list, to an interface
  named eth0, eth1, ... and rendered as one JSON object per line inside a
  JSON array.

  Args:
    sub_networks: Network names, in interface order.

  Returns:
    A pair of (annotation key, annotation value wrapped by literal_string).
  """
  rendered = []
  for index, network in enumerate(sub_networks):
    rendered.append(f' {{"interfaceName":"eth{index}","network":"{network}"}}')
  body = ',\n'.join(rendered)
  return 'networking.gke.io/interfaces', literal_string('[\n' + body + '\n]')
|
|
90
87
|
|
|
91
88
|
|
|
92
89
|
def get_tcpxo_deamon_entry() -> tuple[str, str]:
|
|
@@ -107,7 +104,11 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
|
|
|
107
104
|
|
|
108
105
|
def add_annotations(job_manifest: dict, sub_networks: list[str]):
|
|
109
106
|
"""Adds or updates annotations in the Pod template."""
|
|
110
|
-
|
|
107
|
+
metadata = job_manifest['spec']['template']['metadata']
|
|
108
|
+
annotations = metadata.get('annotations')
|
|
109
|
+
if annotations is None:
|
|
110
|
+
annotations = {}
|
|
111
|
+
metadata['annotations'] = annotations
|
|
111
112
|
tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
|
|
112
113
|
interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
|
|
113
114
|
annotations.update({
|
|
@@ -141,6 +142,9 @@ def add_volumes(job_manifest):
|
|
|
141
142
|
'name': 'aperture-devices',
|
|
142
143
|
'hostPath': {'path': '/dev/aperture_devices'},
|
|
143
144
|
})
|
|
145
|
+
volumes.append(
|
|
146
|
+
{'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
|
|
147
|
+
)
|
|
144
148
|
|
|
145
149
|
|
|
146
150
|
def add_tcpxo_daemon_container(job_manifest):
|
|
@@ -149,6 +153,7 @@ def add_tcpxo_daemon_container(job_manifest):
|
|
|
149
153
|
'name': 'tcpxo-daemon',
|
|
150
154
|
'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}',
|
|
151
155
|
'imagePullPolicy': 'Always',
|
|
156
|
+
'restartPolicy': 'Always',
|
|
152
157
|
'command': ['/bin/sh', '-c'],
|
|
153
158
|
'args': [
|
|
154
159
|
'set -ex\nchmod 755'
|
|
@@ -165,9 +170,9 @@ def add_tcpxo_daemon_container(job_manifest):
|
|
|
165
170
|
],
|
|
166
171
|
'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
|
|
167
172
|
}
|
|
168
|
-
job_manifest['spec']['template']['spec']
|
|
169
|
-
|
|
170
|
-
)
|
|
173
|
+
spec = job_manifest['spec']['template']['spec']
|
|
174
|
+
spec.setdefault('initContainers', [])
|
|
175
|
+
spec['initContainers'].append(tcpxo_daemon_container)
|
|
171
176
|
|
|
172
177
|
|
|
173
178
|
def update_gpu_containers(job_manifest):
|
|
@@ -188,3 +193,6 @@ def update_gpu_containers(job_manifest):
|
|
|
188
193
|
container['volumeMounts'].append(
|
|
189
194
|
{'name': 'libraries', 'mountPath': '/usr/local/nvidia'}
|
|
190
195
|
)
|
|
196
|
+
container['volumeMounts'].append(
|
|
197
|
+
{'name': 'dshm', 'mountPath': '/dev/shm'}
|
|
198
|
+
)
|