xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -13
- xpk/commands/cluster.py +240 -71
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/common.py +33 -1
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +30 -18
- xpk/commands/run.py +17 -12
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +75 -19
- xpk/commands/workload.py +161 -324
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +335 -45
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +193 -12
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +5 -1
- xpk/core/gcsfuse.py +27 -6
- xpk/core/kjob.py +66 -20
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/nap.py +4 -0
- xpk/core/network.py +34 -22
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/resources.py +21 -0
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +28 -83
- xpk/core/workload_decorators/rdma_decorator.py +11 -15
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
- xpk/parser/cluster.py +574 -381
- xpk/parser/storage.py +25 -5
- xpk/parser/workload.py +59 -31
- xpk/utils/kubectl.py +4 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/core/workload.py
CHANGED
|
@@ -14,12 +14,19 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import yaml
|
|
18
|
+
|
|
19
|
+
from ..utils import templates
|
|
17
20
|
from ..utils.console import xpk_exit, xpk_print
|
|
18
21
|
from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE
|
|
19
22
|
from .commands import run_command_for_value
|
|
20
23
|
from .gcloud_context import zone_to_region
|
|
24
|
+
from .storage import Storage, get_storage_volume_mounts_for_gpu
|
|
21
25
|
from .system_characteristics import SystemCharacteristics
|
|
22
26
|
|
|
27
|
+
RXDM_CONTAINER_A3HIGH_PATH = '/../templates/rxdm_container_a3high.yaml'
|
|
28
|
+
RXDM_CONTAINER_A3MEGA_PATH = '/../templates/rxdm_container_a3mega.yaml'
|
|
29
|
+
|
|
23
30
|
|
|
24
31
|
def workload_list_awk_command(filter_key) -> str:
|
|
25
32
|
"""Function returns the awk command needed from the filter specified.
|
|
@@ -124,7 +131,7 @@ def get_workload_list(args) -> tuple[int, str]:
|
|
|
124
131
|
)
|
|
125
132
|
workload_list_filter_job_cmd = determine_workload_list_filter_by_job(args)
|
|
126
133
|
command = (
|
|
127
|
-
f'kubectl get workloads -o=custom-columns="{s}" '
|
|
134
|
+
f'kubectl get workloads --ignore-not-found -o=custom-columns="{s}" '
|
|
128
135
|
f'{workload_list_filter_status_cmd} {workload_list_filter_job_cmd}'
|
|
129
136
|
)
|
|
130
137
|
|
|
@@ -244,98 +251,36 @@ def wait_for_job_completion(args) -> int:
|
|
|
244
251
|
return 0
|
|
245
252
|
|
|
246
253
|
|
|
247
|
-
def
|
|
248
|
-
|
|
254
|
+
def add_gpu_rxdm_container(
|
|
255
|
+
jobset_manifest_str: str,
|
|
256
|
+
system: SystemCharacteristics,
|
|
257
|
+
all_storages: list[Storage],
|
|
258
|
+
) -> str:
|
|
259
|
+
"""Add gpu rxdm container to jobset manifest based on user provided arguments.
|
|
249
260
|
|
|
250
261
|
Args:
|
|
262
|
+
jobset_manifest_str: the JobSet manifest as a YAML string.
|
|
251
263
|
system: system characteristics.
|
|
264
|
+
all_storages: list of all storages.
|
|
252
265
|
|
|
253
266
|
Returns:
|
|
254
|
-
str:
|
|
267
|
+
str: the modified JobSet manifest as a YAML string.
|
|
255
268
|
"""
|
|
256
|
-
gpu_volume = ''
|
|
257
269
|
if system.device_type == H100_DEVICE_TYPE:
|
|
258
|
-
|
|
259
|
-
hostPath:
|
|
260
|
-
path: /home/kubernetes/bin/nvidia/lib64
|
|
261
|
-
- name: tcpd-socket
|
|
262
|
-
hostPath:
|
|
263
|
-
path: /run/tcpx
|
|
264
|
-
- name: shared-memory
|
|
265
|
-
emptyDir:
|
|
266
|
-
medium: "Memory"
|
|
267
|
-
sizeLimit: 200Gi
|
|
268
|
-
- name: workload-terminated-volume
|
|
269
|
-
emptyDir:
|
|
270
|
-
- name: tcpx-nccl-plugin-volume
|
|
271
|
-
emptyDir:"""
|
|
270
|
+
gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3HIGH_PATH)
|
|
272
271
|
elif system.device_type == H100_MEGA_DEVICE_TYPE:
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
- name: shared-memory
|
|
277
|
-
emptyDir:
|
|
278
|
-
medium: "Memory"
|
|
279
|
-
sizeLimit: 1Gi
|
|
280
|
-
- name: workload-terminated-volume
|
|
281
|
-
emptyDir:"""
|
|
282
|
-
return gpu_volume
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
def get_gpu_rxdm_image(system: SystemCharacteristics) -> str:
|
|
286
|
-
"""Get config of rxdm based on user provided arguments.
|
|
287
|
-
|
|
288
|
-
Args:
|
|
289
|
-
system: system characteristics.
|
|
290
|
-
|
|
291
|
-
Returns:
|
|
292
|
-
str: yaml containing the rxdm name and image
|
|
293
|
-
"""
|
|
294
|
-
gpu_rxdm_image = ''
|
|
295
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
296
|
-
gpu_rxdm_image = """- name: tcpd-daemon
|
|
297
|
-
image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9"""
|
|
298
|
-
elif system.device_type == H100_MEGA_DEVICE_TYPE:
|
|
299
|
-
gpu_rxdm_image = """- name: fastrak-daemon
|
|
300
|
-
image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.9"""
|
|
301
|
-
return gpu_rxdm_image
|
|
272
|
+
gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3MEGA_PATH)
|
|
273
|
+
else:
|
|
274
|
+
return jobset_manifest_str
|
|
302
275
|
|
|
276
|
+
storage_volume_mounts = get_storage_volume_mounts_for_gpu(all_storages)
|
|
277
|
+
gpu_rxdm_container['volumeMounts'].extend(storage_volume_mounts)
|
|
303
278
|
|
|
304
|
-
|
|
305
|
-
"""Get rxdm command based on user provided arguments.
|
|
279
|
+
manifest = yaml.safe_load(jobset_manifest_str)
|
|
306
280
|
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
Returns:
|
|
311
|
-
str: command of running rxdm container
|
|
312
|
-
"""
|
|
313
|
-
gpu_rxdm_cmd = ''
|
|
314
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
315
|
-
gpu_rxdm_cmd = (
|
|
316
|
-
'/tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm'
|
|
317
|
-
' --gpu_shmem_type fd --setup_param "--verbose 128 2 0"'
|
|
281
|
+
for job in manifest['spec']['replicatedJobs']:
|
|
282
|
+
job['template']['spec']['template']['spec']['containers'].append(
|
|
283
|
+
gpu_rxdm_container
|
|
318
284
|
)
|
|
319
|
-
elif system.device_type == H100_MEGA_DEVICE_TYPE:
|
|
320
|
-
gpu_rxdm_cmd = (
|
|
321
|
-
'set -ex; chmod 755 /fts/entrypoint_rxdm_container.sh;'
|
|
322
|
-
' /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid='
|
|
323
|
-
' --alsologtostderr'
|
|
324
|
-
)
|
|
325
|
-
return gpu_rxdm_cmd
|
|
326
|
-
|
|
327
285
|
|
|
328
|
-
|
|
329
|
-
"""Get gpu tcp volume based on user provided arguments.
|
|
330
|
-
|
|
331
|
-
Args:
|
|
332
|
-
system: system characteristics.
|
|
333
|
-
|
|
334
|
-
Returns:
|
|
335
|
-
str: yaml containing gpu tcp volume
|
|
336
|
-
"""
|
|
337
|
-
gpu_tcp_volume = ''
|
|
338
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
339
|
-
gpu_tcp_volume = """- name: tcpd-socket
|
|
340
|
-
mountPath: /tmp"""
|
|
341
|
-
return gpu_tcp_volume
|
|
286
|
+
return yaml.dump(manifest, sort_keys=False)
|
|
@@ -33,7 +33,7 @@ def decorate_kjob_template(job_manifest) -> str:
|
|
|
33
33
|
return job_manifest
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
|
|
36
|
+
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
37
37
|
"""
|
|
38
38
|
Decorates a JobSet manifest with the necessary components for rdma-daemon.
|
|
39
39
|
|
|
@@ -68,24 +68,20 @@ def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
|
|
|
68
68
|
|
|
69
69
|
|
|
70
70
|
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
  """Build the multi-network interfaces annotation entry for a Pod.

  Args:
    sub_networks: network names; the network at position i is paired with
      the interface name eth{i}.

  Returns:
    A (key, value) pair: the key is 'networking.gke.io/interfaces' and the
    value is a bracketed, comma-separated list with one
    {"interfaceName", "network"} object per network, passed through
    literal_string.
  """
  rendered = []
  for index, network in enumerate(sub_networks):
    rendered.append(
        f' {{"interfaceName":"eth{index}","network":"{network}"}}'
    )
  body = ',\n'.join(rendered)
  return 'networking.gke.io/interfaces', literal_string('[\n' + body + '\n]')
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def add_annotations(job_manifest: dict, sub_networks: list[str]):
|
|
84
80
|
"""Adds or updates annotations in the Pod template."""
|
|
85
81
|
annotations = job_manifest['spec']['template']['metadata']['annotations']
|
|
86
82
|
interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
|
|
87
83
|
annotations.update({
|
|
88
|
-
'networking.gke.io/default-interface':
|
|
84
|
+
'networking.gke.io/default-interface': 'eth0',
|
|
89
85
|
interfaces_key: interfaces_value,
|
|
90
86
|
})
|
|
91
87
|
|
|
@@ -16,7 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import yaml
|
|
18
18
|
|
|
19
|
-
from ...core.storage import GCS_FUSE_TYPE, get_storage_volumes_yaml_dict,
|
|
19
|
+
from ...core.storage import GCS_FUSE_TYPE, PARALLELSTORE_TYPE, get_storage_volumes_yaml_dict, GCS_FUSE_ANNOTATIONS, PARALLELSTORE_ANNOTATIONS
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
def decorate_jobset(jobset_manifest_str, storages) -> str:
|
|
@@ -42,9 +42,14 @@ def decorate_jobset(jobset_manifest_str, storages) -> str:
|
|
|
42
42
|
def add_annotations(job_manifest, storages):
  """Merge storage-specific annotations into the Pod template.

  Args:
    job_manifest: Job manifest dict; its
      spec.template.metadata.annotations mapping is updated in place and
      must already exist.
    storages: storages attached to the workload; each one exposes a `type`
      attribute that is compared against the known storage type constants.
  """
  annotations = job_manifest['spec']['template']['metadata']['annotations']
  # GCS FUSE is handled first, then Parallelstore, so on a key collision the
  # Parallelstore annotations win.
  per_type_annotations = (
      (GCS_FUSE_TYPE, GCS_FUSE_ANNOTATIONS),
      (PARALLELSTORE_TYPE, PARALLELSTORE_ANNOTATIONS),
  )
  for storage_type, extra_annotations in per_type_annotations:
    if any(storage.type == storage_type for storage in storages):
      annotations.update(extra_annotations)
|
|
48
53
|
|
|
49
54
|
|
|
50
55
|
def add_volumes(job_manifest, storage_volumes):
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import yaml
|
|
18
|
+
|
|
19
|
+
from ...utils.yaml import literal_string
|
|
20
|
+
|
|
21
|
+
# Component version
|
|
22
|
+
tcpx = 'v2.0.11'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def decorate_kjob_template(job_manifest: dict) -> dict:
  """Apply the TCPX decorations to a kjob template manifest.

  Adds the volumes, tolerations and daemon container, then updates the GPU
  containers. Unlike decorate_job, no annotations are added here.

  Args:
    job_manifest: the Job manifest dict; it is modified in place.

  Returns:
    The same job_manifest object that was passed in.
  """
  decoration_steps = (
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  )
  for apply_step in decoration_steps:
    apply_step(job_manifest)
  return job_manifest
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def decorate_job(job_manifest: dict) -> dict:
  """Apply every TCPX decoration to a Job manifest.

  Runs, in order: annotations, volumes, tolerations, the daemon container,
  and the GPU container updates.

  Args:
    job_manifest: the Job manifest dict; it is modified in place.

  Returns:
    The same job_manifest object that was passed in.
  """
  decoration_steps = (
      add_annotations,
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  )
  for apply_step in decoration_steps:
    apply_step(job_manifest)
  return job_manifest
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def decorate_jobset(jobset_manifest_str: str) -> str:
  """Decorate every replicated job of a JobSet with the tcpx-daemon setup.

  Args:
    jobset_manifest_str: the JobSet manifest as a YAML string.

  Returns:
    The decorated JobSet manifest serialized back to a YAML string, with the
    original key order kept.
  """
  jobset = yaml.safe_load(jobset_manifest_str)
  for replicated_job in jobset['spec']['replicatedJobs']:
    # decorate_job mutates the template dict in place, so the parsed
    # manifest picks up the changes without reassignment.
    decorate_job(replicated_job['template'])
  return yaml.dump(jobset, sort_keys=False)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_interfaces_annotation() -> dict:
  """Build the fixed multi-network interfaces annotation for TCPX Pods.

  Returns:
    A one-entry dict keyed by 'networking.gke.io/interfaces'. The value maps
    eth0 to the 'default' network and eth1..eth4 to 'vpc1'..'vpc4', rendered
    as a bracketed list and passed through literal_string.
  """
  networks = ('default', 'vpc1', 'vpc2', 'vpc3', 'vpc4')
  entries = ',\n'.join(
      f' {{"interfaceName":"eth{index}","network":"{network}"}}'
      for index, network in enumerate(networks)
  )
  rendered = '[\n' + entries + '\n]'
  return {'networking.gke.io/interfaces': literal_string(rendered)}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_tcpx_deamon_annotation() -> dict:
  """Build the device annotation for the tcpx-daemon container.

  Returns:
    A one-entry dict keyed by 'devices.gke.io/container.tcpx-daemon'. The
    value lists /dev/nvidia0 through /dev/nvidia7, /dev/nvidiactl and
    /dev/nvidia-uvm, one '- path: ...' line each, passed through
    literal_string.
  """
  device_paths = [f'/dev/nvidia{index}' for index in range(8)]
  device_paths += ['/dev/nvidiactl', '/dev/nvidia-uvm']
  rendered = ''.join(f'- path: {path}\n' for path in device_paths)
  return {'devices.gke.io/container.tcpx-daemon': literal_string(rendered)}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def add_annotations(job_manifest: dict):
  """Adds or updates the TCPX annotations in the Pod template.

  Sets the tcpx-daemon device annotation, the default interface (eth0) and
  the multi-network interfaces annotation. Missing intermediate keys
  (spec, template, metadata) are created.

  Args:
    job_manifest: the Job manifest dict; it is modified in place.
  """
  metadata: dict = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('metadata', {})
  )
  annotations = metadata.get('annotations')
  # A manifest with an explicit but empty `annotations:` key parses to None;
  # setdefault would hand that None back and the updates below would fail.
  if annotations is None:
    annotations = {}
    metadata['annotations'] = annotations
  annotations.update(get_tcpx_deamon_annotation())
  annotations.update({'networking.gke.io/default-interface': 'eth0'})
  annotations.update(get_interfaces_annotation())
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def add_tolerations(job_manifest: dict):
  """Adds the user-workload toleration to the Pod spec.

  Missing intermediate keys (spec, template, spec) are created. An existing
  tolerations list is extended in place.

  Args:
    job_manifest: the Job manifest dict; it is modified in place.
  """
  pod_spec: dict = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('spec', {})
  )
  tolerations = pod_spec.get('tolerations')
  # An explicit but empty `tolerations:` key parses to None; replace it with
  # a real list instead of failing on None.append.
  if tolerations is None:
    tolerations = []
    pod_spec['tolerations'] = tolerations
  tolerations.append({
      'key': 'user-workload',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  })
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def add_volumes(job_manifest: dict):
  """Adds the host volumes needed by the tcpx-daemon to the Pod spec.

  Ensures the 'libraries', 'sys' and 'proc-sys' hostPath volumes exist.
  Missing intermediate keys (spec, template, spec) are created. A volume
  whose name is already present is left untouched, because Pod volume names
  must be unique.

  Args:
    job_manifest: the Job manifest dict; it is modified in place.
  """
  pod_spec: dict = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('spec', {})
  )
  volumes = pod_spec.get('volumes')
  # An explicit but empty `volumes:` key parses to None; replace it with a
  # real list instead of failing on None.append.
  if volumes is None:
    volumes = []
    pod_spec['volumes'] = volumes

  # NOTE(review): the daemon container and the GPU containers in this module
  # mount a volume named 'tcpx-socket', but it is not defined here -
  # confirm it is supplied by the base manifest.
  required_volumes = [
      {
          'name': 'libraries',
          'hostPath': {'path': '/home/kubernetes/bin/nvidia/lib64'},
      },
      {'name': 'sys', 'hostPath': {'path': '/sys'}},
      {'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}},
  ]
  present_names = {volume.get('name') for volume in volumes}
  volumes.extend(
      volume
      for volume in required_volumes
      if volume['name'] not in present_names
  )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def add_tcpxo_daemon_container(job_manifest):
  """Appends the tcpx-daemon container to the Pod's initContainers.

  The container is declared with restartPolicy 'Always' and uses the
  tcpgpudmarxd-dev image at the module-level `tcpx` version. The
  initContainers list is created when the Pod spec has none.

  Args:
    job_manifest: the Job manifest dict; spec.template.spec must already
      exist and is modified in place.
  """
  nvidia_lib_dir = '/usr/local/nvidia/lib64'
  daemon_command = [
      '/tcpgpudmarxd/build/app/tcpgpudmarxd',
      '--gpu_nic_preset',
      'a3vm',
      '--gpu_shmem_type',
      'fd',
      '--uds_path',
      '/run/tcpx',
      '--setup_param',
      '"--verbose 128 2 0 "',
  ]
  daemon_mounts = [
      {'name': 'libraries', 'mountPath': nvidia_lib_dir},
      {'name': 'tcpx-socket', 'mountPath': '/run/tcpx'},
      {'name': 'sys', 'mountPath': '/hostsysfs'},
      {'name': 'proc-sys', 'mountPath': '/hostprocsysfs'},
  ]
  # Key order is kept as-is because manifests are dumped with
  # sort_keys=False elsewhere in this module.
  daemon = {
      'name': 'tcpx-daemon',
      'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{tcpx}',
      'imagePullPolicy': 'Always',
      'restartPolicy': 'Always',
      'command': daemon_command,
      'securityContext': {'capabilities': {'add': ['NET_ADMIN']}},
      'volumeMounts': daemon_mounts,
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': nvidia_lib_dir}],
  }
  pod_spec = job_manifest['spec']['template']['spec']
  pod_spec.setdefault('initContainers', []).append(daemon)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def update_gpu_containers(job_manifest):
  """Wires every GPU container to the NVIDIA libraries and the TCPX socket.

  A container counts as a GPU container when its resources.limits contain
  'nvidia.com/gpu'. Each such container gets an LD_LIBRARY_PATH env entry
  plus the 'tcpx-socket' (at /tmp) and 'libraries' volume mounts appended;
  other containers are left untouched.

  Args:
    job_manifest: the Job manifest dict; spec.template.spec.containers must
      already exist and is modified in place.
  """
  nvidia_lib_dir = '/usr/local/nvidia/lib64'
  pod_spec = job_manifest['spec']['template']['spec']
  for container in pod_spec['containers']:
    limits = container.get('resources', {}).get('limits', {})
    if 'nvidia.com/gpu' not in limits:
      continue
    container.setdefault('env', []).append(
        {'name': 'LD_LIBRARY_PATH', 'value': nvidia_lib_dir}
    )
    container.setdefault('volumeMounts', []).extend([
        {'name': 'tcpx-socket', 'mountPath': '/tmp'},
        {'name': 'libraries', 'mountPath': nvidia_lib_dir},
    ])
|
|
@@ -57,7 +57,7 @@ def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
|
|
|
57
57
|
return job_manifest
|
|
58
58
|
|
|
59
59
|
|
|
60
|
-
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
|
|
60
|
+
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
61
61
|
"""
|
|
62
62
|
Decorates a JobSet manifest with the necessary components for tcpxo-daemon.
|
|
63
63
|
|
|
@@ -77,16 +77,12 @@ def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
|
|
|
77
77
|
|
|
78
78
|
|
|
79
79
|
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
  """Build the multi-network interfaces annotation entry for a Pod.

  Args:
    sub_networks: network names; the network at position i is paired with
      the interface name eth{i}.

  Returns:
    A (key, value) pair: the key is 'networking.gke.io/interfaces' and the
    value is a bracketed, comma-separated list with one
    {"interfaceName", "network"} object per network, passed through
    literal_string.
  """
  annotation_key = 'networking.gke.io/interfaces'
  interface_lines = ',\n'.join(
      f' {{"interfaceName":"eth{position}","network":"{name}"}}'
      for position, name in enumerate(sub_networks)
  )
  annotation_value = literal_string('[\n' + interface_lines + '\n]')
  return annotation_key, annotation_value
|
|
90
86
|
|
|
91
87
|
|
|
92
88
|
def get_tcpxo_deamon_entry() -> tuple[str, str]:
|
|
@@ -105,9 +101,13 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
|
|
|
105
101
|
)
|
|
106
102
|
|
|
107
103
|
|
|
108
|
-
def add_annotations(job_manifest, sub_networks):
|
|
104
|
+
def add_annotations(job_manifest: dict, sub_networks: list[str]):
|
|
109
105
|
"""Adds or updates annotations in the Pod template."""
|
|
110
|
-
|
|
106
|
+
metadata = job_manifest['spec']['template']['metadata']
|
|
107
|
+
annotations = metadata.get('annotations')
|
|
108
|
+
if annotations is None:
|
|
109
|
+
annotations = {}
|
|
110
|
+
metadata['annotations'] = annotations
|
|
111
111
|
tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
|
|
112
112
|
interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
|
|
113
113
|
annotations.update({
|
|
@@ -149,6 +149,7 @@ def add_tcpxo_daemon_container(job_manifest):
|
|
|
149
149
|
'name': 'tcpxo-daemon',
|
|
150
150
|
'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}',
|
|
151
151
|
'imagePullPolicy': 'Always',
|
|
152
|
+
'restartPolicy': 'Always',
|
|
152
153
|
'command': ['/bin/sh', '-c'],
|
|
153
154
|
'args': [
|
|
154
155
|
'set -ex\nchmod 755'
|
|
@@ -165,9 +166,9 @@ def add_tcpxo_daemon_container(job_manifest):
|
|
|
165
166
|
],
|
|
166
167
|
'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
|
|
167
168
|
}
|
|
168
|
-
job_manifest['spec']['template']['spec']
|
|
169
|
-
|
|
170
|
-
)
|
|
169
|
+
spec = job_manifest['spec']['template']['spec']
|
|
170
|
+
spec.setdefault('initContainers', [])
|
|
171
|
+
spec['initContainers'].append(tcpxo_daemon_container)
|
|
171
172
|
|
|
172
173
|
|
|
173
174
|
def update_gpu_containers(job_manifest):
|