xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +2 -3
- xpk/commands/cluster.py +225 -73
- xpk/commands/common.py +33 -1
- xpk/commands/kjob_common.py +10 -1
- xpk/commands/run.py +2 -3
- xpk/commands/storage.py +14 -3
- xpk/commands/workload.py +17 -15
- xpk/core/blueprint/blueprint_generator.py +18 -18
- xpk/core/cluster.py +119 -8
- xpk/core/config.py +1 -1
- xpk/core/filestore.py +2 -6
- xpk/core/gcsfuse.py +22 -4
- xpk/core/kjob.py +20 -13
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/network.py +23 -1
- xpk/core/pathways.py +1 -1
- xpk/core/resources.py +21 -0
- xpk/core/workload.py +1 -1
- xpk/core/workload_decorators/rdma_decorator.py +6 -10
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +15 -14
- xpk/parser/cluster.py +573 -389
- xpk/parser/storage.py +11 -2
- xpk/utils/kubectl.py +4 -1
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/METADATA +134 -91
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/RECORD +31 -29
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/core/network.py
CHANGED
|
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from ..utils.console import xpk_print
|
|
17
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
18
18
|
from ..utils.file import write_tmp_file
|
|
19
19
|
from .commands import run_command_for_value, run_command_with_updates
|
|
20
20
|
from .gcloud_context import zone_to_region
|
|
@@ -235,6 +235,28 @@ def create_cluster_network_config(args) -> int:
|
|
|
235
235
|
return 0
|
|
236
236
|
|
|
237
237
|
|
|
238
|
+
def get_cluster_subnetworks(args) -> list[str]:
  """Gets the list of cluster networks.

  Runs `kubectl get GKENetworkParamSet` and collects the first
  whitespace-separated field of every output row after the header line. When
  the command returns a non-zero code, a message is printed and xpk_exit is
  called with that code.

  Args:
    args: user provided arguments for running the command.

  Returns:
    list[str]: list of cluster networks; empty when the output has no rows
      after the header.
  """
  command = 'kubectl get GKENetworkParamSet'
  return_code, stdout = run_command_for_value(
      command, 'Get Cluster Networks', args
  )
  if return_code != 0:
    xpk_print('GKE Cluster Get NetworkParamSet failed')
    xpk_exit(return_code)

  # Drop the header row first, then ignore blank rows: ''.split()[0] would
  # raise IndexError on an empty or whitespace-only line.
  rows = stdout.splitlines()[1:]
  return [row.split()[0] for row in rows if row.strip()]
|
|
258
|
+
|
|
259
|
+
|
|
238
260
|
def set_up_cluster_network_for_a3(args) -> int:
|
|
239
261
|
"""Set up GKE Cluster networks, subnets and firewall rules for A3.
|
|
240
262
|
Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node.
|
xpk/core/pathways.py
CHANGED
|
@@ -211,7 +211,7 @@ def append_custom_pathways_worker(args) -> str:
|
|
|
211
211
|
"""
|
|
212
212
|
yaml = """"""
|
|
213
213
|
if args.server_image or args.custom_pathways_worker_args:
|
|
214
|
-
yaml = """- componentType:
|
|
214
|
+
yaml = """- componentType: worker"""
|
|
215
215
|
indentation = (
|
|
216
216
|
' ' * 8
|
|
217
217
|
) # Currently 8, based on the YAML, may need to update in the future.
|
xpk/core/resources.py
CHANGED
|
@@ -236,3 +236,24 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
|
|
|
236
236
|
return system
|
|
237
237
|
|
|
238
238
|
return None
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def get_cluster_capacity_type(args) -> CapacityType | None:
  """Get the CapacityType recorded in the cluster metadata configMap.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The CapacityType member whose name equals the upper-cased 'capacity_type'
    entry of the configMap, or None when the configMap or the entry is
    missing. An entry that names no CapacityType member raises KeyError.
  """
  cluster_config_map = get_cluster_configmap(
      args, f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
  )
  if cluster_config_map is None:
    return None

  capacity_value = cluster_config_map.get('capacity_type')
  if capacity_value is None:
    return None
  return CapacityType[capacity_value.upper()]
|
xpk/core/workload.py
CHANGED
|
@@ -131,7 +131,7 @@ def get_workload_list(args) -> tuple[int, str]:
|
|
|
131
131
|
)
|
|
132
132
|
workload_list_filter_job_cmd = determine_workload_list_filter_by_job(args)
|
|
133
133
|
command = (
|
|
134
|
-
f'kubectl get workloads -o=custom-columns="{s}" '
|
|
134
|
+
f'kubectl get workloads --ignore-not-found -o=custom-columns="{s}" '
|
|
135
135
|
f'{workload_list_filter_status_cmd} {workload_list_filter_job_cmd}'
|
|
136
136
|
)
|
|
137
137
|
|
|
@@ -68,16 +68,12 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
|
68
68
|
|
|
69
69
|
|
|
70
70
|
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
  """Builds the 'networking.gke.io/interfaces' annotation key and value.

  Args:
    sub_networks: network names; the network at position N is attached to
      interface ethN.

  Returns:
    The annotation key and a literal_string holding a JSON-style list with one
    interfaceName/network object per sub-network, one object per line.
  """
  rows = (
      f' {{"interfaceName":"eth{index}","network":"{name}"}}'
      for index, name in enumerate(sub_networks)
  )
  value = '[\n' + ',\n'.join(rows) + '\n]'
  return 'networking.gke.io/interfaces', literal_string(value)
|
|
81
77
|
|
|
82
78
|
|
|
83
79
|
def add_annotations(job_manifest: dict, sub_networks: list[str]):
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import yaml
|
|
18
|
+
|
|
19
|
+
from ...utils.yaml import literal_string
|
|
20
|
+
|
|
21
|
+
# Component version
|
|
22
|
+
tcpx = 'v2.0.11'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def decorate_kjob_template(job_manifest: dict) -> dict:
  """Applies the TCPX decoration steps to a kjob template manifest.

  The steps run in order: volumes, tolerations, daemon container, GPU
  container updates. No annotations are added here.

  Args:
    job_manifest: the manifest dict handed to each decoration step.

  Returns:
    The same job_manifest object that was passed in.
  """
  steps = (
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  )
  for step in steps:
    step(job_manifest)
  return job_manifest
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def decorate_job(job_manifest: dict) -> dict:
  """Applies the TCPX decoration steps to a Job manifest.

  The steps run in order: annotations, volumes, tolerations, daemon
  container, GPU container updates.

  Args:
    job_manifest: the manifest dict handed to each decoration step.

  Returns:
    The same job_manifest object that was passed in.
  """
  steps = (
      add_annotations,
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  )
  for step in steps:
    step(job_manifest)
  return job_manifest
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def decorate_jobset(jobset_manifest_str: str) -> str:
  """Decorates a JobSet manifest with the components needed for tcpx-daemon.

  Every entry of spec.replicatedJobs has its 'template' passed through
  decorate_job.

  Args:
    jobset_manifest_str: The JobSet manifest as a YAML string.

  Returns:
    The modified JobSet manifest as a YAML string, keys in their loaded order.
  """
  jobset = yaml.safe_load(jobset_manifest_str)
  for replicated_job in jobset['spec']['replicatedJobs']:
    # decorate_job returns the very dict it was given, so the loaded manifest
    # is already updated and the return value does not need to be stored.
    decorate_job(replicated_job['template'])
  return yaml.dump(jobset, sort_keys=False)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_interfaces_annotation() -> dict:
  """Returns the 'networking.gke.io/interfaces' annotation as a one-key dict.

  The value lists five interfaces, one JSON object per line: eth0 on the
  'default' network and eth1..eth4 on 'vpc1'..'vpc4'.
  """
  networks = ('default', 'vpc1', 'vpc2', 'vpc3', 'vpc4')
  entries = ',\n'.join(
      f' {{"interfaceName":"eth{index}","network":"{network}"}}'
      for index, network in enumerate(networks)
  )
  return {'networking.gke.io/interfaces': literal_string(f'[\n{entries}\n]')}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_tcpx_deamon_annotation() -> dict:
  """Returns the 'devices.gke.io/container.tcpx-daemon' annotation.

  The value is a literal_string with one '- path: /dev/<name>' line for each
  of nvidia0..nvidia7, followed by nvidiactl and nvidia-uvm.
  """
  device_names = [f'nvidia{index}' for index in range(8)]
  device_names += ['nvidiactl', 'nvidia-uvm']
  device_paths = ''.join(f'- path: /dev/{name}\n' for name in device_names)
  return {'devices.gke.io/container.tcpx-daemon': literal_string(device_paths)}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def add_annotations(job_manifest: dict):
  """Adds or updates the TCPX annotations in the Pod template.

  Walks spec.template.metadata.annotations, creating every level that is
  missing or explicitly None, then sets the tcpx-daemon devices annotation,
  the default interface (eth0) and the interfaces annotation. Annotations
  already present under other keys are kept.

  Args:
    job_manifest: the Job manifest dict; modified in place.
  """
  node = job_manifest
  for key in ('spec', 'template', 'metadata', 'annotations'):
    child = node.get(key)
    if child is None:
      # setdefault() would return an explicit None unchanged (e.g. a key
      # loaded from YAML with an empty value) and the update() calls below
      # would then fail with AttributeError.
      child = {}
      node[key] = child
    node = child

  annotations: dict = node
  annotations.update(get_tcpx_deamon_annotation())
  annotations.update({'networking.gke.io/default-interface': 'eth0'})
  annotations.update(get_interfaces_annotation())
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def add_tolerations(job_manifest: dict):
  """Adds the 'user-workload' NoSchedule toleration to the Pod spec.

  Walks spec.template.spec.tolerations, creating every level that is missing
  or explicitly None. The toleration is appended only when an identical entry
  is not already in the list, so calling this twice does not duplicate it.

  Args:
    job_manifest: the Job manifest dict; modified in place.
  """
  pod_spec = job_manifest
  for key in ('spec', 'template', 'spec'):
    child = pod_spec.get(key)
    if child is None:
      # A key that is present with a None value must be replaced, not reused.
      child = {}
      pod_spec[key] = child
    pod_spec = child

  tolerations = pod_spec.get('tolerations')
  if tolerations is None:
    tolerations = []
    pod_spec['tolerations'] = tolerations

  user_workload = {
      'key': 'user-workload',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  }
  if user_workload not in tolerations:
    tolerations.append(user_workload)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def add_volumes(job_manifest: dict):
  """Adds the volumes used by the tcpx-daemon and GPU containers.

  Walks spec.template.spec.volumes, creating every level that is missing or
  explicitly None, then appends the 'libraries', 'sys', 'proc-sys' and
  'tcpx-socket' volumes. A volume is skipped when the list already holds one
  with the same name, so existing definitions win and repeated calls do not
  produce duplicate names.

  Args:
    job_manifest: the Job manifest dict; modified in place.
  """
  pod_spec = job_manifest
  for key in ('spec', 'template', 'spec'):
    child = pod_spec.get(key)
    if child is None:
      # A key that is present with a None value must be replaced, not reused.
      child = {}
      pod_spec[key] = child
    pod_spec = child

  volumes = pod_spec.get('volumes')
  if volumes is None:
    volumes = []
    pod_spec['volumes'] = volumes

  required_volumes = [
      {
          'name': 'libraries',
          'hostPath': {'path': '/home/kubernetes/bin/nvidia/lib64'},
      },
      {'name': 'sys', 'hostPath': {'path': '/sys'}},
      {'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}},
      # The daemon container and the GPU containers both mount 'tcpx-socket',
      # so the Pod needs a volume of that name to back those mounts.
      {'name': 'tcpx-socket', 'emptyDir': {}},
  ]
  present = {
      volume.get('name') for volume in volumes if isinstance(volume, dict)
  }
  for volume in required_volumes:
    if volume['name'] not in present:
      volumes.append(volume)
      present.add(volume['name'])
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def add_tcpxo_daemon_container(job_manifest):
  """Adds the tcpx-daemon container to the Pod spec's initContainers.

  The container runs the tcpgpudmarxd image at the version held in the
  module-level `tcpx` constant. spec.template.spec must already exist in the
  manifest; its 'initContainers' list is created when missing or None.

  Args:
    job_manifest: the Job manifest dict; modified in place.
  """
  tcpxo_daemon_container = {
      'name': 'tcpx-daemon',
      'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{tcpx}',
      'imagePullPolicy': 'Always',
      # NOTE(review): restartPolicy on an init container presumably makes it
      # a long-running sidecar - confirm against the target cluster version.
      'restartPolicy': 'Always',
      'command': [
          '/tcpgpudmarxd/build/app/tcpgpudmarxd',
          '--gpu_nic_preset',
          'a3vm',
          '--gpu_shmem_type',
          'fd',
          '--uds_path',
          '/run/tcpx',
          '--setup_param',
          '"--verbose 128 2 0 "',
      ],
      'securityContext': {'capabilities': {'add': ['NET_ADMIN']}},
      'volumeMounts': [
          {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'},
          {'name': 'tcpx-socket', 'mountPath': '/run/tcpx'},
          {'name': 'sys', 'mountPath': '/hostsysfs'},
          {'name': 'proc-sys', 'mountPath': '/hostprocsysfs'},
      ],
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
  }
  spec = job_manifest['spec']['template']['spec']
  init_containers = spec.get('initContainers')
  if init_containers is None:
    # setdefault() would keep an explicit None (key present with no value)
    # and the append below would fail with AttributeError.
    init_containers = []
    spec['initContainers'] = init_containers
  init_containers.append(tcpxo_daemon_container)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def update_gpu_containers(job_manifest):
  """Adds the TCPX environment and mounts to every GPU container.

  A container counts as a GPU container when 'nvidia.com/gpu' is a key of its
  resources.limits. Each such container gets LD_LIBRARY_PATH appended to its
  env and the 'tcpx-socket' (at /tmp) and 'libraries' volume mounts appended
  to its volumeMounts. Other containers are left untouched.

  Args:
    job_manifest: the Job manifest dict; spec.template.spec.containers must
      exist. Modified in place.
  """
  for container in job_manifest['spec']['template']['spec']['containers']:
    # `or {}` covers 'resources' / 'limits' keys that are present but None,
    # which .get(key, {}) would pass through and then fail on.
    limits = (container.get('resources') or {}).get('limits') or {}
    if 'nvidia.com/gpu' not in limits:
      continue

    env = container.get('env')
    if env is None:
      env = []
      container['env'] = env
    env.append({'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'})

    volume_mounts = container.get('volumeMounts')
    if volume_mounts is None:
      volume_mounts = []
      container['volumeMounts'] = volume_mounts
    volume_mounts.append({'name': 'tcpx-socket', 'mountPath': '/tmp'})
    volume_mounts.append(
        {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'}
    )
|
|
@@ -77,16 +77,12 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
|
77
77
|
|
|
78
78
|
|
|
79
79
|
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
  """Builds the 'networking.gke.io/interfaces' annotation key and value.

  Args:
    sub_networks: network names; the network at position N is attached to
      interface ethN.

  Returns:
    The annotation key and a literal_string holding a JSON-style list with one
    interfaceName/network object per sub-network, one object per line.
  """
  lines = []
  for position, network_name in enumerate(sub_networks):
    lines.append(
        f' {{"interfaceName":"eth{position}","network":"{network_name}"}}'
    )
  joined = ',\n'.join(lines)
  return 'networking.gke.io/interfaces', literal_string(f'[\n{joined}\n]')
|
|
90
86
|
|
|
91
87
|
|
|
92
88
|
def get_tcpxo_deamon_entry() -> tuple[str, str]:
|
|
@@ -107,7 +103,11 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
|
|
|
107
103
|
|
|
108
104
|
def add_annotations(job_manifest: dict, sub_networks: list[str]):
|
|
109
105
|
"""Adds or updates annotations in the Pod template."""
|
|
110
|
-
|
|
106
|
+
metadata = job_manifest['spec']['template']['metadata']
|
|
107
|
+
annotations = metadata.get('annotations')
|
|
108
|
+
if annotations is None:
|
|
109
|
+
annotations = {}
|
|
110
|
+
metadata['annotations'] = annotations
|
|
111
111
|
tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
|
|
112
112
|
interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
|
|
113
113
|
annotations.update({
|
|
@@ -149,6 +149,7 @@ def add_tcpxo_daemon_container(job_manifest):
|
|
|
149
149
|
'name': 'tcpxo-daemon',
|
|
150
150
|
'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}',
|
|
151
151
|
'imagePullPolicy': 'Always',
|
|
152
|
+
'restartPolicy': 'Always',
|
|
152
153
|
'command': ['/bin/sh', '-c'],
|
|
153
154
|
'args': [
|
|
154
155
|
'set -ex\nchmod 755'
|
|
@@ -165,9 +166,9 @@ def add_tcpxo_daemon_container(job_manifest):
|
|
|
165
166
|
],
|
|
166
167
|
'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
|
|
167
168
|
}
|
|
168
|
-
job_manifest['spec']['template']['spec']
|
|
169
|
-
|
|
170
|
-
)
|
|
169
|
+
spec = job_manifest['spec']['template']['spec']
|
|
170
|
+
spec.setdefault('initContainers', [])
|
|
171
|
+
spec['initContainers'].append(tcpxo_daemon_container)
|
|
171
172
|
|
|
172
173
|
|
|
173
174
|
def update_gpu_containers(job_manifest):
|