xpk 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +8 -8
- xpk/commands/cluster.py +19 -19
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +7 -3
- xpk/commands/info.py +12 -12
- xpk/commands/inspector.py +1 -1
- xpk/commands/job.py +42 -12
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +6 -3
- xpk/commands/workload.py +28 -15
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +3 -1
- xpk/core/cluster.py +14 -8
- xpk/core/cluster_private.py +8 -2
- xpk/core/commands.py +13 -10
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_image.py +14 -5
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +10 -5
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +7 -3
- xpk/core/kueue.py +28 -8
- xpk/core/nap.py +5 -5
- xpk/core/network.py +1 -1
- xpk/core/nodepool.py +8 -3
- xpk/core/nodepool_test.py +82 -0
- xpk/core/pathways.py +6 -2
- xpk/core/ray.py +1 -1
- xpk/core/resources.py +18 -14
- xpk/core/scheduling.py +4 -0
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +12 -10
- xpk/parser/cluster.py +110 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- xpk/utils/execution_context.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/network.py +4 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/METADATA +4 -1
- xpk-0.13.0.dist-info/RECORD +101 -0
- xpk-0.11.0.dist-info/RECORD +0 -95
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import copy
|
|
18
|
+
|
|
19
|
+
import yaml
|
|
20
|
+
|
|
21
|
+
from xpk.core.workload_decorators import tcpx_decorator
|
|
22
|
+
from xpk.utils.yaml import literal_string
|
|
23
|
+
|
|
24
|
+
# Minimal JobSet manifest for testing
|
|
25
|
+
BASE_JOBSET_MANIFEST_STR = """
|
|
26
|
+
apiVersion: jobset.x-k8s.io/v1alpha2
|
|
27
|
+
kind: JobSet
|
|
28
|
+
metadata:
|
|
29
|
+
name: test-jobset
|
|
30
|
+
spec:
|
|
31
|
+
replicatedJobs:
|
|
32
|
+
- name: slice-job
|
|
33
|
+
template:
|
|
34
|
+
spec:
|
|
35
|
+
template:
|
|
36
|
+
metadata:
|
|
37
|
+
annotations:
|
|
38
|
+
existing-annotation: "true"
|
|
39
|
+
spec:
|
|
40
|
+
containers:
|
|
41
|
+
- name: main-gpu-container
|
|
42
|
+
image: my-gpu-image
|
|
43
|
+
resources:
|
|
44
|
+
limits:
|
|
45
|
+
nvidia.com/gpu: 8
|
|
46
|
+
- name: sidecar-container
|
|
47
|
+
image: my-sidecar-image
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
# Minimal kjob template for testing
|
|
51
|
+
BASE_KJOB_TEMPLATE = {
|
|
52
|
+
"spec": {
|
|
53
|
+
"template": {
|
|
54
|
+
"spec": {
|
|
55
|
+
"containers": [
|
|
56
|
+
{
|
|
57
|
+
"name": "main-gpu-container",
|
|
58
|
+
"image": "my-gpu-image",
|
|
59
|
+
"resources": {"limits": {"nvidia.com/gpu": 8}},
|
|
60
|
+
},
|
|
61
|
+
{"name": "sidecar-container", "image": "my-sidecar-image"},
|
|
62
|
+
]
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
# Minimal job manifest for testing
|
|
69
|
+
BASE_JOB_MANIFEST = {
|
|
70
|
+
"spec": {
|
|
71
|
+
"template": {
|
|
72
|
+
"metadata": {"annotations": {"existing-annotation": "true"}},
|
|
73
|
+
"spec": {
|
|
74
|
+
"containers": [
|
|
75
|
+
{
|
|
76
|
+
"name": "main-gpu-container",
|
|
77
|
+
"image": "my-gpu-image",
|
|
78
|
+
"resources": {"limits": {"nvidia.com/gpu": 8}},
|
|
79
|
+
},
|
|
80
|
+
{"name": "sidecar-container", "image": "my-sidecar-image"},
|
|
81
|
+
]
|
|
82
|
+
},
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_get_interfaces_annotation():
|
|
89
|
+
"""Tests get_interfaces_annotation."""
|
|
90
|
+
annotation = tcpx_decorator.get_interfaces_annotation()
|
|
91
|
+
assert "networking.gke.io/interfaces" in annotation
|
|
92
|
+
assert isinstance(annotation["networking.gke.io/interfaces"], literal_string)
|
|
93
|
+
expected_value = (
|
|
94
|
+
"[\n"
|
|
95
|
+
' {"interfaceName":"eth0","network":"default"},\n'
|
|
96
|
+
' {"interfaceName":"eth1","network":"vpc1"},\n'
|
|
97
|
+
' {"interfaceName":"eth2","network":"vpc2"},\n'
|
|
98
|
+
' {"interfaceName":"eth3","network":"vpc3"},\n'
|
|
99
|
+
' {"interfaceName":"eth4","network":"vpc4"}\n'
|
|
100
|
+
"]"
|
|
101
|
+
)
|
|
102
|
+
assert str(annotation["networking.gke.io/interfaces"]) == expected_value
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_get_tcpx_deamon_annotation():
|
|
106
|
+
"""Tests get_tcpx_deamon_annotation."""
|
|
107
|
+
annotation = tcpx_decorator.get_tcpx_deamon_annotation()
|
|
108
|
+
assert "devices.gke.io/container.tcpx-daemon" in annotation
|
|
109
|
+
assert isinstance(
|
|
110
|
+
annotation["devices.gke.io/container.tcpx-daemon"], literal_string
|
|
111
|
+
)
|
|
112
|
+
expected_value = (
|
|
113
|
+
"- path: /dev/nvidia0\n"
|
|
114
|
+
"- path: /dev/nvidia1\n"
|
|
115
|
+
"- path: /dev/nvidia2\n"
|
|
116
|
+
"- path: /dev/nvidia3\n"
|
|
117
|
+
"- path: /dev/nvidia4\n"
|
|
118
|
+
"- path: /dev/nvidia5\n"
|
|
119
|
+
"- path: /dev/nvidia6\n"
|
|
120
|
+
"- path: /dev/nvidia7\n"
|
|
121
|
+
"- path: /dev/nvidiactl\n"
|
|
122
|
+
"- path: /dev/nvidia-uvm\n"
|
|
123
|
+
)
|
|
124
|
+
assert (
|
|
125
|
+
str(annotation["devices.gke.io/container.tcpx-daemon"]) == expected_value
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def test_decorate_jobset():
|
|
130
|
+
"""Tests decorate_jobset."""
|
|
131
|
+
decorated_str = tcpx_decorator.decorate_jobset(BASE_JOBSET_MANIFEST_STR)
|
|
132
|
+
manifest = yaml.safe_load(decorated_str)
|
|
133
|
+
|
|
134
|
+
pod_template_spec = manifest["spec"]["replicatedJobs"][0]["template"]["spec"][
|
|
135
|
+
"template"
|
|
136
|
+
]["spec"]
|
|
137
|
+
pod_template_metadata = manifest["spec"]["replicatedJobs"][0]["template"][
|
|
138
|
+
"spec"
|
|
139
|
+
]["template"]["metadata"]
|
|
140
|
+
|
|
141
|
+
# Check annotations
|
|
142
|
+
annotations = pod_template_metadata["annotations"]
|
|
143
|
+
assert "existing-annotation" in annotations
|
|
144
|
+
assert "devices.gke.io/container.tcpx-daemon" in annotations
|
|
145
|
+
assert "networking.gke.io/default-interface" in annotations
|
|
146
|
+
assert "networking.gke.io/interfaces" in annotations
|
|
147
|
+
|
|
148
|
+
# Check tolerations
|
|
149
|
+
tolerations = pod_template_spec["tolerations"]
|
|
150
|
+
assert {
|
|
151
|
+
"key": "user-workload",
|
|
152
|
+
"operator": "Equal",
|
|
153
|
+
"value": "true",
|
|
154
|
+
"effect": "NoSchedule",
|
|
155
|
+
} in tolerations
|
|
156
|
+
|
|
157
|
+
# Check volumes
|
|
158
|
+
volumes = pod_template_spec["volumes"]
|
|
159
|
+
volume_names = {v["name"] for v in volumes}
|
|
160
|
+
assert "libraries" in volume_names
|
|
161
|
+
assert "sys" in volume_names
|
|
162
|
+
assert "proc-sys" in volume_names
|
|
163
|
+
assert "tcpx-socket" in volume_names
|
|
164
|
+
assert "dshm" in volume_names
|
|
165
|
+
|
|
166
|
+
# Check init container
|
|
167
|
+
init_containers = pod_template_spec["initContainers"]
|
|
168
|
+
assert len(init_containers) == 1
|
|
169
|
+
tcpx_daemon = init_containers[0]
|
|
170
|
+
assert tcpx_daemon["name"] == "tcpx-daemon"
|
|
171
|
+
assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
|
|
172
|
+
|
|
173
|
+
# Check GPU container update
|
|
174
|
+
gpu_container = pod_template_spec["containers"][0]
|
|
175
|
+
assert gpu_container["name"] == "main-gpu-container"
|
|
176
|
+
|
|
177
|
+
# Check env
|
|
178
|
+
env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
|
|
179
|
+
assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
|
|
180
|
+
|
|
181
|
+
# Check volume mounts
|
|
182
|
+
volume_mounts = {
|
|
183
|
+
vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
|
|
184
|
+
}
|
|
185
|
+
assert volume_mounts["tcpx-socket"] == "/tmp"
|
|
186
|
+
assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
|
|
187
|
+
assert volume_mounts["dshm"] == "/dev/shm"
|
|
188
|
+
|
|
189
|
+
# Check non-GPU container is not updated
|
|
190
|
+
sidecar_container = pod_template_spec["containers"][1]
|
|
191
|
+
assert "env" not in sidecar_container
|
|
192
|
+
assert "volumeMounts" not in sidecar_container
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def test_decorate_job():
|
|
196
|
+
"""Tests decorate_job."""
|
|
197
|
+
job_manifest = copy.deepcopy(BASE_JOB_MANIFEST)
|
|
198
|
+
|
|
199
|
+
decorated_manifest = tcpx_decorator.decorate_job(job_manifest)
|
|
200
|
+
pod_template_metadata = decorated_manifest["spec"]["template"]["metadata"]
|
|
201
|
+
|
|
202
|
+
# Check annotations
|
|
203
|
+
annotations = pod_template_metadata["annotations"]
|
|
204
|
+
assert "existing-annotation" in annotations
|
|
205
|
+
assert "devices.gke.io/container.tcpx-daemon" in annotations
|
|
206
|
+
assert "networking.gke.io/default-interface" in annotations
|
|
207
|
+
assert "networking.gke.io/interfaces" in annotations
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def test_decorate_kjob_template():
|
|
211
|
+
"""Tests decorate_kjob_template."""
|
|
212
|
+
kjob_template = copy.deepcopy(BASE_KJOB_TEMPLATE)
|
|
213
|
+
|
|
214
|
+
decorated_manifest = tcpx_decorator.decorate_kjob_template(kjob_template)
|
|
215
|
+
|
|
216
|
+
pod_template_spec = decorated_manifest["spec"]["template"]["spec"]
|
|
217
|
+
|
|
218
|
+
# Check annotations are NOT added
|
|
219
|
+
assert "annotations" not in decorated_manifest["spec"]["template"].get(
|
|
220
|
+
"metadata", {}
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
# Check tolerations
|
|
224
|
+
tolerations = pod_template_spec["tolerations"]
|
|
225
|
+
assert {
|
|
226
|
+
"key": "user-workload",
|
|
227
|
+
"operator": "Equal",
|
|
228
|
+
"value": "true",
|
|
229
|
+
"effect": "NoSchedule",
|
|
230
|
+
} in tolerations
|
|
231
|
+
|
|
232
|
+
# Check volumes
|
|
233
|
+
volumes = pod_template_spec["volumes"]
|
|
234
|
+
volume_names = {v["name"] for v in volumes}
|
|
235
|
+
assert "libraries" in volume_names
|
|
236
|
+
assert "sys" in volume_names
|
|
237
|
+
assert "proc-sys" in volume_names
|
|
238
|
+
assert "tcpx-socket" in volume_names
|
|
239
|
+
assert "dshm" in volume_names
|
|
240
|
+
|
|
241
|
+
# Check init container
|
|
242
|
+
init_containers = pod_template_spec["initContainers"]
|
|
243
|
+
assert len(init_containers) == 1
|
|
244
|
+
tcpx_daemon = init_containers[0]
|
|
245
|
+
assert tcpx_daemon["name"] == "tcpx-daemon"
|
|
246
|
+
assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
|
|
247
|
+
|
|
248
|
+
# Check GPU container update
|
|
249
|
+
gpu_container = pod_template_spec["containers"][0]
|
|
250
|
+
assert gpu_container["name"] == "main-gpu-container"
|
|
251
|
+
|
|
252
|
+
# Check env
|
|
253
|
+
env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
|
|
254
|
+
assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
|
|
255
|
+
|
|
256
|
+
# Check volume mounts
|
|
257
|
+
volume_mounts = {
|
|
258
|
+
vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
|
|
259
|
+
}
|
|
260
|
+
assert volume_mounts["tcpx-socket"] == "/tmp"
|
|
261
|
+
assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
|
|
262
|
+
assert volume_mounts["dshm"] == "/dev/shm"
|
|
263
|
+
|
|
264
|
+
# Check non-GPU container is not updated
|
|
265
|
+
sidecar_container = pod_template_spec["containers"][1]
|
|
266
|
+
assert "env" not in sidecar_container
|
|
267
|
+
assert "volumeMounts" not in sidecar_container
|
|
@@ -74,7 +74,8 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
|
74
74
|
for job in manifest['spec']['replicatedJobs']:
|
|
75
75
|
job_manifest = job['template']
|
|
76
76
|
job_manifest = decorate_job(job_manifest, sub_networks)
|
|
77
|
-
|
|
77
|
+
yaml_result: str = yaml.dump(manifest, sort_keys=False)
|
|
78
|
+
return yaml_result
|
|
78
79
|
|
|
79
80
|
|
|
80
81
|
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from xpk.core.workload import get_jobsets_list_gcp_link
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_get_jobsets_list_gcp_link():
|
|
21
|
+
result = get_jobsets_list_gcp_link(
|
|
22
|
+
project='test-project',
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
assert (
|
|
26
|
+
result
|
|
27
|
+
== 'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project=test-project'
|
|
28
|
+
)
|
xpk/main.py
CHANGED
|
@@ -37,6 +37,7 @@ import sys
|
|
|
37
37
|
from .parser.core import set_parser
|
|
38
38
|
from .utils.console import xpk_print
|
|
39
39
|
from .utils.validation import validate_dependencies
|
|
40
|
+
from .utils.execution_context import set_dry_run
|
|
40
41
|
################### Compatibility Check ###################
|
|
41
42
|
# Check that the user runs the below version or greater.
|
|
42
43
|
|
|
@@ -56,18 +57,19 @@ if (
|
|
|
56
57
|
f' User currently is running {user_major_version}.{user_minor_version}'
|
|
57
58
|
)
|
|
58
59
|
|
|
59
|
-
# Create top level parser for xpk command.
|
|
60
|
-
parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
|
|
61
|
-
set_parser(parser=parser)
|
|
62
|
-
|
|
63
|
-
xpk_print('Starting xpk', flush=True)
|
|
64
|
-
validate_dependencies()
|
|
65
|
-
main_args = parser.parse_args()
|
|
66
|
-
main_args.enable_ray_cluster = False
|
|
67
|
-
main_args.func(main_args)
|
|
68
|
-
|
|
69
60
|
|
|
70
61
|
def main() -> None:
|
|
62
|
+
# Create top level parser for xpk command.
|
|
63
|
+
parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
|
|
64
|
+
set_parser(parser=parser)
|
|
65
|
+
|
|
66
|
+
xpk_print('Starting xpk', flush=True)
|
|
67
|
+
main_args = parser.parse_args()
|
|
68
|
+
main_args.enable_ray_cluster = False
|
|
69
|
+
set_dry_run('dry_run' in main_args and main_args.dry_run)
|
|
70
|
+
if not main_args.dry_run:
|
|
71
|
+
validate_dependencies()
|
|
72
|
+
main_args.func(main_args)
|
|
71
73
|
xpk_print('XPK Done.', flush=True)
|
|
72
74
|
|
|
73
75
|
|