xpk 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. xpk/commands/batch.py +8 -8
  2. xpk/commands/cluster.py +19 -19
  3. xpk/commands/cluster_gcluster.py +2 -1
  4. xpk/commands/common.py +7 -3
  5. xpk/commands/info.py +12 -12
  6. xpk/commands/inspector.py +1 -1
  7. xpk/commands/job.py +42 -12
  8. xpk/commands/kjob_common.py +2 -1
  9. xpk/commands/storage.py +6 -3
  10. xpk/commands/workload.py +28 -15
  11. xpk/core/blueprint/blueprint_generator.py +7 -7
  12. xpk/core/blueprint/blueprint_test.py +218 -0
  13. xpk/core/capacity.py +3 -1
  14. xpk/core/cluster.py +14 -8
  15. xpk/core/cluster_private.py +8 -2
  16. xpk/core/commands.py +13 -10
  17. xpk/core/config.py +3 -4
  18. xpk/core/config_test.py +71 -0
  19. xpk/core/docker_image.py +14 -5
  20. xpk/core/docker_manager.py +1 -1
  21. xpk/core/docker_resources.py +10 -5
  22. xpk/core/filestore.py +7 -2
  23. xpk/core/gcloud_context.py +2 -2
  24. xpk/core/jobset.py +1 -1
  25. xpk/core/kjob.py +7 -3
  26. xpk/core/kueue.py +28 -8
  27. xpk/core/nap.py +5 -5
  28. xpk/core/network.py +1 -1
  29. xpk/core/nodepool.py +8 -3
  30. xpk/core/nodepool_test.py +82 -0
  31. xpk/core/pathways.py +6 -2
  32. xpk/core/ray.py +1 -1
  33. xpk/core/resources.py +18 -14
  34. xpk/core/scheduling.py +4 -0
  35. xpk/core/storage.py +14 -14
  36. xpk/core/system_characteristics.py +1 -1
  37. xpk/core/workload.py +11 -0
  38. xpk/core/workload_decorators/rdma_decorator.py +3 -2
  39. xpk/core/workload_decorators/storage_decorator.py +2 -1
  40. xpk/core/workload_decorators/tcpx_decorator.py +4 -2
  41. xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
  42. xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
  43. xpk/core/workload_test.py +28 -0
  44. xpk/main.py +12 -10
  45. xpk/parser/cluster.py +110 -49
  46. xpk/parser/common.py +45 -36
  47. xpk/parser/storage.py +12 -13
  48. xpk/parser/workload.py +57 -39
  49. xpk/utils/console.py +2 -1
  50. xpk/utils/execution_context.py +28 -0
  51. xpk/utils/file.py +25 -10
  52. xpk/utils/network.py +4 -0
  53. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/METADATA +4 -1
  54. xpk-0.13.0.dist-info/RECORD +101 -0
  55. xpk-0.11.0.dist-info/RECORD +0 -95
  56. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
  57. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
  58. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
  59. {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,267 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import copy
18
+
19
+ import yaml
20
+
21
+ from xpk.core.workload_decorators import tcpx_decorator
22
+ from xpk.utils.yaml import literal_string
23
+
24
+ # Minimal JobSet manifest for testing
25
+ BASE_JOBSET_MANIFEST_STR = """
26
+ apiVersion: jobset.x-k8s.io/v1alpha2
27
+ kind: JobSet
28
+ metadata:
29
+ name: test-jobset
30
+ spec:
31
+ replicatedJobs:
32
+ - name: slice-job
33
+ template:
34
+ spec:
35
+ template:
36
+ metadata:
37
+ annotations:
38
+ existing-annotation: "true"
39
+ spec:
40
+ containers:
41
+ - name: main-gpu-container
42
+ image: my-gpu-image
43
+ resources:
44
+ limits:
45
+ nvidia.com/gpu: 8
46
+ - name: sidecar-container
47
+ image: my-sidecar-image
48
+ """
49
+
50
+ # Minimal kjob template for testing
51
+ BASE_KJOB_TEMPLATE = {
52
+ "spec": {
53
+ "template": {
54
+ "spec": {
55
+ "containers": [
56
+ {
57
+ "name": "main-gpu-container",
58
+ "image": "my-gpu-image",
59
+ "resources": {"limits": {"nvidia.com/gpu": 8}},
60
+ },
61
+ {"name": "sidecar-container", "image": "my-sidecar-image"},
62
+ ]
63
+ }
64
+ }
65
+ }
66
+ }
67
+
68
+ # Minimal job manifest for testing
69
+ BASE_JOB_MANIFEST = {
70
+ "spec": {
71
+ "template": {
72
+ "metadata": {"annotations": {"existing-annotation": "true"}},
73
+ "spec": {
74
+ "containers": [
75
+ {
76
+ "name": "main-gpu-container",
77
+ "image": "my-gpu-image",
78
+ "resources": {"limits": {"nvidia.com/gpu": 8}},
79
+ },
80
+ {"name": "sidecar-container", "image": "my-sidecar-image"},
81
+ ]
82
+ },
83
+ }
84
+ }
85
+ }
86
+
87
+
88
+ def test_get_interfaces_annotation():
89
+ """Tests get_interfaces_annotation."""
90
+ annotation = tcpx_decorator.get_interfaces_annotation()
91
+ assert "networking.gke.io/interfaces" in annotation
92
+ assert isinstance(annotation["networking.gke.io/interfaces"], literal_string)
93
+ expected_value = (
94
+ "[\n"
95
+ ' {"interfaceName":"eth0","network":"default"},\n'
96
+ ' {"interfaceName":"eth1","network":"vpc1"},\n'
97
+ ' {"interfaceName":"eth2","network":"vpc2"},\n'
98
+ ' {"interfaceName":"eth3","network":"vpc3"},\n'
99
+ ' {"interfaceName":"eth4","network":"vpc4"}\n'
100
+ "]"
101
+ )
102
+ assert str(annotation["networking.gke.io/interfaces"]) == expected_value
103
+
104
+
105
+ def test_get_tcpx_deamon_annotation():
106
+ """Tests get_tcpx_deamon_annotation."""
107
+ annotation = tcpx_decorator.get_tcpx_deamon_annotation()
108
+ assert "devices.gke.io/container.tcpx-daemon" in annotation
109
+ assert isinstance(
110
+ annotation["devices.gke.io/container.tcpx-daemon"], literal_string
111
+ )
112
+ expected_value = (
113
+ "- path: /dev/nvidia0\n"
114
+ "- path: /dev/nvidia1\n"
115
+ "- path: /dev/nvidia2\n"
116
+ "- path: /dev/nvidia3\n"
117
+ "- path: /dev/nvidia4\n"
118
+ "- path: /dev/nvidia5\n"
119
+ "- path: /dev/nvidia6\n"
120
+ "- path: /dev/nvidia7\n"
121
+ "- path: /dev/nvidiactl\n"
122
+ "- path: /dev/nvidia-uvm\n"
123
+ )
124
+ assert (
125
+ str(annotation["devices.gke.io/container.tcpx-daemon"]) == expected_value
126
+ )
127
+
128
+
129
+ def test_decorate_jobset():
130
+ """Tests decorate_jobset."""
131
+ decorated_str = tcpx_decorator.decorate_jobset(BASE_JOBSET_MANIFEST_STR)
132
+ manifest = yaml.safe_load(decorated_str)
133
+
134
+ pod_template_spec = manifest["spec"]["replicatedJobs"][0]["template"]["spec"][
135
+ "template"
136
+ ]["spec"]
137
+ pod_template_metadata = manifest["spec"]["replicatedJobs"][0]["template"][
138
+ "spec"
139
+ ]["template"]["metadata"]
140
+
141
+ # Check annotations
142
+ annotations = pod_template_metadata["annotations"]
143
+ assert "existing-annotation" in annotations
144
+ assert "devices.gke.io/container.tcpx-daemon" in annotations
145
+ assert "networking.gke.io/default-interface" in annotations
146
+ assert "networking.gke.io/interfaces" in annotations
147
+
148
+ # Check tolerations
149
+ tolerations = pod_template_spec["tolerations"]
150
+ assert {
151
+ "key": "user-workload",
152
+ "operator": "Equal",
153
+ "value": "true",
154
+ "effect": "NoSchedule",
155
+ } in tolerations
156
+
157
+ # Check volumes
158
+ volumes = pod_template_spec["volumes"]
159
+ volume_names = {v["name"] for v in volumes}
160
+ assert "libraries" in volume_names
161
+ assert "sys" in volume_names
162
+ assert "proc-sys" in volume_names
163
+ assert "tcpx-socket" in volume_names
164
+ assert "dshm" in volume_names
165
+
166
+ # Check init container
167
+ init_containers = pod_template_spec["initContainers"]
168
+ assert len(init_containers) == 1
169
+ tcpx_daemon = init_containers[0]
170
+ assert tcpx_daemon["name"] == "tcpx-daemon"
171
+ assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
172
+
173
+ # Check GPU container update
174
+ gpu_container = pod_template_spec["containers"][0]
175
+ assert gpu_container["name"] == "main-gpu-container"
176
+
177
+ # Check env
178
+ env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
179
+ assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
180
+
181
+ # Check volume mounts
182
+ volume_mounts = {
183
+ vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
184
+ }
185
+ assert volume_mounts["tcpx-socket"] == "/tmp"
186
+ assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
187
+ assert volume_mounts["dshm"] == "/dev/shm"
188
+
189
+ # Check non-GPU container is not updated
190
+ sidecar_container = pod_template_spec["containers"][1]
191
+ assert "env" not in sidecar_container
192
+ assert "volumeMounts" not in sidecar_container
193
+
194
+
195
+ def test_decorate_job():
196
+ """Tests decorate_job."""
197
+ job_manifest = copy.deepcopy(BASE_JOB_MANIFEST)
198
+
199
+ decorated_manifest = tcpx_decorator.decorate_job(job_manifest)
200
+ pod_template_metadata = decorated_manifest["spec"]["template"]["metadata"]
201
+
202
+ # Check annotations
203
+ annotations = pod_template_metadata["annotations"]
204
+ assert "existing-annotation" in annotations
205
+ assert "devices.gke.io/container.tcpx-daemon" in annotations
206
+ assert "networking.gke.io/default-interface" in annotations
207
+ assert "networking.gke.io/interfaces" in annotations
208
+
209
+
210
+ def test_decorate_kjob_template():
211
+ """Tests decorate_kjob_template."""
212
+ kjob_template = copy.deepcopy(BASE_KJOB_TEMPLATE)
213
+
214
+ decorated_manifest = tcpx_decorator.decorate_kjob_template(kjob_template)
215
+
216
+ pod_template_spec = decorated_manifest["spec"]["template"]["spec"]
217
+
218
+ # Check annotations are NOT added
219
+ assert "annotations" not in decorated_manifest["spec"]["template"].get(
220
+ "metadata", {}
221
+ )
222
+
223
+ # Check tolerations
224
+ tolerations = pod_template_spec["tolerations"]
225
+ assert {
226
+ "key": "user-workload",
227
+ "operator": "Equal",
228
+ "value": "true",
229
+ "effect": "NoSchedule",
230
+ } in tolerations
231
+
232
+ # Check volumes
233
+ volumes = pod_template_spec["volumes"]
234
+ volume_names = {v["name"] for v in volumes}
235
+ assert "libraries" in volume_names
236
+ assert "sys" in volume_names
237
+ assert "proc-sys" in volume_names
238
+ assert "tcpx-socket" in volume_names
239
+ assert "dshm" in volume_names
240
+
241
+ # Check init container
242
+ init_containers = pod_template_spec["initContainers"]
243
+ assert len(init_containers) == 1
244
+ tcpx_daemon = init_containers[0]
245
+ assert tcpx_daemon["name"] == "tcpx-daemon"
246
+ assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
247
+
248
+ # Check GPU container update
249
+ gpu_container = pod_template_spec["containers"][0]
250
+ assert gpu_container["name"] == "main-gpu-container"
251
+
252
+ # Check env
253
+ env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
254
+ assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
255
+
256
+ # Check volume mounts
257
+ volume_mounts = {
258
+ vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
259
+ }
260
+ assert volume_mounts["tcpx-socket"] == "/tmp"
261
+ assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
262
+ assert volume_mounts["dshm"] == "/dev/shm"
263
+
264
+ # Check non-GPU container is not updated
265
+ sidecar_container = pod_template_spec["containers"][1]
266
+ assert "env" not in sidecar_container
267
+ assert "volumeMounts" not in sidecar_container
@@ -74,7 +74,8 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
74
74
  for job in manifest['spec']['replicatedJobs']:
75
75
  job_manifest = job['template']
76
76
  job_manifest = decorate_job(job_manifest, sub_networks)
77
- return yaml.dump(manifest, sort_keys=False)
77
+ yaml_result: str = yaml.dump(manifest, sort_keys=False)
78
+ return yaml_result
78
79
 
79
80
 
80
81
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
@@ -0,0 +1,28 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from xpk.core.workload import get_jobsets_list_gcp_link
18
+
19
+
20
+ def test_get_jobsets_list_gcp_link():
21
+ result = get_jobsets_list_gcp_link(
22
+ project='test-project',
23
+ )
24
+
25
+ assert (
26
+ result
27
+ == 'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project=test-project'
28
+ )
xpk/main.py CHANGED
@@ -37,6 +37,7 @@ import sys
37
37
  from .parser.core import set_parser
38
38
  from .utils.console import xpk_print
39
39
  from .utils.validation import validate_dependencies
40
+ from .utils.execution_context import set_dry_run
40
41
  ################### Compatibility Check ###################
41
42
  # Check that the user runs the below version or greater.
42
43
 
@@ -56,18 +57,19 @@ if (
56
57
  f' User currently is running {user_major_version}.{user_minor_version}'
57
58
  )
58
59
 
59
- # Create top level parser for xpk command.
60
- parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
61
- set_parser(parser=parser)
62
-
63
- xpk_print('Starting xpk', flush=True)
64
- validate_dependencies()
65
- main_args = parser.parse_args()
66
- main_args.enable_ray_cluster = False
67
- main_args.func(main_args)
68
-
69
60
 
70
61
  def main() -> None:
62
+ # Create top level parser for xpk command.
63
+ parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
64
+ set_parser(parser=parser)
65
+
66
+ xpk_print('Starting xpk', flush=True)
67
+ main_args = parser.parse_args()
68
+ main_args.enable_ray_cluster = False
69
+ set_dry_run('dry_run' in main_args and main_args.dry_run)
70
+ if not main_args.dry_run:
71
+ validate_dependencies()
72
+ main_args.func(main_args)
71
73
  xpk_print('XPK Done.', flush=True)
72
74
 
73
75