xpk 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. xpk/commands/cluster.py +270 -8
  2. xpk/commands/cluster_gcluster.py +2 -1
  3. xpk/commands/common.py +3 -3
  4. xpk/commands/info.py +12 -12
  5. xpk/commands/job.py +12 -10
  6. xpk/commands/kjob_common.py +2 -1
  7. xpk/commands/storage.py +1 -1
  8. xpk/commands/workload.py +12 -6
  9. xpk/core/blueprint/blueprint_generator.py +7 -7
  10. xpk/core/blueprint/blueprint_test.py +218 -0
  11. xpk/core/capacity.py +5 -3
  12. xpk/core/cluster.py +9 -7
  13. xpk/core/cluster_private.py +5 -1
  14. xpk/core/commands.py +3 -3
  15. xpk/core/config.py +3 -4
  16. xpk/core/config_test.py +71 -0
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +1 -1
  19. xpk/core/filestore.py +7 -2
  20. xpk/core/gcloud_context.py +2 -2
  21. xpk/core/jobset.py +1 -1
  22. xpk/core/kjob.py +2 -1
  23. xpk/core/kueue.py +12 -4
  24. xpk/core/nap.py +20 -6
  25. xpk/core/nodepool.py +52 -19
  26. xpk/core/nodepool_test.py +82 -0
  27. xpk/core/resources.py +1 -7
  28. xpk/core/scheduling.py +1 -1
  29. xpk/core/storage.py +14 -14
  30. xpk/core/system_characteristics.py +267 -1081
  31. xpk/core/workload.py +11 -0
  32. xpk/core/workload_decorators/rdma_decorator.py +3 -2
  33. xpk/core/workload_decorators/storage_decorator.py +2 -1
  34. xpk/core/workload_decorators/tcpx_decorator.py +4 -2
  35. xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
  37. xpk/core/workload_test.py +28 -0
  38. xpk/main.py +9 -10
  39. xpk/parser/cluster.py +67 -49
  40. xpk/parser/common.py +45 -36
  41. xpk/parser/storage.py +12 -13
  42. xpk/parser/workload.py +57 -39
  43. xpk/utils/console.py +2 -1
  44. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
  45. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/RECORD +49 -44
  46. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
  47. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
  48. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
  49. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
xpk/core/workload.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ import re
17
18
  from ..utils.console import xpk_exit, xpk_print
18
19
  from .commands import run_command_for_value
19
20
  from .gcloud_context import zone_to_region
@@ -240,3 +241,13 @@ def wait_for_job_completion(args) -> int:
240
241
  xpk_print('Your workload did not complete successfully')
241
242
  return 125
242
243
  return 0
244
+
245
+
246
+ GCP_NAME_FILTER_VALUE_REGEX = re.compile(r'[a-z0-9\-]+')
247
+ """Defines correct name prefix value (contains only letters, numbers and dashes) that can be used in GCP filter chips."""
248
+
249
+
250
+ def get_jobsets_list_gcp_link(project: str) -> str:
251
+ """Returns a link to Cloud Console JobSets list"""
252
+
253
+ return f'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project={project}'
xpk/core/workload_decorators/rdma_decorator.py CHANGED
@@ -18,7 +18,7 @@ import yaml
18
18
  from ...utils.yaml import literal_string
19
19
 
20
20
 
21
- def decorate_kjob_template(job_manifest) -> str:
21
+ def decorate_kjob_template(job_manifest: dict) -> dict:
22
22
  spec = (
23
23
  job_manifest.setdefault('spec', {})
24
24
  .setdefault('template', {})
@@ -64,7 +64,8 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
64
64
  add_tolerations(job_manifest)
65
65
  update_gpu_containers(job_manifest)
66
66
 
67
- return yaml.dump(manifest, sort_keys=False)
67
+ yaml_str: str = yaml.dump(manifest, sort_keys=False)
68
+ return yaml_str
68
69
 
69
70
 
70
71
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
xpk/core/workload_decorators/storage_decorator.py CHANGED
@@ -36,7 +36,8 @@ def decorate_jobset(jobset_manifest_str, storages) -> str:
36
36
  job_manifest = job['template']
37
37
  add_annotations(job_manifest, storages)
38
38
  add_volumes(job_manifest, storage_volumes)
39
- return yaml.dump(manifest, sort_keys=False)
39
+ yaml_result: str = yaml.dump(manifest, sort_keys=False)
40
+ return yaml_result
40
41
 
41
42
 
42
43
  def add_annotations(job_manifest, storages):
xpk/core/workload_decorators/tcpx_decorator.py CHANGED
@@ -55,7 +55,8 @@ def decorate_jobset(jobset_manifest_str: str) -> str:
55
55
  for job in manifest['spec']['replicatedJobs']:
56
56
  job_manifest = job['template']
57
57
  job_manifest = decorate_job(job_manifest)
58
- return yaml.dump(manifest, sort_keys=False)
58
+ yaml_str: str = yaml.dump(manifest, sort_keys=False)
59
+ return yaml_str
59
60
 
60
61
 
61
62
  def get_interfaces_annotation() -> dict:
@@ -131,6 +132,7 @@ def add_volumes(job_manifest: dict):
131
132
  })
132
133
  volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
133
134
  volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
135
+ volumes.append({'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}})
134
136
  volumes.append(
135
137
  {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
136
138
  )
@@ -168,7 +170,7 @@ def add_tcpx_daemon_container(job_manifest):
168
170
  spec['initContainers'].append(tcpxo_daemon_container)
169
171
 
170
172
 
171
- def update_gpu_containers(job_manifest):
173
+ def update_gpu_containers(job_manifest) -> None:
172
174
  for container in job_manifest['spec']['template']['spec']['containers']:
173
175
  if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
174
176
  env: list = container.setdefault('env', [])
xpk/core/workload_decorators/tcpx_decorator_test.py ADDED
@@ -0,0 +1,267 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import copy
18
+
19
+ import yaml
20
+
21
+ from xpk.core.workload_decorators import tcpx_decorator
22
+ from xpk.utils.yaml import literal_string
23
+
24
+ # Minimal JobSet manifest for testing
25
+ BASE_JOBSET_MANIFEST_STR = """
26
+ apiVersion: jobset.x-k8s.io/v1alpha2
27
+ kind: JobSet
28
+ metadata:
29
+ name: test-jobset
30
+ spec:
31
+ replicatedJobs:
32
+ - name: slice-job
33
+ template:
34
+ spec:
35
+ template:
36
+ metadata:
37
+ annotations:
38
+ existing-annotation: "true"
39
+ spec:
40
+ containers:
41
+ - name: main-gpu-container
42
+ image: my-gpu-image
43
+ resources:
44
+ limits:
45
+ nvidia.com/gpu: 8
46
+ - name: sidecar-container
47
+ image: my-sidecar-image
48
+ """
49
+
50
+ # Minimal kjob template for testing
51
+ BASE_KJOB_TEMPLATE = {
52
+ "spec": {
53
+ "template": {
54
+ "spec": {
55
+ "containers": [
56
+ {
57
+ "name": "main-gpu-container",
58
+ "image": "my-gpu-image",
59
+ "resources": {"limits": {"nvidia.com/gpu": 8}},
60
+ },
61
+ {"name": "sidecar-container", "image": "my-sidecar-image"},
62
+ ]
63
+ }
64
+ }
65
+ }
66
+ }
67
+
68
+ # Minimal job manifest for testing
69
+ BASE_JOB_MANIFEST = {
70
+ "spec": {
71
+ "template": {
72
+ "metadata": {"annotations": {"existing-annotation": "true"}},
73
+ "spec": {
74
+ "containers": [
75
+ {
76
+ "name": "main-gpu-container",
77
+ "image": "my-gpu-image",
78
+ "resources": {"limits": {"nvidia.com/gpu": 8}},
79
+ },
80
+ {"name": "sidecar-container", "image": "my-sidecar-image"},
81
+ ]
82
+ },
83
+ }
84
+ }
85
+ }
86
+
87
+
88
+ def test_get_interfaces_annotation():
89
+ """Tests get_interfaces_annotation."""
90
+ annotation = tcpx_decorator.get_interfaces_annotation()
91
+ assert "networking.gke.io/interfaces" in annotation
92
+ assert isinstance(annotation["networking.gke.io/interfaces"], literal_string)
93
+ expected_value = (
94
+ "[\n"
95
+ ' {"interfaceName":"eth0","network":"default"},\n'
96
+ ' {"interfaceName":"eth1","network":"vpc1"},\n'
97
+ ' {"interfaceName":"eth2","network":"vpc2"},\n'
98
+ ' {"interfaceName":"eth3","network":"vpc3"},\n'
99
+ ' {"interfaceName":"eth4","network":"vpc4"}\n'
100
+ "]"
101
+ )
102
+ assert str(annotation["networking.gke.io/interfaces"]) == expected_value
103
+
104
+
105
+ def test_get_tcpx_deamon_annotation():
106
+ """Tests get_tcpx_deamon_annotation."""
107
+ annotation = tcpx_decorator.get_tcpx_deamon_annotation()
108
+ assert "devices.gke.io/container.tcpx-daemon" in annotation
109
+ assert isinstance(
110
+ annotation["devices.gke.io/container.tcpx-daemon"], literal_string
111
+ )
112
+ expected_value = (
113
+ "- path: /dev/nvidia0\n"
114
+ "- path: /dev/nvidia1\n"
115
+ "- path: /dev/nvidia2\n"
116
+ "- path: /dev/nvidia3\n"
117
+ "- path: /dev/nvidia4\n"
118
+ "- path: /dev/nvidia5\n"
119
+ "- path: /dev/nvidia6\n"
120
+ "- path: /dev/nvidia7\n"
121
+ "- path: /dev/nvidiactl\n"
122
+ "- path: /dev/nvidia-uvm\n"
123
+ )
124
+ assert (
125
+ str(annotation["devices.gke.io/container.tcpx-daemon"]) == expected_value
126
+ )
127
+
128
+
129
+ def test_decorate_jobset():
130
+ """Tests decorate_jobset."""
131
+ decorated_str = tcpx_decorator.decorate_jobset(BASE_JOBSET_MANIFEST_STR)
132
+ manifest = yaml.safe_load(decorated_str)
133
+
134
+ pod_template_spec = manifest["spec"]["replicatedJobs"][0]["template"]["spec"][
135
+ "template"
136
+ ]["spec"]
137
+ pod_template_metadata = manifest["spec"]["replicatedJobs"][0]["template"][
138
+ "spec"
139
+ ]["template"]["metadata"]
140
+
141
+ # Check annotations
142
+ annotations = pod_template_metadata["annotations"]
143
+ assert "existing-annotation" in annotations
144
+ assert "devices.gke.io/container.tcpx-daemon" in annotations
145
+ assert "networking.gke.io/default-interface" in annotations
146
+ assert "networking.gke.io/interfaces" in annotations
147
+
148
+ # Check tolerations
149
+ tolerations = pod_template_spec["tolerations"]
150
+ assert {
151
+ "key": "user-workload",
152
+ "operator": "Equal",
153
+ "value": "true",
154
+ "effect": "NoSchedule",
155
+ } in tolerations
156
+
157
+ # Check volumes
158
+ volumes = pod_template_spec["volumes"]
159
+ volume_names = {v["name"] for v in volumes}
160
+ assert "libraries" in volume_names
161
+ assert "sys" in volume_names
162
+ assert "proc-sys" in volume_names
163
+ assert "tcpx-socket" in volume_names
164
+ assert "dshm" in volume_names
165
+
166
+ # Check init container
167
+ init_containers = pod_template_spec["initContainers"]
168
+ assert len(init_containers) == 1
169
+ tcpx_daemon = init_containers[0]
170
+ assert tcpx_daemon["name"] == "tcpx-daemon"
171
+ assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
172
+
173
+ # Check GPU container update
174
+ gpu_container = pod_template_spec["containers"][0]
175
+ assert gpu_container["name"] == "main-gpu-container"
176
+
177
+ # Check env
178
+ env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
179
+ assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
180
+
181
+ # Check volume mounts
182
+ volume_mounts = {
183
+ vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
184
+ }
185
+ assert volume_mounts["tcpx-socket"] == "/tmp"
186
+ assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
187
+ assert volume_mounts["dshm"] == "/dev/shm"
188
+
189
+ # Check non-GPU container is not updated
190
+ sidecar_container = pod_template_spec["containers"][1]
191
+ assert "env" not in sidecar_container
192
+ assert "volumeMounts" not in sidecar_container
193
+
194
+
195
+ def test_decorate_job():
196
+ """Tests decorate_job."""
197
+ job_manifest = copy.deepcopy(BASE_JOB_MANIFEST)
198
+
199
+ decorated_manifest = tcpx_decorator.decorate_job(job_manifest)
200
+ pod_template_metadata = decorated_manifest["spec"]["template"]["metadata"]
201
+
202
+ # Check annotations
203
+ annotations = pod_template_metadata["annotations"]
204
+ assert "existing-annotation" in annotations
205
+ assert "devices.gke.io/container.tcpx-daemon" in annotations
206
+ assert "networking.gke.io/default-interface" in annotations
207
+ assert "networking.gke.io/interfaces" in annotations
208
+
209
+
210
+ def test_decorate_kjob_template():
211
+ """Tests decorate_kjob_template."""
212
+ kjob_template = copy.deepcopy(BASE_KJOB_TEMPLATE)
213
+
214
+ decorated_manifest = tcpx_decorator.decorate_kjob_template(kjob_template)
215
+
216
+ pod_template_spec = decorated_manifest["spec"]["template"]["spec"]
217
+
218
+ # Check annotations are NOT added
219
+ assert "annotations" not in decorated_manifest["spec"]["template"].get(
220
+ "metadata", {}
221
+ )
222
+
223
+ # Check tolerations
224
+ tolerations = pod_template_spec["tolerations"]
225
+ assert {
226
+ "key": "user-workload",
227
+ "operator": "Equal",
228
+ "value": "true",
229
+ "effect": "NoSchedule",
230
+ } in tolerations
231
+
232
+ # Check volumes
233
+ volumes = pod_template_spec["volumes"]
234
+ volume_names = {v["name"] for v in volumes}
235
+ assert "libraries" in volume_names
236
+ assert "sys" in volume_names
237
+ assert "proc-sys" in volume_names
238
+ assert "tcpx-socket" in volume_names
239
+ assert "dshm" in volume_names
240
+
241
+ # Check init container
242
+ init_containers = pod_template_spec["initContainers"]
243
+ assert len(init_containers) == 1
244
+ tcpx_daemon = init_containers[0]
245
+ assert tcpx_daemon["name"] == "tcpx-daemon"
246
+ assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
247
+
248
+ # Check GPU container update
249
+ gpu_container = pod_template_spec["containers"][0]
250
+ assert gpu_container["name"] == "main-gpu-container"
251
+
252
+ # Check env
253
+ env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
254
+ assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
255
+
256
+ # Check volume mounts
257
+ volume_mounts = {
258
+ vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
259
+ }
260
+ assert volume_mounts["tcpx-socket"] == "/tmp"
261
+ assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
262
+ assert volume_mounts["dshm"] == "/dev/shm"
263
+
264
+ # Check non-GPU container is not updated
265
+ sidecar_container = pod_template_spec["containers"][1]
266
+ assert "env" not in sidecar_container
267
+ assert "volumeMounts" not in sidecar_container
xpk/core/workload_decorators/tcpxo_decorator.py CHANGED
@@ -74,7 +74,8 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
74
74
  for job in manifest['spec']['replicatedJobs']:
75
75
  job_manifest = job['template']
76
76
  job_manifest = decorate_job(job_manifest, sub_networks)
77
- return yaml.dump(manifest, sort_keys=False)
77
+ yaml_result: str = yaml.dump(manifest, sort_keys=False)
78
+ return yaml_result
78
79
 
79
80
 
80
81
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
xpk/core/workload_test.py ADDED
@@ -0,0 +1,28 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from xpk.core.workload import get_jobsets_list_gcp_link
18
+
19
+
20
+ def test_get_jobsets_list_gcp_link():
21
+ result = get_jobsets_list_gcp_link(
22
+ project='test-project',
23
+ )
24
+
25
+ assert (
26
+ result
27
+ == 'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project=test-project'
28
+ )
xpk/main.py CHANGED
@@ -56,18 +56,17 @@ if (
56
56
  f' User currently is running {user_major_version}.{user_minor_version}'
57
57
  )
58
58
 
59
- # Create top level parser for xpk command.
60
- parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
61
- set_parser(parser=parser)
62
-
63
- xpk_print('Starting xpk', flush=True)
64
- validate_dependencies()
65
- main_args = parser.parse_args()
66
- main_args.enable_ray_cluster = False
67
- main_args.func(main_args)
68
-
69
59
 
70
60
  def main() -> None:
61
+ # Create top level parser for xpk command.
62
+ parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
63
+ set_parser(parser=parser)
64
+
65
+ xpk_print('Starting xpk', flush=True)
66
+ validate_dependencies()
67
+ main_args = parser.parse_args()
68
+ main_args.enable_ray_cluster = False
69
+ main_args.func(main_args)
71
70
  xpk_print('XPK Done.', flush=True)
72
71
 
73
72