xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. xpk/commands/batch.py +19 -13
  2. xpk/commands/cluster.py +240 -71
  3. xpk/commands/cluster_gcluster.py +22 -5
  4. xpk/commands/common.py +33 -1
  5. xpk/commands/info.py +2 -4
  6. xpk/commands/job.py +7 -8
  7. xpk/commands/kjob_common.py +30 -18
  8. xpk/commands/run.py +17 -12
  9. xpk/commands/shell.py +3 -4
  10. xpk/commands/storage.py +75 -19
  11. xpk/commands/workload.py +161 -324
  12. xpk/core/blueprint/blueprint_definitions.py +2 -0
  13. xpk/core/blueprint/blueprint_generator.py +335 -45
  14. xpk/core/capacity.py +1 -0
  15. xpk/core/cluster.py +193 -12
  16. xpk/core/config.py +3 -1
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +9 -21
  19. xpk/core/filestore.py +5 -1
  20. xpk/core/gcsfuse.py +27 -6
  21. xpk/core/kjob.py +66 -20
  22. xpk/core/kueue.py +30 -0
  23. xpk/core/mtc.py +195 -0
  24. xpk/core/nap.py +4 -0
  25. xpk/core/network.py +34 -22
  26. xpk/core/nodepool.py +28 -26
  27. xpk/core/pathways.py +165 -210
  28. xpk/core/resources.py +21 -0
  29. xpk/core/scheduling.py +36 -0
  30. xpk/core/storage.py +66 -12
  31. xpk/core/system_characteristics.py +9 -0
  32. xpk/core/workload.py +28 -83
  33. xpk/core/workload_decorators/rdma_decorator.py +11 -15
  34. xpk/core/workload_decorators/storage_decorator.py +8 -3
  35. xpk/core/workload_decorators/tcpx_decorator.py +179 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
  37. xpk/parser/cluster.py +574 -381
  38. xpk/parser/storage.py +25 -5
  39. xpk/parser/workload.py +59 -31
  40. xpk/utils/kubectl.py +4 -1
  41. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
  42. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
  43. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
  44. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
  45. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
  46. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/core/workload.py CHANGED
@@ -14,12 +14,19 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ import yaml
18
+
19
+ from ..utils import templates
17
20
  from ..utils.console import xpk_exit, xpk_print
18
21
  from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE
19
22
  from .commands import run_command_for_value
20
23
  from .gcloud_context import zone_to_region
24
+ from .storage import Storage, get_storage_volume_mounts_for_gpu
21
25
  from .system_characteristics import SystemCharacteristics
22
26
 
27
+ RXDM_CONTAINER_A3HIGH_PATH = '/../templates/rxdm_container_a3high.yaml'
28
+ RXDM_CONTAINER_A3MEGA_PATH = '/../templates/rxdm_container_a3mega.yaml'
29
+
23
30
 
24
31
  def workload_list_awk_command(filter_key) -> str:
25
32
  """Function returns the awk command needed from the filter specified.
@@ -124,7 +131,7 @@ def get_workload_list(args) -> tuple[int, str]:
124
131
  )
125
132
  workload_list_filter_job_cmd = determine_workload_list_filter_by_job(args)
126
133
  command = (
127
- f'kubectl get workloads -o=custom-columns="{s}" '
134
+ f'kubectl get workloads --ignore-not-found -o=custom-columns="{s}" '
128
135
  f'{workload_list_filter_status_cmd} {workload_list_filter_job_cmd}'
129
136
  )
130
137
 
@@ -244,98 +251,36 @@ def wait_for_job_completion(args) -> int:
244
251
  return 0
245
252
 
246
253
 
247
- def get_gpu_volume(system: SystemCharacteristics) -> str:
248
- """Get gpu volume based on user provided arguments.
254
+ def add_gpu_rxdm_container(
255
+ jobset_manifest_str: str,
256
+ system: SystemCharacteristics,
257
+ all_storages: list[Storage],
258
+ ) -> str:
259
+ """Add gpu rxdm container to jobset manifest based on user provided arguments.
249
260
 
250
261
  Args:
262
+ jobset_manifest_str: the JobSet manifest as a YAML string.
251
263
  system: system characteristics.
264
+ all_storages: list of all storages.
252
265
 
253
266
  Returns:
254
- str: yaml containing gpu volume
267
+ str: the modified JobSet manifest as a YAML string.
255
268
  """
256
- gpu_volume = ''
257
269
  if system.device_type == H100_DEVICE_TYPE:
258
- gpu_volume = """- name: nvidia-install-dir-host
259
- hostPath:
260
- path: /home/kubernetes/bin/nvidia/lib64
261
- - name: tcpd-socket
262
- hostPath:
263
- path: /run/tcpx
264
- - name: shared-memory
265
- emptyDir:
266
- medium: "Memory"
267
- sizeLimit: 200Gi
268
- - name: workload-terminated-volume
269
- emptyDir:
270
- - name: tcpx-nccl-plugin-volume
271
- emptyDir:"""
270
+ gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3HIGH_PATH)
272
271
  elif system.device_type == H100_MEGA_DEVICE_TYPE:
273
- gpu_volume = """- name: nvidia-install-dir-host
274
- hostPath:
275
- path: /home/kubernetes/bin/nvidia/lib64
276
- - name: shared-memory
277
- emptyDir:
278
- medium: "Memory"
279
- sizeLimit: 1Gi
280
- - name: workload-terminated-volume
281
- emptyDir:"""
282
- return gpu_volume
283
-
284
-
285
- def get_gpu_rxdm_image(system: SystemCharacteristics) -> str:
286
- """Get config of rxdm based on user provided arguments.
287
-
288
- Args:
289
- system: system characteristics.
290
-
291
- Returns:
292
- str: yaml containing the rxdm name and image
293
- """
294
- gpu_rxdm_image = ''
295
- if system.device_type == H100_DEVICE_TYPE:
296
- gpu_rxdm_image = """- name: tcpd-daemon
297
- image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9"""
298
- elif system.device_type == H100_MEGA_DEVICE_TYPE:
299
- gpu_rxdm_image = """- name: fastrak-daemon
300
- image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.9"""
301
- return gpu_rxdm_image
272
+ gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3MEGA_PATH)
273
+ else:
274
+ return jobset_manifest_str
302
275
 
276
+ storage_volume_mounts = get_storage_volume_mounts_for_gpu(all_storages)
277
+ gpu_rxdm_container['volumeMounts'].extend(storage_volume_mounts)
303
278
 
304
- def get_gpu_rxdm_cmd(system: SystemCharacteristics) -> str:
305
- """Get rxdm command based on user provided arguments.
279
+ manifest = yaml.safe_load(jobset_manifest_str)
306
280
 
307
- Args:
308
- system: system characteristics.
309
-
310
- Returns:
311
- str: command of running rxdm container
312
- """
313
- gpu_rxdm_cmd = ''
314
- if system.device_type == H100_DEVICE_TYPE:
315
- gpu_rxdm_cmd = (
316
- '/tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm'
317
- ' --gpu_shmem_type fd --setup_param "--verbose 128 2 0"'
281
+ for job in manifest['spec']['replicatedJobs']:
282
+ job['template']['spec']['template']['spec']['containers'].append(
283
+ gpu_rxdm_container
318
284
  )
319
- elif system.device_type == H100_MEGA_DEVICE_TYPE:
320
- gpu_rxdm_cmd = (
321
- 'set -ex; chmod 755 /fts/entrypoint_rxdm_container.sh;'
322
- ' /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid='
323
- ' --alsologtostderr'
324
- )
325
- return gpu_rxdm_cmd
326
-
327
285
 
328
- def get_gpu_tcp_volume(system: SystemCharacteristics) -> str:
329
- """Get gpu tcp volume based on user provided arguments.
330
-
331
- Args:
332
- system: system characteristics.
333
-
334
- Returns:
335
- str: yaml containing gpu tcp volume
336
- """
337
- gpu_tcp_volume = ''
338
- if system.device_type == H100_DEVICE_TYPE:
339
- gpu_tcp_volume = """- name: tcpd-socket
340
- mountPath: /tmp"""
341
- return gpu_tcp_volume
286
+ return yaml.dump(manifest, sort_keys=False)
@@ -33,7 +33,7 @@ def decorate_kjob_template(job_manifest) -> str:
33
33
  return job_manifest
34
34
 
35
35
 
36
- def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
36
+ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
37
37
  """
38
38
  Decorates a JobSet manifest with the necessary components for rdma-daemon.
39
39
 
@@ -68,24 +68,20 @@ def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
68
68
 
69
69
 
70
70
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
71
- interfaces = [
72
- '[',
73
- ' {"interfaceName":"eth0","network":"default"},',
74
- *[
75
- f' {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<8 else ""}'
76
- for i in range(9)
77
- ],
78
- ']',
79
- ]
80
- return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
81
-
82
-
83
- def add_annotations(job_manifest, sub_networks):
71
+ entries = ',\n'.join([
72
+ f' {{"interfaceName":"eth{i}","network":"{network}"}}'
73
+ for i, network in enumerate(sub_networks)
74
+ ])
75
+ interfaces = f'[\n{entries}\n]'
76
+ return 'networking.gke.io/interfaces', literal_string(interfaces)
77
+
78
+
79
+ def add_annotations(job_manifest: dict, sub_networks: list[str]):
84
80
  """Adds or updates annotations in the Pod template."""
85
81
  annotations = job_manifest['spec']['template']['metadata']['annotations']
86
82
  interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
87
83
  annotations.update({
88
- 'networking.gke.io/default-interface': "'eth0'",
84
+ 'networking.gke.io/default-interface': 'eth0',
89
85
  interfaces_key: interfaces_value,
90
86
  })
91
87
 
@@ -16,7 +16,7 @@ limitations under the License.
16
16
 
17
17
  import yaml
18
18
 
19
- from ...core.storage import GCS_FUSE_TYPE, get_storage_volumes_yaml_dict, GCS_FUSE_ANNOTATION
19
+ from ...core.storage import GCS_FUSE_TYPE, PARALLELSTORE_TYPE, get_storage_volumes_yaml_dict, GCS_FUSE_ANNOTATIONS, PARALLELSTORE_ANNOTATIONS
20
20
 
21
21
 
22
22
  def decorate_jobset(jobset_manifest_str, storages) -> str:
@@ -42,9 +42,14 @@ def decorate_jobset(jobset_manifest_str, storages) -> str:
42
42
  def add_annotations(job_manifest, storages):
43
43
  """Adds or updates storage annotations in the Pod template."""
44
44
  annotations = job_manifest['spec']['template']['metadata']['annotations']
45
- gcs_present = [storage.type == GCS_FUSE_TYPE for storage in storages]
45
+ gcs_present = any(storage.type == GCS_FUSE_TYPE for storage in storages)
46
46
  if gcs_present:
47
- annotations.update(GCS_FUSE_ANNOTATION)
47
+ annotations.update(GCS_FUSE_ANNOTATIONS)
48
+ parallelstore_present = any(
49
+ storage.type == PARALLELSTORE_TYPE for storage in storages
50
+ )
51
+ if parallelstore_present:
52
+ annotations.update(PARALLELSTORE_ANNOTATIONS)
48
53
 
49
54
 
50
55
  def add_volumes(job_manifest, storage_volumes):
@@ -0,0 +1,179 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import yaml
18
+
19
+ from ...utils.yaml import literal_string
20
+
21
# Component version: tag of the GPUDirect-TCPX rx-data-path-manager
# (tcpgpudmarxd) container image pulled in add_tcpxo_daemon_container.
tcpx = 'v2.0.11'
23
+
24
+
25
def decorate_kjob_template(job_manifest: dict) -> dict:
  """Apply every tcpx decoration step to a kjob Job template, in place.

  Runs volume, toleration, daemon-container and GPU-container updates
  (no annotations — unlike decorate_job) and returns the same dict.

  Args:
    job_manifest: the Job manifest as a parsed YAML dict.

  Returns:
    The decorated Job manifest (same object, mutated in place).
  """
  decoration_steps = (
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  )
  for step in decoration_steps:
    step(job_manifest)
  return job_manifest
31
+
32
+
33
def decorate_job(job_manifest: dict) -> dict:
  """Apply every tcpx decoration step to a Job manifest, in place.

  Same steps as decorate_kjob_template plus the Pod-template annotations
  (multi-NIC interfaces and tcpx device list).

  Args:
    job_manifest: the Job manifest as a parsed YAML dict.

  Returns:
    The decorated Job manifest (same object, mutated in place).
  """
  decoration_steps = (
      add_annotations,
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  )
  for step in decoration_steps:
    step(job_manifest)
  return job_manifest
40
+
41
+
42
def decorate_jobset(jobset_manifest_str: str) -> str:
  """
  Decorates a JobSet manifest with the necessary components for tcpxo-daemon.

  Args:
    jobset_manifest_str: The JobSet manifest as a YAML string.

  Returns:
    The modified JobSet manifest as a YAML string.
  """
  manifest = yaml.safe_load(jobset_manifest_str)

  # decorate_job mutates each Job template in place.
  for replicated_job in manifest['spec']['replicatedJobs']:
    decorate_job(replicated_job['template'])

  return yaml.dump(manifest, sort_keys=False)
59
+
60
+
61
def get_interfaces_annotation() -> dict:
  """Build the GKE multi-networking interfaces annotation.

  Maps eth0 to the default network and eth1..eth4 to vpc1..vpc4, as a
  literal-block YAML string value.

  Returns:
    A single-entry dict suitable for merging into Pod annotations.
  """
  lines = ['[']
  lines.append(' {"interfaceName":"eth0","network":"default"},')
  for i in range(1, 4):
    lines.append(f' {{"interfaceName":"eth{i}","network":"vpc{i}"}},')
  # Last entry carries no trailing comma.
  lines.append(' {"interfaceName":"eth4","network":"vpc4"}')
  lines.append(']')
  return {'networking.gke.io/interfaces': literal_string('\n'.join(lines))}
72
+
73
+
74
def get_tcpx_deamon_annotation() -> dict:
  """Build the device-injection annotation for the tcpx-daemon container.

  Lists the eight GPU device nodes plus the NVIDIA control and UVM
  devices, one `- path:` entry per line, as a literal-block YAML value.

  Returns:
    A single-entry dict suitable for merging into Pod annotations.
  """
  device_paths = [f'/dev/nvidia{i}' for i in range(8)]
  device_paths += ['/dev/nvidiactl', '/dev/nvidia-uvm']
  manifest_lines = ''.join(f'- path: {path}\n' for path in device_paths)
  return {
      'devices.gke.io/container.tcpx-daemon': literal_string(manifest_lines)
  }
89
+
90
+
91
def add_annotations(job_manifest: dict):
  """Adds or updates annotations in the Pod template.

  Creates the spec/template/metadata/annotations path if any level is
  missing, then merges the tcpx device list, the default-interface
  marker, and the multi-NIC interfaces annotation.
  """
  metadata = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('metadata', {})
  )
  annotations: dict = metadata.setdefault('annotations', {})
  annotations.update(get_tcpx_deamon_annotation())
  annotations['networking.gke.io/default-interface'] = 'eth0'
  annotations.update(get_interfaces_annotation())
102
+
103
+
104
def add_tolerations(job_manifest: dict):
  """Adds tolerations to the Pod spec.

  Appends the `user-workload=true:NoSchedule` toleration, creating the
  spec/template/spec/tolerations path if any level is missing.
  """
  user_workload_toleration = {
      'key': 'user-workload',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  }
  pod_spec = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('spec', {})
  )
  pod_spec.setdefault('tolerations', []).append(user_workload_toleration)
118
+
119
+
120
def add_volumes(job_manifest: dict):
  """Adds the host volumes needed by the tcpx daemon and GPU containers.

  Creates the spec/template/spec/volumes path if any level is missing and
  appends each required volume, skipping names already present so the
  decoration stays idempotent and never produces duplicate volume names.

  Bug fix: the `tcpx-socket` volume was missing, even though it is mounted
  by both the tcpx-daemon container (at /run/tcpx) and every GPU container
  (at /tmp) — a Pod spec with a volumeMount referencing an undefined volume
  is rejected by the API server. The hostPath matches the socket path the
  daemon is started with (`--uds_path /run/tcpx`).
  """
  volumes: list = (
      job_manifest.setdefault('spec', {})
      .setdefault('template', {})
      .setdefault('spec', {})
      .setdefault('volumes', [])
  )
  existing_names = {volume.get('name') for volume in volumes}
  required_volumes = [
      {
          'name': 'libraries',
          'hostPath': {'path': '/home/kubernetes/bin/nvidia/lib64'},
      },
      {'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}},
      {'name': 'sys', 'hostPath': {'path': '/sys'}},
      {'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}},
  ]
  for volume in required_volumes:
    if volume['name'] not in existing_names:
      volumes.append(volume)
134
+
135
+
136
def add_tcpxo_daemon_container(job_manifest):
  """Adds the tcpxo-daemon container to the Pod spec.

  Appends a `tcpx-daemon` entry to initContainers; with
  restartPolicy Always it runs as a sidecar for the Pod's lifetime.
  """
  daemon_command = [
      '/tcpgpudmarxd/build/app/tcpgpudmarxd',
      '--gpu_nic_preset',
      'a3vm',
      '--gpu_shmem_type',
      'fd',
      '--uds_path',
      '/run/tcpx',
      '--setup_param',
      '"--verbose 128 2 0 "',
  ]
  daemon_mounts = [
      {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'},
      {'name': 'tcpx-socket', 'mountPath': '/run/tcpx'},
      {'name': 'sys', 'mountPath': '/hostsysfs'},
      {'name': 'proc-sys', 'mountPath': '/hostprocsysfs'},
  ]
  daemon_container = {
      'name': 'tcpx-daemon',
      'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{tcpx}',
      'imagePullPolicy': 'Always',
      'restartPolicy': 'Always',
      'command': daemon_command,
      # NET_ADMIN is required for the daemon to manage NIC configuration.
      'securityContext': {'capabilities': {'add': ['NET_ADMIN']}},
      'volumeMounts': daemon_mounts,
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
  }
  pod_spec = job_manifest['spec']['template']['spec']
  pod_spec.setdefault('initContainers', []).append(daemon_container)
166
+
167
+
168
def update_gpu_containers(job_manifest):
  """Wire every GPU-requesting container up to the tcpx daemon.

  For each container whose resource limits request `nvidia.com/gpu`,
  appends the NVIDIA library path to its env and mounts the tcpx socket
  and host NVIDIA libraries. Containers without a GPU limit are untouched.
  """
  for container in job_manifest['spec']['template']['spec']['containers']:
    limits = container.get('resources', {}).get('limits', {})
    if 'nvidia.com/gpu' not in limits:
      continue
    container.setdefault('env', []).append(
        {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}
    )
    mounts: list = container.setdefault('volumeMounts', [])
    mounts.append({'name': 'tcpx-socket', 'mountPath': '/tmp'})
    mounts.append({'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'})
@@ -57,7 +57,7 @@ def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
57
57
  return job_manifest
58
58
 
59
59
 
60
- def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
60
+ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
61
61
  """
62
62
  Decorates a JobSet manifest with the necessary components for tcpxo-daemon.
63
63
 
@@ -77,16 +77,12 @@ def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
77
77
 
78
78
 
79
79
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
80
- interfaces = [
81
- '[',
82
- ' {"interfaceName":"eth0","network":"default"},',
83
- *[
84
- f' {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<7 else ""}'
85
- for i in range(8)
86
- ],
87
- ']',
88
- ]
89
- return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
80
+ entries = ',\n'.join([
81
+ f' {{"interfaceName":"eth{i}","network":"{network}"}}'
82
+ for i, network in enumerate(sub_networks)
83
+ ])
84
+ interfaces = f'[\n{entries}\n]'
85
+ return 'networking.gke.io/interfaces', literal_string(interfaces)
90
86
 
91
87
 
92
88
  def get_tcpxo_deamon_entry() -> tuple[str, str]:
@@ -105,9 +101,13 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
105
101
  )
106
102
 
107
103
 
108
- def add_annotations(job_manifest, sub_networks):
104
+ def add_annotations(job_manifest: dict, sub_networks: list[str]):
109
105
  """Adds or updates annotations in the Pod template."""
110
- annotations = job_manifest['spec']['template']['metadata']['annotations']
106
+ metadata = job_manifest['spec']['template']['metadata']
107
+ annotations = metadata.get('annotations')
108
+ if annotations is None:
109
+ annotations = {}
110
+ metadata['annotations'] = annotations
111
111
  tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
112
112
  interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
113
113
  annotations.update({
@@ -149,6 +149,7 @@ def add_tcpxo_daemon_container(job_manifest):
149
149
  'name': 'tcpxo-daemon',
150
150
  'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}',
151
151
  'imagePullPolicy': 'Always',
152
+ 'restartPolicy': 'Always',
152
153
  'command': ['/bin/sh', '-c'],
153
154
  'args': [
154
155
  'set -ex\nchmod 755'
@@ -165,9 +166,9 @@ def add_tcpxo_daemon_container(job_manifest):
165
166
  ],
166
167
  'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
167
168
  }
168
- job_manifest['spec']['template']['spec']['containers'].append(
169
- tcpxo_daemon_container
170
- )
169
+ spec = job_manifest['spec']['template']['spec']
170
+ spec.setdefault('initContainers', [])
171
+ spec['initContainers'].append(tcpxo_daemon_container)
171
172
 
172
173
 
173
174
  def update_gpu_containers(job_manifest):