xpk 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. xpk/commands/batch.py +5 -6
  2. xpk/commands/cluster.py +246 -73
  3. xpk/commands/cluster_gcluster.py +27 -0
  4. xpk/commands/common.py +40 -1
  5. xpk/commands/kjob_common.py +13 -1
  6. xpk/commands/run.py +4 -5
  7. xpk/commands/shell.py +2 -2
  8. xpk/commands/storage.py +24 -6
  9. xpk/commands/workload.py +66 -27
  10. xpk/core/blueprint/blueprint_generator.py +115 -47
  11. xpk/core/capacity.py +66 -6
  12. xpk/core/cluster.py +282 -13
  13. xpk/core/config.py +1 -65
  14. xpk/core/docker_manager.py +1 -1
  15. xpk/core/docker_resources.py +145 -72
  16. xpk/core/filestore.py +2 -6
  17. xpk/core/gcsfuse.py +22 -4
  18. xpk/core/jobset.py +143 -0
  19. xpk/core/kjob.py +21 -18
  20. xpk/core/kueue.py +194 -4
  21. xpk/core/mtc.py +195 -0
  22. xpk/core/network.py +23 -1
  23. xpk/core/nodepool.py +17 -4
  24. xpk/core/pathways.py +2 -3
  25. xpk/core/resources.py +21 -0
  26. xpk/core/storage.py +1 -95
  27. xpk/core/system_characteristics.py +1 -1
  28. xpk/core/workload.py +1 -45
  29. xpk/core/workload_decorators/rdma_decorator.py +8 -10
  30. xpk/core/workload_decorators/tcpx_decorator.py +185 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
  32. xpk/parser/cluster.py +589 -389
  33. xpk/parser/storage.py +12 -3
  34. xpk/parser/workload.py +21 -3
  35. xpk/utils/kubectl.py +4 -1
  36. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/METADATA +178 -96
  37. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/RECORD +41 -38
  38. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +1 -1
  39. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
  40. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
  41. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
xpk/core/storage.py CHANGED
@@ -46,6 +46,7 @@ STORAGE_CRD_NAME = f"{XPK_API_GROUP_NAME}.{STORAGE_CRD_PLURAL}"
  GCS_FUSE_TYPE = "gcsfuse"
  GCP_FILESTORE_TYPE = "gcpfilestore"
  PARALLELSTORE_TYPE = "parallelstore"
+ LUSTRE_TYPE = "lustre"
  GCE_PD_TYPE = "pd"
  MANIFESTS_PATH = os.path.abspath("xpkclusters/storage-manifests")
  GCS_FUSE_ANNOTATIONS = {
@@ -365,101 +366,6 @@ def get_storage_annotations(storages: list[Storage]) -> list[str]:
    return annotations
 
 
- def get_storage_volume_mounts_yaml(storages: list[Storage]) -> str:
-   """
-   Generates the YAML representation of the volumeMounts section for the given Storages.
-
-   This function creates the YAML snippet that defines how the storage volumes
-   should be mounted within a Pod's containers.
-
-   Args:
-     storages: A list of Storage objects.
-
-   Returns:
-     A string containing the YAML representation of the volumeMounts section.
-   """
-   yaml_str = ""
-   for storage in storages:
-     yaml_str += f"""- name: {storage.pv}
-   mountPath: {storage.mount_point}
-   readOnly: {storage.readonly}
- """
-   return yaml_str
-
-
- def get_storage_volumes_yaml(storages: list[Storage]) -> str:
-   """
-   Generates the YAML representation of the volumes section for the given Storages.
-
-   This function creates the YAML snippet that defines the volumes to be
-   mounted in a Pod, including the PersistentVolumeClaim associated with
-   each Storage.
-
-   Args:
-     storages: A list of Storage objects.
-
-   Returns:
-     A string containing the YAML representation of the volumes section.
-   """
-   yaml_str = ""
-   for storage in storages:
-     yaml_str += f"""- name: {storage.pv}
-   persistentVolumeClaim:
-     claimName: {storage.pvc}
-     readOnly: {storage.readonly}
- """
-   return yaml_str
-
-
- def get_storage_volume_mounts_for_gpu(
-     storages: list[Storage],
- ) -> list[dict]:
-   """
-   Generates the YAML representation of the volumeMounts section for the given Storages.
-
-   This function creates the list of storage specifications that define how the storage volumes
-   should be mounted within a Pod's containers.
-
-   Args:
-     storages: A list of Storage objects.
-
-   Returns:
-     A list containing the dictionary representation of the volumeMounts section.
-   """
-   return [
-       {
-           "name": storage.pv,
-           "mountPath": storage.mount_point,
-           "readOnly": storage.readonly,
-       }
-       for storage in storages
-   ]
-
-
- def get_storage_volumes_yaml_for_gpu(storages: list[Storage]) -> str:
-   """
-   Generates the YAML representation of the volumes section for the given Storages.
-
-   This function creates the YAML snippet that defines the volumes to be
-   mounted in a Pod, including the PersistentVolumeClaim associated with
-   each Storage.
-
-   Args:
-     storages: A list of Storage objects.
-
-   Returns:
-     A string containing the YAML representation of the volumes section.
-   """
-   yaml_str = ""
-   for storage in storages:
-     yaml_str += f"""- name: {storage.pv}
-   persistentVolumeClaim:
-     claimName: {storage.pvc}
-     readOnly: {storage.readonly}
- """
-   return yaml_str
-
-
  def get_storage_volumes_yaml_dict(storages: list[Storage]) -> list[dict]:
    vols = []
    for storage in storages:
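The removed helpers above rendered volumes/volumeMounts YAML through f-string concatenation, while the retained get_storage_volumes_yaml_dict builds plain dicts and leaves serialization to the YAML library. A minimal sketch of the dict-based style (FakeStorage is a hypothetical stand-in for xpk's Storage class):

import yaml
from dataclasses import dataclass


@dataclass
class FakeStorage:  # hypothetical stand-in for xpk's Storage
  pv: str
  pvc: str
  mount_point: str
  readonly: bool


def volumes_dict(storages: list[FakeStorage]) -> list[dict]:
  # Plain dicts compose and test easily; yaml.dump handles the quoting
  # and indentation the removed f-string templates managed by hand.
  return [
      {
          'name': s.pv,
          'persistentVolumeClaim': {
              'claimName': s.pvc,
              'readOnly': s.readonly,
          },
      }
      for s in storages
  ]


print(yaml.dump(volumes_dict([FakeStorage('pv-a', 'pvc-a', '/data', False)]),
                sort_keys=False))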
xpk/core/system_characteristics.py CHANGED
@@ -1156,7 +1156,7 @@ UserFacingNameToSystemCharacteristics = {
      2,
      'tpu-v5-lite-podslice',
      'ct5lp-hightpu-4t',
-     8,
+     4,
      AcceleratorType['TPU'],
      'v5litepod-8',
  ),
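For context on the corrected value: reading the entry's fields as xpk's SystemCharacteristics does (field names assumed here), the arithmetic now lines up, since a ct5lp-hightpu-4t VM carries 4 TPU chips:

# Assumed field meanings for the entry above: 2 is vms_per_slice and the
# corrected 4 is chips_per_vm for the ct5lp-hightpu-4t machine type.
vms_per_slice, chips_per_vm = 2, 4
assert vms_per_slice * chips_per_vm == 8  # total chips in a 'v5litepod-8' slice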
xpk/core/workload.py CHANGED
@@ -14,18 +14,9 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """
 
- import yaml
-
- from ..utils import templates
  from ..utils.console import xpk_exit, xpk_print
- from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE
  from .commands import run_command_for_value
  from .gcloud_context import zone_to_region
- from .storage import Storage, get_storage_volume_mounts_for_gpu
- from .system_characteristics import SystemCharacteristics
-
- RXDM_CONTAINER_A3HIGH_PATH = '/../templates/rxdm_container_a3high.yaml'
- RXDM_CONTAINER_A3MEGA_PATH = '/../templates/rxdm_container_a3mega.yaml'
 
 
  def workload_list_awk_command(filter_key) -> str:
@@ -131,7 +122,7 @@ def get_workload_list(args) -> tuple[int, str]:
    )
    workload_list_filter_job_cmd = determine_workload_list_filter_by_job(args)
    command = (
-       f'kubectl get workloads -o=custom-columns="{s}" '
+       f'kubectl get workloads --ignore-not-found -o=custom-columns="{s}" '
        f'{workload_list_filter_status_cmd} {workload_list_filter_job_cmd}'
    )
 
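The added --ignore-not-found flag keeps kubectl quiet when nothing matches: instead of printing "No resources found" noise (or failing outright for a missing named resource), it returns empty output with exit code 0, which suits downstream parsing. A hedged sketch of the effect; list_workloads is a hypothetical wrapper, not xpk's API:

import subprocess


def list_workloads(columns: str) -> list[str]:
  # With --ignore-not-found, an empty result yields empty stdout and
  # exit code 0, so check=True does not raise and parsing stays simple.
  cmd = (
      f'kubectl get workloads --ignore-not-found -o=custom-columns="{columns}"'
  )
  out = subprocess.run(
      cmd, shell=True, capture_output=True, text=True, check=True
  ).stdout
  return [line for line in out.splitlines() if line.strip()]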
@@ -249,38 +240,3 @@ def wait_for_job_completion(args) -> int:
      xpk_print('Your workload did not complete successfully')
      return 125
    return 0
-
-
- def add_gpu_rxdm_container(
-     jobset_manifest_str: str,
-     system: SystemCharacteristics,
-     all_storages: list[Storage],
- ) -> str:
-   """Add gpu rxdm container to jobset manifest based on user provided arguments.
-
-   Args:
-     jobset_manifest_str: the JobSet manifest as a YAML string.
-     system: system characteristics.
-     all_storages: list of all storages.
-
-   Returns:
-     str: the modified JobSet manifest as a YAML string.
-   """
-   if system.device_type == H100_DEVICE_TYPE:
-     gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3HIGH_PATH)
-   elif system.device_type == H100_MEGA_DEVICE_TYPE:
-     gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3MEGA_PATH)
-   else:
-     return jobset_manifest_str
-
-   storage_volume_mounts = get_storage_volume_mounts_for_gpu(all_storages)
-   gpu_rxdm_container['volumeMounts'].extend(storage_volume_mounts)
-
-   manifest = yaml.safe_load(jobset_manifest_str)
-
-   for job in manifest['spec']['replicatedJobs']:
-     job['template']['spec']['template']['spec']['containers'].append(
-         gpu_rxdm_container
-     )
-
-   return yaml.dump(manifest, sort_keys=False)
xpk/core/workload_decorators/rdma_decorator.py CHANGED
@@ -68,22 +68,20 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
 
 
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
-   interfaces = [
-       '[',
-       ' {"interfaceName":"eth0","network":"default"},',
-       *[
-           f' {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<8 else ""}'
-           for i in range(9)
-       ],
-       ']',
-   ]
-   return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
+   entries = ',\n'.join([
+       f' {{"interfaceName":"eth{i}","network":"{network}"}}'
+       for i, network in enumerate(sub_networks)
+   ])
+   interfaces = f'[\n{entries}\n]'
+   return 'networking.gke.io/interfaces', literal_string(interfaces)
 
 
  def add_annotations(job_manifest: dict, sub_networks: list[str]):
    """Adds or updates annotations in the Pod template."""
    annotations = job_manifest['spec']['template']['metadata']['annotations']
    interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
+   if annotations is None:
+     annotations = {}
    annotations.update({
        'networking.gke.io/default-interface': 'eth0',
        interfaces_key: interfaces_value,
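The rewritten helper derives every interface, eth0 included, from sub_networks itself instead of hardcoding a default entry plus nine numbered ones, which implies callers now pass the default network as the first list element. A standalone re-creation (without xpk's literal_string YAML wrapper) showing the produced annotation value:

def interfaces_annotation(sub_networks: list[str]) -> str:
  # Mirrors the new get_interfaces_entry body: one JSON object per
  # network, with eth<i> taken from the element's position in the list.
  entries = ',\n'.join(
      f'  {{"interfaceName":"eth{i}","network":"{network}"}}'
      for i, network in enumerate(sub_networks)
  )
  return f'[\n{entries}\n]'


# eth0 now comes from the first element, so the caller supplies 'default':
print(interfaces_annotation(['default', 'vpc-1', 'vpc-2']))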
xpk/core/workload_decorators/tcpx_decorator.py ADDED
@@ -0,0 +1,185 @@
+ """
+ Copyright 2024 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ import yaml
+
+ from ...utils.yaml import literal_string
+
+ # Component version
+ tcpx = 'v2.0.11'
+
+
+ def decorate_kjob_template(job_manifest: dict) -> dict:
+   add_volumes(job_manifest)
+   add_tolerations(job_manifest)
+   add_tcpx_daemon_container(job_manifest)
+   update_gpu_containers(job_manifest)
+   return job_manifest
+
+
+ def decorate_job(job_manifest: dict) -> dict:
+   add_annotations(job_manifest)
+   add_volumes(job_manifest)
+   add_tolerations(job_manifest)
+   add_tcpx_daemon_container(job_manifest)
+   update_gpu_containers(job_manifest)
+   return job_manifest
+
+
+ def decorate_jobset(jobset_manifest_str: str) -> str:
+   """
+   Decorates a JobSet manifest with the necessary components for tcpx-daemon.
+
+   Args:
+     jobset_manifest_str: The JobSet manifest as a YAML string.
+
+   Returns:
+     The modified JobSet manifest as a YAML string.
+   """
+
+   manifest = yaml.safe_load(jobset_manifest_str)
+
+   for job in manifest['spec']['replicatedJobs']:
+     job_manifest = job['template']
+     job_manifest = decorate_job(job_manifest)
+   return yaml.dump(manifest, sort_keys=False)
+
+
+ def get_interfaces_annotation() -> dict:
+   interfaces = [
+       '[',
+       ' {"interfaceName":"eth0","network":"default"},',
+       ' {"interfaceName":"eth1","network":"vpc1"},',
+       ' {"interfaceName":"eth2","network":"vpc2"},',
+       ' {"interfaceName":"eth3","network":"vpc3"},',
+       ' {"interfaceName":"eth4","network":"vpc4"}',
+       ']',
+   ]
+   return {'networking.gke.io/interfaces': literal_string('\n'.join(interfaces))}
+
+
+ def get_tcpx_deamon_annotation() -> dict:
+   return {
+       'devices.gke.io/container.tcpx-daemon': literal_string(
+           '- path: /dev/nvidia0\n'
+           '- path: /dev/nvidia1\n'
+           '- path: /dev/nvidia2\n'
+           '- path: /dev/nvidia3\n'
+           '- path: /dev/nvidia4\n'
+           '- path: /dev/nvidia5\n'
+           '- path: /dev/nvidia6\n'
+           '- path: /dev/nvidia7\n'
+           '- path: /dev/nvidiactl\n'
+           '- path: /dev/nvidia-uvm\n'
+       )
+   }
+
+
+ def add_annotations(job_manifest: dict):
+   """Adds or updates annotations in the Pod template."""
+   annotations: dict = (
+       job_manifest.setdefault('spec', {})
+       .setdefault('template', {})
+       .setdefault('metadata', {})
+       .setdefault('annotations', {})
+   )
+   annotations.update(get_tcpx_deamon_annotation())
+   annotations.update({'networking.gke.io/default-interface': 'eth0'})
+   annotations.update(get_interfaces_annotation())
+
+
+ def add_tolerations(job_manifest: dict):
+   """Adds tolerations to the Pod spec."""
+   tolerations: list = (
+       job_manifest.setdefault('spec', {})
+       .setdefault('template', {})
+       .setdefault('spec', {})
+       .setdefault('tolerations', [])
+   )
+   tolerations.append({
+       'key': 'user-workload',
+       'operator': 'Equal',
+       'value': 'true',
+       'effect': 'NoSchedule',
+   })
+
+
+ def add_volumes(job_manifest: dict):
+   """Adds volumes to the Pod spec."""
+   volumes: list = (
+       job_manifest.setdefault('spec', {})
+       .setdefault('template', {})
+       .setdefault('spec', {})
+       .setdefault('volumes', [])
+   )
+   volumes.append({
+       'name': 'libraries',
+       'hostPath': {'path': '/home/kubernetes/bin/nvidia/lib64'},
+   })
+   volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
+   volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
+   volumes.append(
+       {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
+   )
+
+
+ def add_tcpx_daemon_container(job_manifest):
+   """Adds the tcpx-daemon container to the Pod spec."""
+   tcpxo_daemon_container = {
+       'name': 'tcpx-daemon',
+       'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{tcpx}',
+       'imagePullPolicy': 'Always',
+       'restartPolicy': 'Always',
+       'command': [
+           '/tcpgpudmarxd/build/app/tcpgpudmarxd',
+           '--gpu_nic_preset',
+           'a3vm',
+           '--gpu_shmem_type',
+           'fd',
+           '--uds_path',
+           '/run/tcpx',
+           '--setup_param',
+           '"--verbose 128 2 0 "',
+       ],
+       'securityContext': {'capabilities': {'add': ['NET_ADMIN']}},
+       'volumeMounts': [
+           {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'},
+           {'name': 'tcpx-socket', 'mountPath': '/run/tcpx'},
+           {'name': 'sys', 'mountPath': '/hostsysfs'},
+           {'name': 'proc-sys', 'mountPath': '/hostprocsysfs'},
+       ],
+       'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
+   }
+   spec = job_manifest['spec']['template']['spec']
+   spec.setdefault('initContainers', [])
+   spec['initContainers'].append(tcpxo_daemon_container)
+
+
+ def update_gpu_containers(job_manifest):
+   for container in job_manifest['spec']['template']['spec']['containers']:
+     if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
+       env: list = container.setdefault('env', [])
+       env.append(
+           {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}
+       )
+       volumeMounts: list = container.setdefault('volumeMounts', [])
+       volumeMounts.append({'name': 'tcpx-socket', 'mountPath': '/tmp'})
+       volumeMounts.append(
+           {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'}
+       )
+       container['volumeMounts'].append(
+           {'name': 'dshm', 'mountPath': '/dev/shm'}
+       )
xpk/core/workload_decorators/tcpxo_decorator.py CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
  """
 
  import yaml
+
  from ...utils.yaml import literal_string
 
  # Component version
@@ -77,16 +78,12 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
 
 
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
-   interfaces = [
-       '[',
-       ' {"interfaceName":"eth0","network":"default"},',
-       *[
-           f' {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<7 else ""}'
-           for i in range(8)
-       ],
-       ']',
-   ]
-   return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
+   entries = ',\n'.join([
+       f' {{"interfaceName":"eth{i}","network":"{network}"}}'
+       for i, network in enumerate(sub_networks)
+   ])
+   interfaces = f'[\n{entries}\n]'
+   return 'networking.gke.io/interfaces', literal_string(interfaces)
 
 
  def get_tcpxo_deamon_entry() -> tuple[str, str]:
@@ -107,7 +104,11 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
 
  def add_annotations(job_manifest: dict, sub_networks: list[str]):
    """Adds or updates annotations in the Pod template."""
-   annotations = job_manifest['spec']['template']['metadata']['annotations']
+   metadata = job_manifest['spec']['template']['metadata']
+   annotations = metadata.get('annotations')
+   if annotations is None:
+     annotations = {}
+   metadata['annotations'] = annotations
    tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
    interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
    annotations.update({
@@ -141,6 +142,9 @@ def add_volumes(job_manifest):
        'name': 'aperture-devices',
        'hostPath': {'path': '/dev/aperture_devices'},
    })
+   volumes.append(
+       {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
+   )
 
 
  def add_tcpxo_daemon_container(job_manifest):
@@ -149,6 +153,7 @@ def add_tcpxo_daemon_container(job_manifest):
      'name': 'tcpxo-daemon',
      'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}',
      'imagePullPolicy': 'Always',
+     'restartPolicy': 'Always',
      'command': ['/bin/sh', '-c'],
      'args': [
          'set -ex\nchmod 755'
@@ -165,9 +170,9 @@ def add_tcpxo_daemon_container(job_manifest):
      ],
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
    }
-   job_manifest['spec']['template']['spec']['containers'].append(
-       tcpxo_daemon_container
-   )
+   spec = job_manifest['spec']['template']['spec']
+   spec.setdefault('initContainers', [])
+   spec['initContainers'].append(tcpxo_daemon_container)
 
 
  def update_gpu_containers(job_manifest):
@@ -188,3 +193,6 @@ def update_gpu_containers(job_manifest):
      container['volumeMounts'].append(
          {'name': 'libraries', 'mountPath': '/usr/local/nvidia'}
      )
+     container['volumeMounts'].append(
+         {'name': 'dshm', 'mountPath': '/dev/shm'}
+     )
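Both decorators now inject the network daemon as an initContainer with restartPolicy: Always, the Kubernetes native sidecar pattern (available by default since 1.29): the daemon starts before the workload containers, keeps running alongside them, and is shut down when they finish, so it no longer blocks Job completion the way a regular entry in containers did. The resulting pod-spec shape, sketched as a plain dict:

# Native-sidecar shape emitted by the decorators: an init container with
# restartPolicy Always runs concurrently with the main containers and
# exits with the pod, rather than keeping the Job pod alive afterwards.
pod_spec = {
    'initContainers': [{
        'name': 'tcpxo-daemon',
        'restartPolicy': 'Always',  # sidecar marker (Kubernetes >= 1.29)
    }],
    'containers': [{'name': 'train'}],
}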