xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. xpk/commands/batch.py +19 -13
  2. xpk/commands/cluster.py +240 -71
  3. xpk/commands/cluster_gcluster.py +22 -5
  4. xpk/commands/common.py +33 -1
  5. xpk/commands/info.py +2 -4
  6. xpk/commands/job.py +7 -8
  7. xpk/commands/kjob_common.py +30 -18
  8. xpk/commands/run.py +17 -12
  9. xpk/commands/shell.py +3 -4
  10. xpk/commands/storage.py +75 -19
  11. xpk/commands/workload.py +161 -324
  12. xpk/core/blueprint/blueprint_definitions.py +2 -0
  13. xpk/core/blueprint/blueprint_generator.py +335 -45
  14. xpk/core/capacity.py +1 -0
  15. xpk/core/cluster.py +193 -12
  16. xpk/core/config.py +3 -1
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +9 -21
  19. xpk/core/filestore.py +5 -1
  20. xpk/core/gcsfuse.py +27 -6
  21. xpk/core/kjob.py +66 -20
  22. xpk/core/kueue.py +30 -0
  23. xpk/core/mtc.py +195 -0
  24. xpk/core/nap.py +4 -0
  25. xpk/core/network.py +34 -22
  26. xpk/core/nodepool.py +28 -26
  27. xpk/core/pathways.py +165 -210
  28. xpk/core/resources.py +21 -0
  29. xpk/core/scheduling.py +36 -0
  30. xpk/core/storage.py +66 -12
  31. xpk/core/system_characteristics.py +9 -0
  32. xpk/core/workload.py +28 -83
  33. xpk/core/workload_decorators/rdma_decorator.py +11 -15
  34. xpk/core/workload_decorators/storage_decorator.py +8 -3
  35. xpk/core/workload_decorators/tcpx_decorator.py +179 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
  37. xpk/parser/cluster.py +574 -381
  38. xpk/parser/storage.py +25 -5
  39. xpk/parser/workload.py +59 -31
  40. xpk/utils/kubectl.py +4 -1
  41. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
  42. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
  43. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
  44. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
  45. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
  46. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/commands/kjob_common.py CHANGED
@@ -14,31 +14,43 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
- from ..core.kjob import get_a3mega_pod_template_annotations, get_a3ultra_pod_template_annotations
18
- from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
17
+ from ..core.capacity import (
18
+ B200_DEVICE_TYPE,
19
+ H100_MEGA_DEVICE_TYPE,
20
+ H200_DEVICE_TYPE,
21
+ )
19
22
  from ..core.cluster import get_gpu_type_from_cluster
23
+ from ..core.kjob import (
24
+ get_a3mega_pod_template_annotations,
25
+ get_a3ultra_pod_template_annotations,
26
+ get_a4_pod_template_annotations,
27
+ Kueue_TAS_annotation,
28
+ )
29
+ from .common import is_TAS_possible
20
30
 
21
31
 
22
- def add_tcpxo_annotations(args, cmd: str) -> str:
23
- tcpxo, interfaces, eth0 = get_a3mega_pod_template_annotations(args)
24
- cmd += f" --pod-template-annotation {tcpxo} \\\n"
25
- cmd += f" --pod-template-annotation {eth0} \\\n"
26
- cmd += f" --pod-template-annotation {interfaces} "
27
- return cmd
32
+ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
33
+ gpu_type = get_gpu_type_from_cluster(args)
28
34
 
35
+ if gpu_type == H100_MEGA_DEVICE_TYPE:
36
+ annotations = get_a3mega_pod_template_annotations(args)
37
+ elif gpu_type == H200_DEVICE_TYPE:
38
+ annotations = get_a3ultra_pod_template_annotations(args)
39
+ elif gpu_type == B200_DEVICE_TYPE:
40
+ annotations = get_a4_pod_template_annotations(args)
41
+ else:
42
+ annotations = []
43
+
44
+ flags = [
45
+ f" --pod-template-annotation {annotation} " for annotation in annotations
46
+ ]
47
+ cmd += "\\\n".join(flags)
29
48
 
30
- def add_rdma_annotations(args, cmd) -> str:
31
- eth0, interfaces = get_a3ultra_pod_template_annotations(args)
32
- cmd += f" --pod-template-annotation {eth0} \\\n"
33
- cmd += f" --pod-template-annotation {interfaces} \\\n"
34
49
  return cmd
35
50
 
36
51
 
37
- def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
38
- gpu_type = get_gpu_type_from_cluster(args)
52
+ def add_TAS_annotations_to_command(args, cmd: str) -> str:
53
+ if is_TAS_possible(args):
54
+ cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
39
55
 
40
- if gpu_type == H100_MEGA_DEVICE_TYPE:
41
- return add_tcpxo_annotations(args, cmd)
42
- if gpu_type == H200_DEVICE_TYPE:
43
- return add_rdma_annotations(args, cmd)
44
56
  return cmd
xpk/commands/run.py CHANGED
@@ -16,15 +16,22 @@ limitations under the License.
16
16
 
17
17
  from argparse import Namespace
18
18
 
19
- from ..core.cluster import create_xpk_k8s_service_account
19
+ from ..core.cluster import (
20
+ create_xpk_k8s_service_account,
21
+ get_cluster_credentials,
22
+ )
20
23
  from ..core.commands import run_command_with_full_controls
21
24
  from ..core.gcloud_context import add_zone_and_project
25
+ from ..core.kjob import (
26
+ AppProfileDefaults,
27
+ JobTemplateDefaults,
28
+ get_storage_annotations,
29
+ prepare_kjob,
30
+ )
22
31
  from ..core.kueue import LOCAL_QUEUE_NAME
23
32
  from ..utils.console import xpk_exit, xpk_print
24
- from .common import set_cluster_command
25
- from ..core.kjob import JobTemplateDefaults, AppProfileDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
26
- from .kjob_common import add_gpu_networking_annotations_to_command
27
33
  from .kind import set_local_cluster_command
34
+ from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
28
35
 
29
36
 
30
37
  def run(args: Namespace) -> None:
@@ -37,12 +44,11 @@ def run(args: Namespace) -> None:
37
44
  """
38
45
  if not args.kind_cluster:
39
46
  add_zone_and_project(args)
40
- set_cluster_command_code = set_cluster_command(args)
47
+ get_cluster_credentials(args)
41
48
  else:
42
49
  set_cluster_command_code = set_local_cluster_command(args)
43
-
44
- if set_cluster_command_code != 0:
45
- xpk_exit(set_cluster_command_code)
50
+ if set_cluster_command_code != 0:
51
+ xpk_exit(set_cluster_command_code)
46
52
 
47
53
  err_code = prepare_kjob(args)
48
54
  if err_code > 0:
@@ -57,16 +63,15 @@ def submit_job(args: Namespace) -> None:
57
63
  'kubectl kjob create slurm --profile'
58
64
  f' {AppProfileDefaults.NAME.value} '
59
65
  f' --localqueue {LOCAL_QUEUE_NAME} '
60
- f" --pod-template-annotation '{Kueue_TAS_annotation}'"
61
66
  f' --stream-container {JobTemplateDefaults.CONTAINER_NAME.value}'
62
67
  f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
63
68
  ' --wait --rm --first-node-ip'
64
69
  )
65
70
  cmd = add_gpu_networking_annotations_to_command(args, cmd)
71
+ cmd = add_TAS_annotations_to_command(args, cmd)
66
72
 
67
- gcsfuse_annotation = get_gcsfuse_annotation(args)
68
- if gcsfuse_annotation is not None:
69
- cmd += f' --pod-template-annotation {gcsfuse_annotation}'
73
+ for annotation in get_storage_annotations(args):
74
+ cmd += f' --pod-template-annotation {annotation}'
70
75
 
71
76
  if args.timeout:
72
77
  cmd += f' --wait-timeout {args.timeout}s'
xpk/commands/shell.py CHANGED
@@ -20,7 +20,7 @@ from ..core.kjob import (
20
20
  AppProfileDefaults,
21
21
  prepare_kjob,
22
22
  get_pod_template_interactive_command,
23
- get_gcsfuse_annotation,
23
+ get_storage_annotations,
24
24
  )
25
25
 
26
26
  exit_instructions = 'To exit the shell input "exit".'
@@ -89,9 +89,8 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
89
89
  f' {AppProfileDefaults.NAME.value} --pod-running-timeout 180s'
90
90
  )
91
91
 
92
- gcsfuse_annotation = get_gcsfuse_annotation(args)
93
- if gcsfuse_annotation is not None:
94
- cmd += f' --pod-template-annotation {gcsfuse_annotation}'
92
+ for annotation in get_storage_annotations(args):
93
+ cmd += f' --pod-template-annotation {annotation}'
95
94
 
96
95
  return run_command_with_full_controls(
97
96
  command=cmd,
xpk/commands/storage.py CHANGED
@@ -27,6 +27,8 @@ from ..core.cluster import (
27
27
  add_zone_and_project,
28
28
  get_cluster_network,
29
29
  setup_k8s_env,
30
+ update_cluster_with_parallelstore_driver_if_necessary,
31
+ update_cluster_with_pd_driver_if_necessary,
30
32
  update_cluster_with_gcpfilestore_driver_if_necessary,
31
33
  update_cluster_with_gcsfuse_driver_if_necessary,
32
34
  update_cluster_with_workload_identity_if_necessary,
@@ -41,6 +43,8 @@ from ..core.kjob import (
41
43
  from ..core.storage import (
42
44
  GCP_FILESTORE_TYPE,
43
45
  GCS_FUSE_TYPE,
46
+ GCE_PD_TYPE,
47
+ PARALLELSTORE_TYPE,
44
48
  STORAGE_CRD_PLURAL,
45
49
  XPK_API_GROUP_NAME,
46
50
  XPK_API_GROUP_VERSION,
@@ -78,7 +82,10 @@ def storage_create(args: Namespace) -> None:
78
82
  manifest = list(yaml.safe_load_all(f))
79
83
  else:
80
84
  manifest = filestore_client.manifest(
81
- args.name, args.vol, args.access_mode, filestore_network
85
+ args.name,
86
+ args.vol,
87
+ args.access_mode,
88
+ filestore_network,
82
89
  )
83
90
 
84
91
  k8s_api_client = setup_k8s_env(args)
@@ -86,9 +93,10 @@ def storage_create(args: Namespace) -> None:
86
93
  create_volume_bundle_instance(
87
94
  k8s_api_client, args.name, manifest, args.readonly, args.mount_point
88
95
  )
89
- return_code = update_cluster_with_workload_identity_if_necessary(args)
90
- if return_code > 0:
91
- xpk_exit(return_code)
96
+ # Not required for Filestore. Will be uncommented when adding GCSFuse create
97
+ # return_code = update_cluster_with_workload_identity_if_necessary(args)
98
+ # if return_code > 0:
99
+ # xpk_exit(return_code)
92
100
  return_code = update_cluster_with_gcpfilestore_driver_if_necessary(args)
93
101
  if return_code > 0:
94
102
  xpk_exit(return_code)
@@ -131,6 +139,7 @@ def storage_delete(args: Namespace) -> None:
131
139
 
132
140
  def storage_attach(args: Namespace) -> None:
133
141
  add_zone_and_project(args)
142
+ manifest = [{}]
134
143
  if args.type == GCP_FILESTORE_TYPE:
135
144
  if args.instance is None:
136
145
  args.instance = args.name
@@ -148,10 +157,13 @@ def storage_attach(args: Namespace) -> None:
148
157
  else:
149
158
  filestore_network = get_cluster_network(args)
150
159
  manifest = filestore_client.manifest(
151
- args.name, args.vol, args.access_mode, filestore_network
160
+ args.name,
161
+ args.vol,
162
+ args.access_mode,
163
+ filestore_network,
152
164
  )
153
165
 
154
- else: # args.type == GCS_FUSE_TYPE:
166
+ elif args.type == GCS_FUSE_TYPE:
155
167
  if args.manifest is None and args.size is None:
156
168
  xpk_print("--size is required when attaching gcsfuse storage.")
157
169
  xpk_exit(1)
@@ -164,30 +176,65 @@ def storage_attach(args: Namespace) -> None:
164
176
  manifest = list(yaml.safe_load_all(f))
165
177
  else:
166
178
  manifest = gcsfuse.manifest(
167
- name=args.name, bucket=args.bucket, size=args.size
179
+ args.name,
180
+ args.bucket,
181
+ args.size,
182
+ args.mount_options,
183
+ args.prefetch_metadata,
168
184
  )
169
185
 
186
+ elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE]:
187
+ if args.manifest is None:
188
+ xpk_print(
189
+ "Parallelstore and PersistentDisk are currently supported only with"
190
+ " --manifest"
191
+ )
192
+ xpk_exit(1)
193
+
194
+ with open(args.manifest, "r", encoding="utf-8") as f:
195
+ manifest = list(yaml.safe_load_all(f))
196
+
197
+ else:
198
+ xpk_print(f"Storage type {args.type} is not supported.")
199
+ xpk_exit(1)
200
+
170
201
  k8s_api_client = setup_k8s_env(args)
171
202
  create_storage_crds(k8s_api_client, args, manifest)
172
203
  create_volume_bundle_instance(
173
204
  k8s_api_client, args.name, manifest, args.readonly, args.mount_point
174
205
  )
175
- return_code = update_cluster_with_workload_identity_if_necessary(args)
176
- if return_code > 0:
177
- xpk_exit(return_code)
178
-
179
- # args.type can have only two values after parsing
180
- return_code = (
181
- update_cluster_with_gcsfuse_driver_if_necessary(args)
182
- if args.type == GCS_FUSE_TYPE
183
- else update_cluster_with_gcpfilestore_driver_if_necessary(args)
184
- )
185
- if return_code > 0:
186
- xpk_exit(return_code)
206
+
207
+ enable_csi_drivers_if_necessary(args)
187
208
 
188
209
  apply_kubectl_manifest(k8s_api_client, manifest)
189
210
 
190
211
 
212
+ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
213
+ if args.type == GCS_FUSE_TYPE:
214
+ return_code = update_cluster_with_workload_identity_if_necessary(args)
215
+ if return_code > 0:
216
+ xpk_exit(return_code)
217
+
218
+ return_code = update_cluster_with_gcsfuse_driver_if_necessary(args)
219
+ if return_code > 0:
220
+ xpk_exit(return_code)
221
+
222
+ if args.type == GCP_FILESTORE_TYPE:
223
+ return_code = update_cluster_with_gcpfilestore_driver_if_necessary(args)
224
+ if return_code > 0:
225
+ xpk_exit(return_code)
226
+
227
+ if args.type == PARALLELSTORE_TYPE:
228
+ return_code = update_cluster_with_parallelstore_driver_if_necessary(args)
229
+ if return_code > 0:
230
+ xpk_exit(return_code)
231
+
232
+ if args.type == GCE_PD_TYPE:
233
+ return_code = update_cluster_with_pd_driver_if_necessary(args)
234
+ if return_code > 0:
235
+ xpk_exit(return_code)
236
+
237
+
191
238
  def storage_list(args: Namespace) -> None:
192
239
  k8s_api_client = setup_k8s_env(args)
193
240
  storages = list_storages(k8s_api_client)
@@ -278,3 +325,12 @@ def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
278
325
  storage.name,
279
326
  "Storage",
280
327
  )
328
+
329
+ # remove kubernetes.io/pvc-protection
330
+ delete_resource(
331
+ lambda name: core_api.patch_namespaced_persistent_volume_claim(
332
+ name, "default", {"metadata": {"finalizers": None}}
333
+ ),
334
+ storage.pvc,
335
+ "Persistent Volume Claim finalizers",
336
+ )