xpk-0.14.4-py3-none-any.whl → xpk-0.16.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. integration/README.md +19 -0
  2. integration/gcluster_a3mega_test.py +11 -0
  3. integration/gcluster_a3ultra_test.py +11 -0
  4. integration/gcluster_a4_test.py +11 -0
  5. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  6. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  7. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  8. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  9. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  10. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  11. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  12. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  13. xpk/blueprints/a4/storage_crd.yaml +52 -0
  14. xpk/commands/cluster.py +89 -32
  15. xpk/commands/cluster_gcluster.py +25 -5
  16. xpk/commands/cluster_gcluster_test.py +16 -3
  17. xpk/commands/cluster_test.py +353 -7
  18. xpk/commands/config.py +3 -5
  19. xpk/commands/inspector.py +5 -3
  20. xpk/commands/kind.py +3 -1
  21. xpk/commands/managed_ml_diagnostics.py +249 -0
  22. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  23. xpk/commands/storage.py +8 -10
  24. xpk/commands/workload.py +143 -142
  25. xpk/commands/workload_test.py +160 -118
  26. xpk/core/blueprint/blueprint_generator.py +73 -33
  27. xpk/core/blueprint/blueprint_test.py +9 -0
  28. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  29. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  30. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  31. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  32. xpk/core/capacity.py +48 -8
  33. xpk/core/capacity_test.py +32 -1
  34. xpk/core/cluster.py +55 -104
  35. xpk/core/cluster_test.py +170 -0
  36. xpk/core/commands.py +4 -10
  37. xpk/core/config.py +88 -7
  38. xpk/core/config_test.py +67 -11
  39. xpk/core/docker_container.py +3 -1
  40. xpk/core/docker_image.py +10 -6
  41. xpk/core/docker_resources.py +1 -10
  42. xpk/core/gcloud_context.py +18 -12
  43. xpk/core/gcloud_context_test.py +111 -1
  44. xpk/core/kjob.py +17 -19
  45. xpk/core/kueue_manager.py +205 -51
  46. xpk/core/kueue_manager_test.py +158 -4
  47. xpk/core/nap.py +13 -14
  48. xpk/core/nodepool.py +37 -43
  49. xpk/core/nodepool_test.py +42 -19
  50. xpk/core/pathways.py +23 -0
  51. xpk/core/pathways_test.py +57 -0
  52. xpk/core/resources.py +84 -27
  53. xpk/core/scheduling.py +144 -133
  54. xpk/core/scheduling_test.py +298 -6
  55. xpk/core/system_characteristics.py +256 -19
  56. xpk/core/system_characteristics_test.py +128 -5
  57. xpk/core/telemetry.py +263 -0
  58. xpk/core/telemetry_test.py +211 -0
  59. xpk/core/vertex.py +4 -3
  60. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  61. xpk/main.py +33 -13
  62. xpk/parser/cluster.py +40 -67
  63. xpk/parser/cluster_test.py +83 -3
  64. xpk/parser/common.py +84 -0
  65. xpk/parser/storage.py +10 -0
  66. xpk/parser/storage_test.py +47 -0
  67. xpk/parser/workload.py +14 -29
  68. xpk/parser/workload_test.py +3 -49
  69. xpk/telemetry_uploader.py +29 -0
  70. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  71. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  72. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  73. xpk/utils/console.py +41 -10
  74. xpk/utils/console_test.py +106 -0
  75. xpk/utils/feature_flags.py +10 -1
  76. xpk/utils/file.py +4 -1
  77. xpk/utils/topology.py +4 -0
  78. xpk/utils/user_agent.py +35 -0
  79. xpk/utils/user_agent_test.py +44 -0
  80. xpk/utils/user_input.py +48 -0
  81. xpk/utils/user_input_test.py +92 -0
  82. xpk/utils/validation.py +2 -13
  83. xpk/utils/versions.py +31 -0
  84. xpk-0.16.0.dist-info/METADATA +127 -0
  85. xpk-0.16.0.dist-info/RECORD +168 -0
  86. xpk-0.14.4.dist-info/METADATA +0 -1645
  87. xpk-0.14.4.dist-info/RECORD +0 -139
  88. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  89. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  90. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  91. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py CHANGED
@@ -16,9 +16,7 @@ limitations under the License.
 
 from ..core.blueprint.blueprint_generator import (
     a3high_device_type,
-    a3mega_device_type,
-    a3ultra_device_type,
-    a4_device_type,
+    a4x_device_types,
 )
 from ..core.cluster import (
     XPK_SA,
@@ -27,15 +25,14 @@ from ..core.cluster import (
     setup_k8s_env,
 )
 from ..core.commands import run_command_with_updates, run_commands
-from ..core.kueue_manager import KueueManager, has_sub_slicing_enabled
 from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
 from ..core.docker_container import (
     get_main_container_docker_image,
     get_user_workload_container,
 )
+from ..core.kueue_manager import LOCAL_QUEUE_NAME
 from ..core.docker_resources import get_volumes, parse_env_config
 from ..core.gcloud_context import add_zone_and_project
-from ..core.kueue_manager import LOCAL_QUEUE_NAME
 from ..core.monitoring import get_gke_outlier_dashboard
 from ..core.nap import (
     get_autoprovisioning_node_selector_args,
@@ -53,17 +50,20 @@ from ..core.pathways import (
     get_user_workload_for_pathways,
     try_to_delete_pathwaysjob_first,
 )
-from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics, SystemCharacteristics
-from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
+from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics_from_config_map
+from ..core.resources import ConfigMapType, get_cluster_configmap
+from ..core.nodepool import ensure_resource_policy_exists
 from ..core.scheduling import (
+    WorkloadScheduling,
     check_if_workload_can_schedule,
-    create_accelerator_label,
-    create_machine_label,
     create_tpu_machine_type,
     create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
     create_sub_slicing_annotations,
+    create_placement_policy_label,
+    get_placement_policy_name,
+    is_placement_policy_supported,
 )
 from ..core.storage import (
     GCE_PD_TYPE,
@@ -78,8 +78,9 @@ from ..core.storage import (
 )
 from ..core.system_characteristics import (
     AcceleratorType,
+    create_accelerator_label,
+    create_machine_label,
     get_system_characteristics,
-    compute_vms_per_slice,
 )
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
@@ -90,20 +91,16 @@ from ..core.workload import (
     get_cluster_location,
 )
 from ..core.workload_decorators import (
-    rdma_decorator,
     storage_decorator,
-    tcpx_decorator,
-    tcpxo_decorator,
 )
-from ..utils.console import get_user_input, xpk_exit, xpk_print
-from packaging.version import Version
+from ..utils.console import ask_for_user_consent, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from ..utils.execution_context import is_dry_run
 from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
-from .common import is_TAS_possible, validate_sub_slicing_system
-from ..utils.topology import is_topology_contained
-from ..utils.feature_flags import FeatureFlags
+from .common import is_TAS_possible
+from jinja2 import Environment, FileSystemLoader
+from ..utils.templates import get_templates_absolute_path
 
 WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
@@ -144,6 +141,7 @@ spec:
       nodeSelector:
         {accelerator_label}
         {machine_label}
+        {placement_policy_label}
         {autoprovisioning_args}
       priorityClassName: {args.priority}
       hostNetwork: true
@@ -193,6 +191,8 @@ spec:
       {gpu_scheduler}
       priorityClassName: {args.priority}
       restartPolicy: Never
+      nodeSelector:
+        {placement_policy_label}
       imagePullSecrets:
       - name: {args.docker_image_pull_secret}
       hostNetwork: true
@@ -238,6 +238,8 @@ spec:
     spec:
       priorityClassName: {args.priority}
      restartPolicy: Never
+      nodeSelector:
+        {placement_policy_label}
       imagePullSecrets:
       - name: {args.docker_image_pull_secret}
       dnsPolicy: ClusterFirstWithHostNet
@@ -273,6 +275,7 @@ PW_WORKLOAD_CREATE_YAML = """
   terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
   priorityClassName: {args.priority}
   nodeSelector:
+    {placement_policy_label}
     {autoprovisioning_args}
   pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
   controller:
@@ -284,8 +287,7 @@ PW_WORKLOAD_CREATE_YAML = """
 {user_workload}
 """
 
-SUB_SLICING_TOPOLOGIES = ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
-SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
+ARM_GPU_WORKLOAD_CREATE_JINJA_FILE = 'arm_gpu_workload_crate.yaml.j2'
 
 
 def workload_create_pathways(args) -> None:
@@ -337,26 +339,35 @@ def workload_create(args) -> None:
     )
     xpk_exit(1)
 
-  system, return_code = get_system_characteristics(args)
-  if return_code > 0 or system is None:
+  workload_system, return_code = get_system_characteristics(args)
+  if return_code > 0 or workload_system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
-  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
-    _validate_sub_slicing_availability()
-    _validate_sub_slicing_topology(system, args.sub_slicing_topology)
-
-  if not check_if_workload_can_schedule(args, system):
+  resources_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
+  cluster_system = get_cluster_system_characteristics_from_config_map(
+      resources_config_map
+  )
+  workload_scheduling = check_if_workload_can_schedule(
+      args=args,
+      workload_system=workload_system,
+      cluster_system=cluster_system,
+      resources_config_map=resources_config_map,
+  )
+  if workload_scheduling == WorkloadScheduling.UNAVAILABLE:
     xpk_exit(1)
 
   xpk_print('Starting workload create', flush=True)
 
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
   cluster_xpk_version = None
   if cluster_config_map is None:
     xpk_print(
-        f'Warning: Unable to find ConfigMap: {metadata_configmap_name} for the'
+        'Warning: Unable to find ConfigMap for the'
         ' cluster. We recommend to upgrade your cluster by running `xpk'
         ' cluster create`.'
     )
@@ -388,7 +399,7 @@ def workload_create(args) -> None:
 
   autoprovisioning_args = ''
   autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
-      args, system
+      args, workload_system
   )
   if return_code != 0:
     xpk_exit(return_code)
@@ -481,23 +492,35 @@ def workload_create(args) -> None:
       podFailurePolicy:
         rules:
         - action: FailJob
-          onPodConditions: []
          onExitCodes:
-            containerName: {get_main_container_docker_image(args, system)}
+            containerName: {get_main_container_docker_image(args, workload_system)}
            operator: NotIn
            values: [{restart_on_exit_codes}]"""
 
+  if is_placement_policy_supported(workload_system):
+    ensure_resource_policy_exists(
+        resource_policy_name=get_placement_policy_name(workload_system),
+        project=args.project,
+        zone=args.zone,
+        topology=workload_system.topology,
+    )
+
+  placement_policy_label = (
+      create_placement_policy_label(workload_system)
+      if is_placement_policy_supported(workload_system)
+      else ''
+  )
+
   # Create the workload file based on accelerator type or workload type.
-  if system.accelerator_type == AcceleratorType.GPU:
+  if workload_system.accelerator_type == AcceleratorType.GPU:
     container, debugging_dashboard_id = get_user_workload_container(
-        args, system
+        args, workload_system
     )
     gpu_scheduler, return_code = get_gpu_scheduler(
-        args, system, autoprovisioning_args
+        args, workload_system, autoprovisioning_args
    )
     if return_code != 0:
       xpk_exit(return_code)
-    system_characteristics = get_cluster_system_characteristics(args)
     capacity_type = get_cluster_capacity_type(args)
 
     annotations = (
@@ -505,30 +528,55 @@ def workload_create(args) -> None:
             'kueue.x-k8s.io/podset-preferred-topology:'
             ' "cloud.google.com/gce-topology-host"'
         )
-        if is_TAS_possible(system_characteristics, capacity_type)
+        if is_TAS_possible(cluster_system, capacity_type)
         else ''
     )
 
     if (
-        system.device_type in cluster_gcluster.supported_device_types
-        or system.device_type == a3high_device_type
+        workload_system.device_type in cluster_gcluster.supported_device_types
+        or workload_system.device_type == a3high_device_type
+        or workload_system.device_type in a4x_device_types
     ):
-      yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
-          args=args,
-          container=container,
-          service_account=XPK_SA,
-          failure_policy_rules=failure_policy_rules,
-          pod_failure_policy=pod_failure_policy,
-          annotations=annotations,
-      )
+      if workload_system.device_type in a4x_device_types:
+        template_env = Environment(
+            loader=FileSystemLoader(searchpath=get_templates_absolute_path())
+        )
+        workload_create_yaml = template_env.get_template(
+            ARM_GPU_WORKLOAD_CREATE_JINJA_FILE
+        )
+        yml_string = workload_create_yaml.render(
+            workload=args.workload,
+            num_nodes=args.num_nodes,
+            ttl_seconds_after_finished=args.ttl_seconds_after_finished,
+            max_restarts=args.max_restarts,
+            priority=args.priority,
+            termination_grace_period_seconds=args.termination_grace_period_seconds,
+            docker_image_pull_secret=args.docker_image_pull_secret,
+            container=container,
+            service_account=XPK_SA,
+            failure_policy_rules=failure_policy_rules,
+            pod_failure_policy=pod_failure_policy,
+            annotations=annotations,
+            placement_policy_label=placement_policy_label,
+        )
+      else:
+        yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
+            args=args,
+            container=container,
+            service_account=XPK_SA,
+            failure_policy_rules=failure_policy_rules,
+            pod_failure_policy=pod_failure_policy,
+            annotations=annotations,
+            placement_policy_label=placement_policy_label,
+        )
 
     sub_networks = get_cluster_subnetworks()
-    if args.device_type == a3high_device_type:
-      yml_string = tcpx_decorator.decorate_jobset(yml_string)
-    elif args.device_type == a3mega_device_type:
-      yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
-    elif args.device_type in [a3ultra_device_type, a4_device_type]:
-      yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
+
+    if workload_system.gpu_config and callable(
+        workload_system.gpu_config.jobset_decorator_fn
+    ):
+      decorator_fn = workload_system.gpu_config.jobset_decorator_fn
+      yml_string = decorator_fn(yml_string, sub_networks)
 
     if all_storages:
       yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
@@ -537,61 +585,64 @@ def workload_create(args) -> None:
         args=args,
         container=container,
         gpu_scheduler=gpu_scheduler,
-        volumes=get_volumes(args, system),
+        volumes=get_volumes(args, workload_system),
         storage_annotations=('\n' + (' ' * 12)).join(
             get_storage_annotations(all_storages)
         ),
         service_account=service_account,
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
+        placement_policy_label=placement_policy_label,
     )
 
   elif args.use_pathways and ensure_pathways_workload_prerequisites(
-      args, system
+      args, workload_system
   ):
     yml_string = PW_WORKLOAD_CREATE_YAML.format(
         args=args,
-        system=system,
-        topology=create_tpu_topology(system.accelerator_type, system),
-        machine_type=create_tpu_machine_type(system.accelerator_type, system),
+        topology=create_tpu_topology(workload_system),
+        machine_type=create_tpu_machine_type(workload_system),
         custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
         custom_pathways_server=append_custom_pathways_server(args),
         custom_pathways_worker=append_custom_pathways_worker(args),
         colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
-        user_workload=get_user_workload_for_pathways(args, system),
+        user_workload=get_user_workload_for_pathways(args, workload_system),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
+        placement_policy_label=placement_policy_label,
     )
   else:
+    use_sub_slicing = (
+        workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
+    )
+    if use_sub_slicing:
+      xpk_print('Workload will be scheduled using the Sub-slicing feature.')
+
     container, debugging_dashboard_id = get_user_workload_container(
-        args, system
+        args, workload_system
     )
     yml_string = WORKLOAD_CREATE_YAML.format(
         args=args,
         container=container,
-        vms_per_slice=(
-            compute_vms_per_slice(args.sub_slicing_topology)
-            if system.accelerator_type == AcceleratorType.TPU
-            and FeatureFlags.SUB_SLICING_ENABLED
-            and args.sub_slicing_topology is not None
-            else system.vms_per_slice
-        ),
-        affinity=get_cpu_affinity(system.accelerator_type),
-        accelerator_label=create_accelerator_label(
-            system.accelerator_type, system
-        ),
+        vms_per_slice=workload_system.vms_per_slice,
+        affinity=get_cpu_affinity(workload_system.accelerator_type),
+        accelerator_label=create_accelerator_label(workload_system),
         sub_slicing_annotations=(
-            ''
-            if not FeatureFlags.SUB_SLICING_ENABLED
-            or args.sub_slicing_topology is None
-            else ('\n' + (' ' * 16)).join(
-                create_sub_slicing_annotations(args.sub_slicing_topology)
+            ('\n' + (' ' * 16)).join(
+                create_sub_slicing_annotations(workload_system.topology)
             )
+            if use_sub_slicing
+            else ''
+        ),
+        placement_policy_label=placement_policy_label,
+        machine_label=(
+            create_machine_label(cluster_system)
+            if use_sub_slicing and cluster_system
+            else create_machine_label(workload_system)
         ),
-        machine_label=create_machine_label(system.accelerator_type, system),
        local_queue_name=LOCAL_QUEUE_NAME,
        autoprovisioning_args=autoprovisioning_args,
-        volumes=get_volumes(args, system),
+        volumes=get_volumes(args, workload_system),
        storage_annotations=('\n' + (' ' * 16)).join(
            get_storage_annotations(all_storages)
        ),
@@ -599,10 +650,18 @@ def workload_create(args) -> None:
         tpu_toleration="""
         - operator: "Exists"
           key: google.com/tpu
-        """ if system.accelerator_type == AcceleratorType.TPU else '',
+        """ if workload_system.accelerator_type == AcceleratorType.TPU else '',
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
     )
+  if args.output_manifest_file:
+    with open(args.output_manifest_file, 'w', encoding='utf-8') as f:
+      f.write(yml_string)
+    xpk_print(
+        f'Workload {args.workload} manifest written to'
+        f' {args.output_manifest_file}'
+    )
+
   tmp = write_tmp_file(yml_string)
   command = f'kubectl apply -f {str(tmp)}'
   return_code = run_command_with_updates(command, 'Creating Workload')
@@ -616,7 +675,7 @@ def workload_create(args) -> None:
 
   # Get GKE outlier dashboard for TPU
   outlier_dashboard_id = None
-  if system.accelerator_type == AcceleratorType.TPU:
+  if workload_system.accelerator_type == AcceleratorType.TPU:
     outlier_dashboard_id = get_gke_outlier_dashboard(args)
 
   # Outlier and debugging dashboards
@@ -683,63 +742,6 @@ def workload_create(args) -> None:
   xpk_exit(0)
 
 
-def _validate_sub_slicing_availability():
-  return_code, sub_slicing_enabled = has_sub_slicing_enabled()
-  if return_code != 0:
-    xpk_print(
-        'Error: Unable to validate sub-slicing support on a given cluster.'
-    )
-    xpk_exit(1)
-
-  if not sub_slicing_enabled:
-    xpk_print(
-        'Error: Cluster has not been not set up for Sub-slicing. Please enable'
-        ' --sub-slicing in "cluster create" command first.'
-    )
-    xpk_exit(1)
-
-  kueue_manager = KueueManager()
-  return_code, current_version = kueue_manager.get_installed_kueue_version()
-  if return_code != 0:
-    xpk_print(
-        'Error: Unable to validate sub-slicing support on a given cluster.'
-    )
-    xpk_exit(1)
-
-  if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
-    xpk_print(
-        "Error: Current Kueue version ({current_version}) doesn't support"
-        ' Sub-slicing. The minimal required version is'
-        ' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
-        ' manually, or run "cluster create --sub-slicing" on the existing'
-        ' cluster.'
-    )
-    xpk_exit(1)
-
-
-def _validate_sub_slicing_topology(
-    system_characteristics: SystemCharacteristics, sub_slicing_topology: str
-) -> None:
-  if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES:
-    xpk_print(
-        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is'
-        f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.'
-    )
-    xpk_exit(1)
-
-  if not is_topology_contained(
-      contained=sub_slicing_topology, container=system_characteristics.topology
-  ):
-    xpk_print(
-        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too'
-        ' large. The shape cannot be bigger than'
-        f' {system_characteristics.topology}.'
-    )
-    xpk_exit(1)
-
-  validate_sub_slicing_system(system_characteristics)
-
-
 def get_restart_exit_codes(args) -> list:
   exit_codes = [42]
   exit_codes.extend(range(127, 256, 1))
@@ -785,11 +787,10 @@ def workload_delete(args) -> None:
     xpk_exit(return_code)
   # Skip the header
   workloads = [x.split(' ')[0] for x in return_value.splitlines()][1:]
-  if workloads and not args.force:
-    will_delete = get_user_input(
+  if workloads:
+    will_delete = ask_for_user_consent(
        f'Planning to delete {len(workloads)} workloads in the cluster'
-        f' {args.cluster} including {workloads}. \nDo you wish to delete: y'
-        ' (yes) / n (no):\n'
+        f' {args.cluster} including {workloads}. \nDo you wish to delete?'
    )
  else:
    workloads = [args.workload]
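
Note on the new A4X branch above: it renders the JobSet manifest with Jinja2 (Environment, FileSystemLoader, get_template, render) instead of Python str.format. Below is a minimal, self-contained sketch of that rendering flow; the inlined template body and all values are illustrative stand-ins, not the packaged arm_gpu_workload_crate.yaml.j2.

from jinja2 import Environment

# Stand-in for the packaged template; the real file lives under
# xpk/templates/ and is loaded with FileSystemLoader as in the diff.
TEMPLATE = """apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: {{ workload }}
spec:
  replicatedJobs:
  - replicas: {{ num_nodes }}
    template:
      spec:
        backoffLimit: {{ max_restarts }}
"""

# from_string compiles the inline template; render substitutes the keyword
# arguments, mirroring workload_create_yaml.render(...) in the diff.
rendered = Environment().from_string(TEMPLATE).render(
    workload='demo-workload',  # hypothetical values
    num_nodes=2,
    max_restarts=0,
)
print(rendered)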
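Similarly, the removed per-device-type if/elif chain (tcpx_decorator / tcpxo_decorator / rdma_decorator) is replaced by a callable carried on the system characteristics, workload_system.gpu_config.jobset_decorator_fn. Here is a sketch of that dispatch pattern under assumed shapes; the GpuConfig fields and device-type keys are hypothetical, not xpk's actual definitions.

from dataclasses import dataclass
from typing import Callable, List, Optional

DecoratorFn = Callable[[str, List[str]], str]

def tcpx_decorate(yml: str, sub_networks: List[str]) -> str:
  # Stand-in for tcpx_decorator.decorate_jobset.
  return yml + '\n# tcpx sidecars injected'

def rdma_decorate(yml: str, sub_networks: List[str]) -> str:
  # Stand-in for rdma_decorator.decorate_jobset.
  return yml + '\n# rdma annotations injected'

@dataclass
class GpuConfig:
  jobset_decorator_fn: Optional[DecoratorFn] = None

@dataclass
class SystemCharacteristics:
  device_type: str
  gpu_config: Optional[GpuConfig] = None

# Each entry owns its networking decorator, so the workload-create path no
# longer needs to enumerate device types explicitly.
SYSTEMS = {
    'a3-high': SystemCharacteristics('a3-high', GpuConfig(tcpx_decorate)),
    'a4': SystemCharacteristics('a4', GpuConfig(rdma_decorate)),
}

def apply_network_decorator(
    system: SystemCharacteristics, yml: str, sub_networks: List[str]
) -> str:
  # Mirrors the guard in the diff: dispatch only when a callable is present.
  if system.gpu_config and callable(system.gpu_config.jobset_decorator_fn):
    return system.gpu_config.jobset_decorator_fn(yml, sub_networks)
  return yml

if __name__ == '__main__':
  jobset = 'apiVersion: jobset.x-k8s.io/v1alpha2\nkind: JobSet'
  print(apply_network_decorator(SYSTEMS['a4'], jobset, ['subnet-0']))

Keeping the decorator on the per-device entry means adding a new accelerator touches only the characteristics table, not the workload-creation logic.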