xpk-0.16.1-py3-none-any.whl → xpk-0.17.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. xpk/commands/cluster.py +48 -5
  2. xpk/commands/cluster_gcluster.py +3 -0
  3. xpk/commands/cluster_gcluster_test.py +2 -0
  4. xpk/commands/cluster_test.py +203 -0
  5. xpk/commands/common.py +6 -0
  6. xpk/commands/kind.py +2 -0
  7. xpk/commands/workload.py +35 -16
  8. xpk/commands/workload_test.py +1 -0
  9. xpk/core/capacity.py +83 -46
  10. xpk/core/capacity_test.py +82 -28
  11. xpk/core/commands.py +39 -12
  12. xpk/core/kueue_manager.py +42 -11
  13. xpk/core/kueue_manager_test.py +83 -3
  14. xpk/core/nap.py +5 -4
  15. xpk/core/nodepool.py +57 -20
  16. xpk/core/nodepool_test.py +152 -23
  17. xpk/core/pathways.py +2 -1
  18. xpk/core/resources.py +3 -3
  19. xpk/core/scheduling.py +54 -10
  20. xpk/core/scheduling_test.py +118 -13
  21. xpk/core/system_characteristics.py +41 -24
  22. xpk/core/system_characteristics_test.py +37 -4
  23. xpk/core/telemetry.py +5 -0
  24. xpk/core/telemetry_test.py +19 -2
  25. xpk/core/updates.py +1 -1
  26. xpk/main.py +2 -1
  27. xpk/parser/cluster.py +34 -2
  28. xpk/parser/cluster_test.py +117 -0
  29. xpk/parser/common.py +32 -0
  30. xpk/parser/common_test.py +49 -0
  31. xpk/templates/kueue_config.yaml.j2 +21 -5
  32. xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
  33. xpk/utils/kueue.py +6 -2
  34. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/METADATA +2 -1
  35. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/RECORD +39 -37
  36. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/WHEEL +0 -0
  37. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/entry_points.txt +0 -0
  38. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/licenses/LICENSE +0 -0
  39. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py CHANGED
@@ -19,7 +19,7 @@ from tabulate import tabulate
19
19
  from ..utils.feature_flags import FeatureFlags
20
20
  from ..utils.versions import ReleaseChannel
21
21
  from ..core.pathways import get_pathways_machine_types
22
- from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type
22
+ from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type, parse_reservation
23
23
  from ..core.cluster import (
24
24
  get_all_clusters_programmatic,
25
25
  get_cluster_credentials,
@@ -79,7 +79,7 @@ from ..utils.file import write_tmp_file
79
79
  from ..utils.execution_context import is_dry_run, is_quiet
80
80
  from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
81
81
  from . import cluster_gcluster
82
- from .common import set_cluster_command, validate_sub_slicing_system
82
+ from .common import set_cluster_command, validate_sub_slicing_system, validate_super_slicing_system
83
83
  from jinja2 import Environment, FileSystemLoader
84
84
  from ..utils.templates import get_templates_absolute_path
85
85
  import shutil
@@ -211,6 +211,11 @@ def _validate_cluster_create_args(args, system: SystemCharacteristics):
211
211
  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
212
212
  validate_sub_slicing_system(system)
213
213
  _validate_sub_slicing_reservation(args)
214
+ if FeatureFlags.SUPER_SLICING_ENABLED:
215
+ _validate_num_slices_and_set_default(args)
216
+ if args.super_slicing:
217
+ validate_super_slicing_system(system)
218
+ _validate_super_slicing_reservation(args)
214
219
  if args.enable_pathways:
215
220
  _validate_pathways_machine(args)
216
221
 
@@ -233,15 +238,30 @@ def _validate_pathways_machine(args):
233
238
 
234
239
 
235
240
  def _validate_sub_slicing_reservation(args):
241
+ _validate_gsc_reservation(args, 'Sub-slicing')
242
+
243
+
244
+ def _validate_super_slicing_reservation(args):
245
+ _validate_gsc_reservation(args, 'Super-slicing')
246
+ reservation = parse_reservation(args.reservation, args.project)
247
+ if reservation.block_name is None:
248
+ xpk_print(
249
+ 'Error: Validation failed: Super-slicing cluster creation'
250
+ ' requires a block or sub-block reservation.'
251
+ )
252
+ xpk_exit(1)
253
+
254
+
255
+ def _validate_gsc_reservation(args, creation_description: str):
236
256
  if args.reservation is None:
237
257
  xpk_print(
238
- 'Error: Validation failed: Sub-slicing cluster creation requires'
239
- ' Cluster Director reservation to be specified.'
258
+ f'Error: Validation failed: {creation_description} cluster creation'
259
+ ' requires Cluster Director reservation to be specified.'
240
260
  )
241
261
  xpk_exit(1)
242
262
 
243
263
  deployment_type = get_reservation_deployment_type(
244
- reservation=args.reservation, project=args.project, zone=args.zone
264
+ reservation_path=args.reservation, project=args.project, zone=args.zone
245
265
  )
246
266
  if deployment_type != 'DENSE':
247
267
  xpk_print(
@@ -263,6 +283,22 @@ def _validate_sub_slicing_reservation(args):
263
283
  xpk_exit(1)
264
284
 
265
285
 
286
+ def _validate_num_slices_and_set_default(args):
287
+ if args.num_cubes is not None and not args.super_slicing:
288
+ xpk_print('--num-cubes can only be used with --super-slicing')
289
+ xpk_exit(1)
290
+
291
+ if (
292
+ args.num_cubes is not None
293
+ and args.num_slices is not None
294
+ and args.num_cubes != args.num_slices
295
+ ):
296
+ xpk_print('--num-cubes must not be different from --num-slices')
297
+ xpk_exit(1)
298
+
299
+ args.num_slices = args.num_slices or args.num_cubes or 1
300
+
301
+
266
302
  def cluster_create(args) -> None:
267
303
  """Function around cluster creation.
268
304
 
@@ -374,6 +410,7 @@ def cluster_create(args) -> None:
374
410
  )
375
411
  if return_code != 0:
376
412
  xpk_exit(return_code)
413
+ assert gke_node_pool_version
377
414
 
378
415
  run_gke_node_pool_create_command_code = run_gke_node_pool_create_command(
379
416
  args, system, gke_node_pool_version
@@ -1256,6 +1293,9 @@ def run_gke_cluster_create_command(
1256
1293
  addons_str = ','.join(addons)
1257
1294
  command += f' --addons={addons_str}'
1258
1295
 
1296
+ if FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing:
1297
+ command += ' --enable-slice-controller'
1298
+
1259
1299
  if args.custom_cluster_arguments:
1260
1300
  command += f' {args.custom_cluster_arguments}'
1261
1301
 
@@ -1347,6 +1387,9 @@ def _install_kueue(
1347
1387
  configure_sub_slicing=(
1348
1388
  FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
1349
1389
  ),
1390
+ configure_super_slicing=(
1391
+ FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing
1392
+ ),
1350
1393
  )
1351
1394
  )
1352
1395
 
xpk/commands/cluster_gcluster.py CHANGED
@@ -173,6 +173,9 @@ def __install_kueue(args) -> int:
173
173
  configure_sub_slicing=(
174
174
  FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
175
175
  ),
176
+ configure_super_slicing=(
177
+ FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing
178
+ ),
176
179
  ),
177
180
  tolerations=tolerations,
178
181
  )
xpk/commands/cluster_gcluster_test.py CHANGED
@@ -97,6 +97,7 @@ def test_install_kueue_standard(
97
97
  accelerator_type=AcceleratorType.GPU,
98
98
  device_type="h100-mega-80gb-8",
99
99
  supports_sub_slicing=False,
100
+ supports_super_slicing=False,
100
101
  docker_platform=DockerPlatform.ARM,
101
102
  gpu_config=GpuConfig(requires_topology=True),
102
103
  )
@@ -150,6 +151,7 @@ def test_install_kueue_with_autoprovisioning(
150
151
  accelerator_type=AcceleratorType.GPU,
151
152
  device_type="h100-mega-80gb-8",
152
153
  supports_sub_slicing=False,
154
+ supports_super_slicing=False,
153
155
  docker_platform=DockerPlatform.ARM,
154
156
  gpu_config=GpuConfig(requires_topology=True),
155
157
  )
xpk/commands/cluster_test.py CHANGED
@@ -110,6 +110,7 @@ def construct_args(**kwargs: Any) -> Namespace:
110
110
  cluster='test-cluster',
111
111
  default_pool_cpu_num_nodes='100',
112
112
  sub_slicing=False,
113
+ super_slicing=False,
113
114
  gke_version='',
114
115
  private=False,
115
116
  authorized_networks=None,
@@ -226,6 +227,9 @@ GPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
226
227
  SUB_SLICING_SYSTEM: SystemCharacteristics = (
227
228
  UserFacingNameToSystemCharacteristics['v6e-4x4']
228
229
  )
230
+ SUPER_SLICING_SYSTEM: SystemCharacteristics = (
231
+ UserFacingNameToSystemCharacteristics['tpu7x-4x4x4']
232
+ )
229
233
  TPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
230
234
  'v6e-4x4'
231
235
  ]
@@ -608,3 +612,202 @@ def test_cluster_create_calls_run_command_with_correct_channel_and_version(
608
612
  ]
609
613
 
610
614
  mocks.commands_tester.assert_command_run(*expected_command_parts)
615
+
616
+
617
+ def test_run_gke_cluster_create_command_with_super_slicing_enables_slice_controller(
618
+ mocks: _Mocks,
619
+ ):
620
+ FeatureFlags.SUPER_SLICING_ENABLED = True
621
+ result = run_gke_cluster_create_command(
622
+ args=construct_args(gke_version='1.2.3', super_slicing=True),
623
+ gke_control_plane_version='1.2.3',
624
+ system=SUPER_SLICING_SYSTEM,
625
+ release_channel=ReleaseChannel.REGULAR,
626
+ )
627
+
628
+ assert result == 0
629
+ mocks.commands_tester.assert_command_run(
630
+ 'clusters create', '--enable-slice-controller'
631
+ )
632
+
633
+
634
+ def test_validate_cluster_create_args_for_correct_super_slicing_args_pass(
635
+ mocks: _Mocks,
636
+ ):
637
+ FeatureFlags.SUPER_SLICING_ENABLED = True
638
+ args = construct_args(
639
+ super_slicing=True,
640
+ reservation='test-reservation/reservationBlocks/block',
641
+ num_cubes=None,
642
+ num_slices=None,
643
+ )
644
+
645
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
646
+ args = construct_args(
647
+ super_slicing=True,
648
+ reservation='test-reservation/reservationBlocks/block/reservationSubBlocks/subblock',
649
+ num_cubes=None,
650
+ num_slices=None,
651
+ )
652
+ _validate_cluster_create_args(
653
+ args, UserFacingNameToSystemCharacteristics['tpu7x-128']
654
+ )
655
+
656
+ assert mocks.common_print_mock.call_count == 0
657
+
658
+
659
+ def test_validate_cluster_create_args_for_super_slicing_system_not_supported_throws(
660
+ mocks: _Mocks,
661
+ ):
662
+ FeatureFlags.SUPER_SLICING_ENABLED = True
663
+ args = construct_args(
664
+ super_slicing=True,
665
+ reservation='test-reservation/reservationBlocks/block',
666
+ num_cubes=None,
667
+ num_slices=None,
668
+ )
669
+
670
+ with pytest.raises(SystemExit):
671
+ _validate_cluster_create_args(
672
+ args, UserFacingNameToSystemCharacteristics['tpu7x-4x4x8']
673
+ )
674
+
675
+ assert mocks.common_print_mock.call_count == 1
676
+ assert (
677
+ mocks.common_print_mock.call_args[0][0]
678
+ == 'Error: tpu7x-256 does not support Super-slicing.'
679
+ )
680
+
681
+
682
+ def test_validate_cluster_create_args_for_super_slicing_missing_reservation(
683
+ mocks: _Mocks,
684
+ ):
685
+ FeatureFlags.SUPER_SLICING_ENABLED = True
686
+ args = construct_args(
687
+ super_slicing=True,
688
+ reservation=None,
689
+ num_cubes=None,
690
+ num_slices=None,
691
+ )
692
+
693
+ with pytest.raises(SystemExit):
694
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
695
+
696
+ assert mocks.commands_print_mock.call_count == 1
697
+ assert (
698
+ 'Validation failed: Super-slicing cluster creation requires'
699
+ in mocks.commands_print_mock.call_args[0][0]
700
+ )
701
+
702
+
703
+ def test_validate_cluster_create_args_for_super_slicing_reservation_no_blocks(
704
+ mocks: _Mocks,
705
+ ):
706
+ FeatureFlags.SUPER_SLICING_ENABLED = True
707
+ args = construct_args(
708
+ super_slicing=True,
709
+ reservation='reservation',
710
+ num_cubes=None,
711
+ num_slices=None,
712
+ )
713
+
714
+ with pytest.raises(SystemExit):
715
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
716
+
717
+ assert mocks.commands_print_mock.call_count == 1
718
+ assert (
719
+ 'requires a block or sub-block reservation'
720
+ in mocks.commands_print_mock.call_args[0][0]
721
+ )
722
+
723
+
724
+ def test_validate_cluster_create_args_for_super_slicing_sparse_deployment_type_reservation(
725
+ mocks: _Mocks,
726
+ ):
727
+ FeatureFlags.SUPER_SLICING_ENABLED = True
728
+ args = construct_args(
729
+ super_slicing=True,
730
+ reservation='test-reservation/reservationBlocks/block',
731
+ num_cubes=None,
732
+ num_slices=None,
733
+ )
734
+ mocks.commands_get_reservation_deployment_type.return_value = 'SPARSE'
735
+
736
+ with pytest.raises(SystemExit):
737
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
738
+
739
+ assert mocks.commands_print_mock.call_count == 5
740
+ assert (
741
+ 'Refer to the documentation for more information on creating Cluster'
742
+ in mocks.commands_print_mock.call_args[0][0]
743
+ )
744
+
745
+
746
+ def test_validate_cluster_create_args_forbids_num_cubes_without_superslicing(
747
+ mocks: _Mocks,
748
+ ):
749
+ FeatureFlags.SUPER_SLICING_ENABLED = True # enable the feature
750
+ args = construct_args(
751
+ super_slicing=False, # but disable the flag
752
+ reservation='test-reservation/reservationBlocks/block',
753
+ num_cubes=1,
754
+ num_slices=None,
755
+ )
756
+
757
+ with pytest.raises(SystemExit):
758
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
759
+
760
+ assert mocks.commands_print_mock.call_count == 1
761
+ assert (
762
+ '--num-cubes can only be used with --super-slicing'
763
+ in mocks.commands_print_mock.call_args[0][0]
764
+ )
765
+
766
+
767
+ def test_validate_cluster_create_args_forbids_num_cubes_different_from_num_slices(
768
+ mocks: _Mocks,
769
+ ):
770
+ FeatureFlags.SUPER_SLICING_ENABLED = True
771
+ args = construct_args(
772
+ super_slicing=True,
773
+ reservation='test-reservation/reservationBlocks/block',
774
+ num_cubes=1,
775
+ num_slices=2,
776
+ )
777
+
778
+ with pytest.raises(SystemExit):
779
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
780
+
781
+ assert mocks.commands_print_mock.call_count == 1
782
+ assert (
783
+ '--num-cubes must not be different from --num-slices'
784
+ in mocks.commands_print_mock.call_args[0][0]
785
+ )
786
+
787
+
788
+ @pytest.mark.parametrize(
789
+ 'num_cubes, num_slices, expected',
790
+ [
791
+ (None, None, 1),
792
+ (3, None, 3),
793
+ (None, 3, 3),
794
+ (3, 3, 3),
795
+ ],
796
+ )
797
+ def test_validate_cluster_create_args_sets_correct_num_slices(
798
+ mocks: _Mocks,
799
+ num_cubes: int | None,
800
+ num_slices: int | None,
801
+ expected: int,
802
+ ):
803
+ FeatureFlags.SUPER_SLICING_ENABLED = True
804
+ args = construct_args(
805
+ super_slicing=True,
806
+ reservation='test-reservation/reservationBlocks/block',
807
+ num_cubes=num_cubes,
808
+ num_slices=num_slices,
809
+ )
810
+
811
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
812
+
813
+ assert args.num_slices == expected
xpk/commands/common.py CHANGED
@@ -73,3 +73,9 @@ def validate_sub_slicing_system(system: SystemCharacteristics):
73
73
  if not system.supports_sub_slicing:
74
74
  xpk_print(f'Error: {system.device_type} does not support Sub-slicing.')
75
75
  xpk_exit(1)
76
+
77
+
78
+ def validate_super_slicing_system(system: SystemCharacteristics):
79
+ if not system.supports_super_slicing:
80
+ xpk_print(f'Error: {system.device_type} does not support Super-slicing.')
81
+ xpk_exit(1)
xpk/commands/kind.py CHANGED
@@ -98,6 +98,7 @@ def cluster_create(args) -> None:
98
98
  AcceleratorType.CPU,
99
99
  'kind',
100
100
  supports_sub_slicing=False,
101
+ supports_super_slicing=False,
101
102
  docker_platform=DockerPlatform.ARM,
102
103
  )
103
104
 
@@ -113,6 +114,7 @@ def cluster_create(args) -> None:
113
114
  is_pathways_cluster=False,
114
115
  flex=False,
115
116
  configure_sub_slicing=False,
117
+ configure_super_slicing=False,
116
118
  ),
117
119
  )
118
120
 
xpk/commands/workload.py CHANGED
@@ -132,6 +132,7 @@ spec:
132
132
  annotations:
133
133
  {storage_annotations}
134
134
  {sub_slicing_annotations}
135
+ {annotations_machine_label}
135
136
  spec:
136
137
  schedulerName: {args.scheduler}
137
138
  imagePullSecrets:
@@ -140,7 +141,7 @@ spec:
140
141
  {affinity}
141
142
  nodeSelector:
142
143
  {accelerator_label}
143
- {machine_label}
144
+ {node_selector_machine_label}
144
145
  {placement_policy_label}
145
146
  {autoprovisioning_args}
146
147
  priorityClassName: {args.priority}
@@ -492,25 +493,29 @@ def workload_create(args) -> None:
492
493
  podFailurePolicy:
493
494
  rules:
494
495
  - action: FailJob
495
- onPodConditions: []
496
496
  onExitCodes:
497
497
  containerName: {get_main_container_docker_image(args, workload_system)}
498
498
  operator: NotIn
499
499
  values: [{restart_on_exit_codes}]"""
500
500
 
501
- if is_placement_policy_supported(workload_system):
501
+ placement_policy_label = ''
502
+ if (
503
+ # Don't bother with placement for sub/super-slicing workloads:
504
+ workload_scheduling == WorkloadScheduling.AVAILABLE
505
+ and is_placement_policy_supported(workload_system)
506
+ ):
502
507
  ensure_resource_policy_exists(
503
- resource_policy_name=get_placement_policy_name(workload_system),
508
+ resource_policy_name=get_placement_policy_name(
509
+ workload_system, super_slicing=False
510
+ ),
504
511
  project=args.project,
505
512
  zone=args.zone,
506
513
  topology=workload_system.topology,
514
+ super_slicing=False,
515
+ )
516
+ placement_policy_label = create_placement_policy_label(
517
+ workload_system, super_slicing=False
507
518
  )
508
-
509
- placement_policy_label = (
510
- create_placement_policy_label(workload_system)
511
- if is_placement_policy_supported(workload_system)
512
- else ''
513
- )
514
519
 
515
520
  # Create the workload file based on accelerator type or workload type.
516
521
  if workload_system.accelerator_type == AcceleratorType.GPU:
@@ -616,12 +621,26 @@ def workload_create(args) -> None:
616
621
  use_sub_slicing = (
617
622
  workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
618
623
  )
624
+ use_super_slicing = (
625
+ workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
626
+ )
619
627
  if use_sub_slicing:
620
628
  xpk_print('Workload will be scheduled using the Sub-slicing feature.')
629
+ if use_super_slicing:
630
+ xpk_print('Workload will be scheduled using the Super-slicing feature.')
621
631
 
622
632
  container, debugging_dashboard_id = get_user_workload_container(
623
633
  args, workload_system
624
634
  )
635
+
636
+ machine_label = (
637
+ create_machine_label(cluster_system)
638
+ if use_sub_slicing and cluster_system
639
+ else create_machine_label(workload_system)
640
+ )
641
+ node_selector_machine_label = machine_label if not use_super_slicing else ''
642
+ annotations_machine_label = machine_label if use_super_slicing else ''
643
+
625
644
  yml_string = WORKLOAD_CREATE_YAML.format(
626
645
  args=args,
627
646
  container=container,
@@ -636,11 +655,8 @@ def workload_create(args) -> None:
636
655
  else ''
637
656
  ),
638
657
  placement_policy_label=placement_policy_label,
639
- machine_label=(
640
- create_machine_label(cluster_system)
641
- if use_sub_slicing and cluster_system
642
- else create_machine_label(workload_system)
643
- ),
658
+ node_selector_machine_label=node_selector_machine_label,
659
+ annotations_machine_label=annotations_machine_label,
644
660
  local_queue_name=LOCAL_QUEUE_NAME,
645
661
  autoprovisioning_args=autoprovisioning_args,
646
662
  volumes=get_volumes(args, workload_system),
@@ -822,12 +838,15 @@ def workload_delete(args) -> None:
822
838
  if len(workloads) == 1:
823
839
  return_code = run_command_with_updates(commands[0], 'Delete Workload')
824
840
  else:
825
- return_code = run_commands(
841
+ maybe_failure = run_commands(
826
842
  commands,
827
843
  'Delete Workload',
828
844
  task_names,
829
845
  batch=100,
830
846
  )
847
+ return_code = (
848
+ maybe_failure.return_code if maybe_failure is not None else 0
849
+ )
831
850
 
832
851
  if return_code != 0:
833
852
  xpk_print(f'Delete Workload request returned ERROR {return_code}')
xpk/commands/workload_test.py CHANGED
@@ -34,6 +34,7 @@ SYSTEM_CHARACTERISTICS = SystemCharacteristics(
34
34
  accelerator_type=AcceleratorType.TPU,
35
35
  device_type='l4-1',
36
36
  supports_sub_slicing=True,
37
+ supports_super_slicing=False,
37
38
  requires_workload_policy=False,
38
39
  docker_platform=DockerPlatform.AMD,
39
40
  )