xpk 0.16.1__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. xpk/commands/cluster.py +48 -5
  2. xpk/commands/cluster_gcluster.py +3 -0
  3. xpk/commands/cluster_gcluster_test.py +2 -0
  4. xpk/commands/cluster_test.py +203 -0
  5. xpk/commands/common.py +6 -0
  6. xpk/commands/kind.py +2 -0
  7. xpk/commands/workload.py +35 -15
  8. xpk/commands/workload_test.py +1 -0
  9. xpk/core/capacity.py +83 -46
  10. xpk/core/capacity_test.py +82 -28
  11. xpk/core/commands.py +39 -12
  12. xpk/core/kueue_manager.py +42 -11
  13. xpk/core/kueue_manager_test.py +83 -3
  14. xpk/core/nap.py +5 -4
  15. xpk/core/nodepool.py +57 -20
  16. xpk/core/nodepool_test.py +152 -23
  17. xpk/core/pathways.py +2 -1
  18. xpk/core/resources.py +3 -3
  19. xpk/core/scheduling.py +54 -10
  20. xpk/core/scheduling_test.py +118 -13
  21. xpk/core/system_characteristics.py +41 -24
  22. xpk/core/system_characteristics_test.py +37 -4
  23. xpk/core/telemetry.py +5 -0
  24. xpk/core/telemetry_test.py +19 -2
  25. xpk/core/updates.py +1 -1
  26. xpk/main.py +2 -1
  27. xpk/parser/cluster.py +34 -2
  28. xpk/parser/cluster_test.py +117 -0
  29. xpk/parser/common.py +32 -0
  30. xpk/parser/common_test.py +49 -0
  31. xpk/templates/kueue_config.yaml.j2 +21 -5
  32. xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
  33. xpk/utils/kueue.py +6 -2
  34. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/METADATA +2 -1
  35. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/RECORD +39 -37
  36. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/WHEEL +0 -0
  37. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/entry_points.txt +0 -0
  38. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/licenses/LICENSE +0 -0
  39. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py CHANGED
@@ -19,7 +19,7 @@ from tabulate import tabulate
19
19
  from ..utils.feature_flags import FeatureFlags
20
20
  from ..utils.versions import ReleaseChannel
21
21
  from ..core.pathways import get_pathways_machine_types
22
- from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type
22
+ from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type, parse_reservation
23
23
  from ..core.cluster import (
24
24
  get_all_clusters_programmatic,
25
25
  get_cluster_credentials,
@@ -79,7 +79,7 @@ from ..utils.file import write_tmp_file
79
79
  from ..utils.execution_context import is_dry_run, is_quiet
80
80
  from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
81
81
  from . import cluster_gcluster
82
- from .common import set_cluster_command, validate_sub_slicing_system
82
+ from .common import set_cluster_command, validate_sub_slicing_system, validate_super_slicing_system
83
83
  from jinja2 import Environment, FileSystemLoader
84
84
  from ..utils.templates import get_templates_absolute_path
85
85
  import shutil
@@ -211,6 +211,11 @@ def _validate_cluster_create_args(args, system: SystemCharacteristics):
211
211
  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
212
212
  validate_sub_slicing_system(system)
213
213
  _validate_sub_slicing_reservation(args)
214
+ if FeatureFlags.SUPER_SLICING_ENABLED:
215
+ _validate_num_slices_and_set_default(args)
216
+ if args.super_slicing:
217
+ validate_super_slicing_system(system)
218
+ _validate_super_slicing_reservation(args)
214
219
  if args.enable_pathways:
215
220
  _validate_pathways_machine(args)
216
221
 
@@ -233,15 +238,30 @@ def _validate_pathways_machine(args):
233
238
 
234
239
 
235
240
  def _validate_sub_slicing_reservation(args):
241
+ _validate_gsc_reservation(args, 'Sub-slicing')
242
+
243
+
244
+ def _validate_super_slicing_reservation(args):
245
+ _validate_gsc_reservation(args, 'Super-slicing')
246
+ reservation = parse_reservation(args.reservation, args.project)
247
+ if reservation.block_name is None:
248
+ xpk_print(
249
+ 'Error: Validation failed: Super-slicing cluster creation'
250
+ ' requires a block or sub-block reservation.'
251
+ )
252
+ xpk_exit(1)
253
+
254
+
255
+ def _validate_gsc_reservation(args, creation_description: str):
236
256
  if args.reservation is None:
237
257
  xpk_print(
238
- 'Error: Validation failed: Sub-slicing cluster creation requires'
239
- ' Cluster Director reservation to be specified.'
258
+ f'Error: Validation failed: {creation_description} cluster creation'
259
+ ' requires Cluster Director reservation to be specified.'
240
260
  )
241
261
  xpk_exit(1)
242
262
 
243
263
  deployment_type = get_reservation_deployment_type(
244
- reservation=args.reservation, project=args.project, zone=args.zone
264
+ reservation_path=args.reservation, project=args.project, zone=args.zone
245
265
  )
246
266
  if deployment_type != 'DENSE':
247
267
  xpk_print(
@@ -263,6 +283,22 @@ def _validate_sub_slicing_reservation(args):
263
283
  xpk_exit(1)
264
284
 
265
285
 
286
+ def _validate_num_slices_and_set_default(args):
287
+ if args.num_cubes is not None and not args.super_slicing:
288
+ xpk_print('--num-cubes can only be used with --super-slicing')
289
+ xpk_exit(1)
290
+
291
+ if (
292
+ args.num_cubes is not None
293
+ and args.num_slices is not None
294
+ and args.num_cubes != args.num_slices
295
+ ):
296
+ xpk_print('--num-cubes must not be different from --num-slices')
297
+ xpk_exit(1)
298
+
299
+ args.num_slices = args.num_slices or args.num_cubes or 1
300
+
301
+
266
302
  def cluster_create(args) -> None:
267
303
  """Function around cluster creation.
268
304
 
@@ -374,6 +410,7 @@ def cluster_create(args) -> None:
374
410
  )
375
411
  if return_code != 0:
376
412
  xpk_exit(return_code)
413
+ assert gke_node_pool_version
377
414
 
378
415
  run_gke_node_pool_create_command_code = run_gke_node_pool_create_command(
379
416
  args, system, gke_node_pool_version
@@ -1256,6 +1293,9 @@ def run_gke_cluster_create_command(
1256
1293
  addons_str = ','.join(addons)
1257
1294
  command += f' --addons={addons_str}'
1258
1295
 
1296
+ if FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing:
1297
+ command += ' --enable-slice-controller'
1298
+
1259
1299
  if args.custom_cluster_arguments:
1260
1300
  command += f' {args.custom_cluster_arguments}'
1261
1301
 
@@ -1347,6 +1387,9 @@ def _install_kueue(
1347
1387
  configure_sub_slicing=(
1348
1388
  FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
1349
1389
  ),
1390
+ configure_super_slicing=(
1391
+ FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing
1392
+ ),
1350
1393
  )
1351
1394
  )
1352
1395
 
@@ -173,6 +173,9 @@ def __install_kueue(args) -> int:
173
173
  configure_sub_slicing=(
174
174
  FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
175
175
  ),
176
+ configure_super_slicing=(
177
+ FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing
178
+ ),
176
179
  ),
177
180
  tolerations=tolerations,
178
181
  )
@@ -97,6 +97,7 @@ def test_install_kueue_standard(
97
97
  accelerator_type=AcceleratorType.GPU,
98
98
  device_type="h100-mega-80gb-8",
99
99
  supports_sub_slicing=False,
100
+ supports_super_slicing=False,
100
101
  docker_platform=DockerPlatform.ARM,
101
102
  gpu_config=GpuConfig(requires_topology=True),
102
103
  )
@@ -150,6 +151,7 @@ def test_install_kueue_with_autoprovisioning(
150
151
  accelerator_type=AcceleratorType.GPU,
151
152
  device_type="h100-mega-80gb-8",
152
153
  supports_sub_slicing=False,
154
+ supports_super_slicing=False,
153
155
  docker_platform=DockerPlatform.ARM,
154
156
  gpu_config=GpuConfig(requires_topology=True),
155
157
  )
@@ -110,6 +110,7 @@ def construct_args(**kwargs: Any) -> Namespace:
110
110
  cluster='test-cluster',
111
111
  default_pool_cpu_num_nodes='100',
112
112
  sub_slicing=False,
113
+ super_slicing=False,
113
114
  gke_version='',
114
115
  private=False,
115
116
  authorized_networks=None,
@@ -226,6 +227,9 @@ GPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
226
227
  SUB_SLICING_SYSTEM: SystemCharacteristics = (
227
228
  UserFacingNameToSystemCharacteristics['v6e-4x4']
228
229
  )
230
+ SUPER_SLICING_SYSTEM: SystemCharacteristics = (
231
+ UserFacingNameToSystemCharacteristics['tpu7x-4x4x4']
232
+ )
229
233
  TPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
230
234
  'v6e-4x4'
231
235
  ]
@@ -608,3 +612,202 @@ def test_cluster_create_calls_run_command_with_correct_channel_and_version(
608
612
  ]
609
613
 
610
614
  mocks.commands_tester.assert_command_run(*expected_command_parts)
615
+
616
+
617
+ def test_run_gke_cluster_create_command_with_super_slicing_enables_slice_controller(
618
+ mocks: _Mocks,
619
+ ):
620
+ FeatureFlags.SUPER_SLICING_ENABLED = True
621
+ result = run_gke_cluster_create_command(
622
+ args=construct_args(gke_version='1.2.3', super_slicing=True),
623
+ gke_control_plane_version='1.2.3',
624
+ system=SUPER_SLICING_SYSTEM,
625
+ release_channel=ReleaseChannel.REGULAR,
626
+ )
627
+
628
+ assert result == 0
629
+ mocks.commands_tester.assert_command_run(
630
+ 'clusters create', '--enable-slice-controller'
631
+ )
632
+
633
+
634
+ def test_validate_cluster_create_args_for_correct_super_slicing_args_pass(
635
+ mocks: _Mocks,
636
+ ):
637
+ FeatureFlags.SUPER_SLICING_ENABLED = True
638
+ args = construct_args(
639
+ super_slicing=True,
640
+ reservation='test-reservation/reservationBlocks/block',
641
+ num_cubes=None,
642
+ num_slices=None,
643
+ )
644
+
645
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
646
+ args = construct_args(
647
+ super_slicing=True,
648
+ reservation='test-reservation/reservationBlocks/block/reservationSubBlocks/subblock',
649
+ num_cubes=None,
650
+ num_slices=None,
651
+ )
652
+ _validate_cluster_create_args(
653
+ args, UserFacingNameToSystemCharacteristics['tpu7x-128']
654
+ )
655
+
656
+ assert mocks.common_print_mock.call_count == 0
657
+
658
+
659
+ def test_validate_cluster_create_args_for_super_slicing_system_not_supported_throws(
660
+ mocks: _Mocks,
661
+ ):
662
+ FeatureFlags.SUPER_SLICING_ENABLED = True
663
+ args = construct_args(
664
+ super_slicing=True,
665
+ reservation='test-reservation/reservationBlocks/block',
666
+ num_cubes=None,
667
+ num_slices=None,
668
+ )
669
+
670
+ with pytest.raises(SystemExit):
671
+ _validate_cluster_create_args(
672
+ args, UserFacingNameToSystemCharacteristics['tpu7x-4x4x8']
673
+ )
674
+
675
+ assert mocks.common_print_mock.call_count == 1
676
+ assert (
677
+ mocks.common_print_mock.call_args[0][0]
678
+ == 'Error: tpu7x-256 does not support Super-slicing.'
679
+ )
680
+
681
+
682
+ def test_validate_cluster_create_args_for_super_slicing_missing_reservation(
683
+ mocks: _Mocks,
684
+ ):
685
+ FeatureFlags.SUPER_SLICING_ENABLED = True
686
+ args = construct_args(
687
+ super_slicing=True,
688
+ reservation=None,
689
+ num_cubes=None,
690
+ num_slices=None,
691
+ )
692
+
693
+ with pytest.raises(SystemExit):
694
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
695
+
696
+ assert mocks.commands_print_mock.call_count == 1
697
+ assert (
698
+ 'Validation failed: Super-slicing cluster creation requires'
699
+ in mocks.commands_print_mock.call_args[0][0]
700
+ )
701
+
702
+
703
+ def test_validate_cluster_create_args_for_super_slicing_reservation_no_blocks(
704
+ mocks: _Mocks,
705
+ ):
706
+ FeatureFlags.SUPER_SLICING_ENABLED = True
707
+ args = construct_args(
708
+ super_slicing=True,
709
+ reservation='reservation',
710
+ num_cubes=None,
711
+ num_slices=None,
712
+ )
713
+
714
+ with pytest.raises(SystemExit):
715
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
716
+
717
+ assert mocks.commands_print_mock.call_count == 1
718
+ assert (
719
+ 'requires a block or sub-block reservation'
720
+ in mocks.commands_print_mock.call_args[0][0]
721
+ )
722
+
723
+
724
+ def test_validate_cluster_create_args_for_super_slicing_sparse_deployment_type_reservation(
725
+ mocks: _Mocks,
726
+ ):
727
+ FeatureFlags.SUPER_SLICING_ENABLED = True
728
+ args = construct_args(
729
+ super_slicing=True,
730
+ reservation='test-reservation/reservationBlocks/block',
731
+ num_cubes=None,
732
+ num_slices=None,
733
+ )
734
+ mocks.commands_get_reservation_deployment_type.return_value = 'SPARSE'
735
+
736
+ with pytest.raises(SystemExit):
737
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
738
+
739
+ assert mocks.commands_print_mock.call_count == 5
740
+ assert (
741
+ 'Refer to the documentation for more information on creating Cluster'
742
+ in mocks.commands_print_mock.call_args[0][0]
743
+ )
744
+
745
+
746
+ def test_validate_cluster_create_args_forbids_num_cubes_without_superslicing(
747
+ mocks: _Mocks,
748
+ ):
749
+ FeatureFlags.SUPER_SLICING_ENABLED = True # enable the feature
750
+ args = construct_args(
751
+ super_slicing=False, # but disable the flag
752
+ reservation='test-reservation/reservationBlocks/block',
753
+ num_cubes=1,
754
+ num_slices=None,
755
+ )
756
+
757
+ with pytest.raises(SystemExit):
758
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
759
+
760
+ assert mocks.commands_print_mock.call_count == 1
761
+ assert (
762
+ '--num-cubes can only be used with --super-slicing'
763
+ in mocks.commands_print_mock.call_args[0][0]
764
+ )
765
+
766
+
767
+ def test_validate_cluster_create_args_forbids_num_cubes_different_from_num_slices(
768
+ mocks: _Mocks,
769
+ ):
770
+ FeatureFlags.SUPER_SLICING_ENABLED = True
771
+ args = construct_args(
772
+ super_slicing=True,
773
+ reservation='test-reservation/reservationBlocks/block',
774
+ num_cubes=1,
775
+ num_slices=2,
776
+ )
777
+
778
+ with pytest.raises(SystemExit):
779
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
780
+
781
+ assert mocks.commands_print_mock.call_count == 1
782
+ assert (
783
+ '--num-cubes must not be different from --num-slices'
784
+ in mocks.commands_print_mock.call_args[0][0]
785
+ )
786
+
787
+
788
+ @pytest.mark.parametrize(
789
+ 'num_cubes, num_slices, expected',
790
+ [
791
+ (None, None, 1),
792
+ (3, None, 3),
793
+ (None, 3, 3),
794
+ (3, 3, 3),
795
+ ],
796
+ )
797
+ def test_validate_cluster_create_args_sets_correct_num_slices(
798
+ mocks: _Mocks,
799
+ num_cubes: int | None,
800
+ num_slices: int | None,
801
+ expected: int,
802
+ ):
803
+ FeatureFlags.SUPER_SLICING_ENABLED = True
804
+ args = construct_args(
805
+ super_slicing=True,
806
+ reservation='test-reservation/reservationBlocks/block',
807
+ num_cubes=num_cubes,
808
+ num_slices=num_slices,
809
+ )
810
+
811
+ _validate_cluster_create_args(args, SUPER_SLICING_SYSTEM)
812
+
813
+ assert args.num_slices == expected
xpk/commands/common.py CHANGED
@@ -73,3 +73,9 @@ def validate_sub_slicing_system(system: SystemCharacteristics):
73
73
  if not system.supports_sub_slicing:
74
74
  xpk_print(f'Error: {system.device_type} does not support Sub-slicing.')
75
75
  xpk_exit(1)
76
+
77
+
78
+ def validate_super_slicing_system(system: SystemCharacteristics):
79
+ if not system.supports_super_slicing:
80
+ xpk_print(f'Error: {system.device_type} does not support Super-slicing.')
81
+ xpk_exit(1)
xpk/commands/kind.py CHANGED
@@ -98,6 +98,7 @@ def cluster_create(args) -> None:
98
98
  AcceleratorType.CPU,
99
99
  'kind',
100
100
  supports_sub_slicing=False,
101
+ supports_super_slicing=False,
101
102
  docker_platform=DockerPlatform.ARM,
102
103
  )
103
104
 
@@ -113,6 +114,7 @@ def cluster_create(args) -> None:
113
114
  is_pathways_cluster=False,
114
115
  flex=False,
115
116
  configure_sub_slicing=False,
117
+ configure_super_slicing=False,
116
118
  ),
117
119
  )
118
120
 
xpk/commands/workload.py CHANGED
@@ -132,6 +132,7 @@ spec:
132
132
  annotations:
133
133
  {storage_annotations}
134
134
  {sub_slicing_annotations}
135
+ {annotations_machine_label}
135
136
  spec:
136
137
  schedulerName: {args.scheduler}
137
138
  imagePullSecrets:
@@ -140,7 +141,7 @@ spec:
140
141
  {affinity}
141
142
  nodeSelector:
142
143
  {accelerator_label}
143
- {machine_label}
144
+ {node_selector_machine_label}
144
145
  {placement_policy_label}
145
146
  {autoprovisioning_args}
146
147
  priorityClassName: {args.priority}
@@ -498,19 +499,24 @@ def workload_create(args) -> None:
498
499
  operator: NotIn
499
500
  values: [{restart_on_exit_codes}]"""
500
501
 
501
- if is_placement_policy_supported(workload_system):
502
+ placement_policy_label = ''
503
+ if (
504
+ # Don't bother with placement for sub/super-slicing workloads:
505
+ workload_scheduling == WorkloadScheduling.AVAILABLE
506
+ and is_placement_policy_supported(workload_system)
507
+ ):
502
508
  ensure_resource_policy_exists(
503
- resource_policy_name=get_placement_policy_name(workload_system),
509
+ resource_policy_name=get_placement_policy_name(
510
+ workload_system, super_slicing=False
511
+ ),
504
512
  project=args.project,
505
513
  zone=args.zone,
506
514
  topology=workload_system.topology,
515
+ super_slicing=False,
516
+ )
517
+ placement_policy_label = create_placement_policy_label(
518
+ workload_system, super_slicing=False
507
519
  )
508
-
509
- placement_policy_label = (
510
- create_placement_policy_label(workload_system)
511
- if is_placement_policy_supported(workload_system)
512
- else ''
513
- )
514
520
 
515
521
  # Create the workload file based on accelerator type or workload type.
516
522
  if workload_system.accelerator_type == AcceleratorType.GPU:
@@ -616,12 +622,26 @@ def workload_create(args) -> None:
616
622
  use_sub_slicing = (
617
623
  workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
618
624
  )
625
+ use_super_slicing = (
626
+ workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
627
+ )
619
628
  if use_sub_slicing:
620
629
  xpk_print('Workload will be scheduled using the Sub-slicing feature.')
630
+ if use_super_slicing:
631
+ xpk_print('Workload will be scheduled using the Super-slicing feature.')
621
632
 
622
633
  container, debugging_dashboard_id = get_user_workload_container(
623
634
  args, workload_system
624
635
  )
636
+
637
+ machine_label = (
638
+ create_machine_label(cluster_system)
639
+ if use_sub_slicing and cluster_system
640
+ else create_machine_label(workload_system)
641
+ )
642
+ node_selector_machine_label = machine_label if not use_super_slicing else ''
643
+ annotations_machine_label = machine_label if use_super_slicing else ''
644
+
625
645
  yml_string = WORKLOAD_CREATE_YAML.format(
626
646
  args=args,
627
647
  container=container,
@@ -636,11 +656,8 @@ def workload_create(args) -> None:
636
656
  else ''
637
657
  ),
638
658
  placement_policy_label=placement_policy_label,
639
- machine_label=(
640
- create_machine_label(cluster_system)
641
- if use_sub_slicing and cluster_system
642
- else create_machine_label(workload_system)
643
- ),
659
+ node_selector_machine_label=node_selector_machine_label,
660
+ annotations_machine_label=annotations_machine_label,
644
661
  local_queue_name=LOCAL_QUEUE_NAME,
645
662
  autoprovisioning_args=autoprovisioning_args,
646
663
  volumes=get_volumes(args, workload_system),
@@ -822,12 +839,15 @@ def workload_delete(args) -> None:
822
839
  if len(workloads) == 1:
823
840
  return_code = run_command_with_updates(commands[0], 'Delete Workload')
824
841
  else:
825
- return_code = run_commands(
842
+ maybe_failure = run_commands(
826
843
  commands,
827
844
  'Delete Workload',
828
845
  task_names,
829
846
  batch=100,
830
847
  )
848
+ return_code = (
849
+ maybe_failure.return_code if maybe_failure is not None else 0
850
+ )
831
851
 
832
852
  if return_code != 0:
833
853
  xpk_print(f'Delete Workload request returned ERROR {return_code}')
@@ -34,6 +34,7 @@ SYSTEM_CHARACTERISTICS = SystemCharacteristics(
34
34
  accelerator_type=AcceleratorType.TPU,
35
35
  device_type='l4-1',
36
36
  supports_sub_slicing=True,
37
+ supports_super_slicing=False,
37
38
  requires_workload_policy=False,
38
39
  docker_platform=DockerPlatform.AMD,
39
40
  )