xpk 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. xpk/commands/cluster.py +48 -5
  2. xpk/commands/cluster_gcluster.py +3 -0
  3. xpk/commands/cluster_gcluster_test.py +2 -0
  4. xpk/commands/cluster_test.py +203 -0
  5. xpk/commands/common.py +6 -0
  6. xpk/commands/kind.py +2 -0
  7. xpk/commands/workload.py +35 -15
  8. xpk/commands/workload_test.py +1 -0
  9. xpk/core/capacity.py +83 -46
  10. xpk/core/capacity_test.py +82 -28
  11. xpk/core/commands.py +39 -12
  12. xpk/core/kueue_manager.py +42 -11
  13. xpk/core/kueue_manager_test.py +83 -3
  14. xpk/core/nap.py +5 -4
  15. xpk/core/nodepool.py +57 -20
  16. xpk/core/nodepool_test.py +152 -23
  17. xpk/core/pathways.py +2 -1
  18. xpk/core/resources.py +3 -3
  19. xpk/core/scheduling.py +54 -10
  20. xpk/core/scheduling_test.py +118 -13
  21. xpk/core/system_characteristics.py +41 -24
  22. xpk/core/system_characteristics_test.py +37 -4
  23. xpk/core/telemetry.py +5 -0
  24. xpk/core/telemetry_test.py +19 -2
  25. xpk/core/updates.py +1 -1
  26. xpk/main.py +2 -1
  27. xpk/parser/cluster.py +34 -2
  28. xpk/parser/cluster_test.py +117 -0
  29. xpk/parser/common.py +32 -0
  30. xpk/parser/common_test.py +49 -0
  31. xpk/templates/kueue_config.yaml.j2 +21 -5
  32. xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
  33. xpk/utils/kueue.py +6 -2
  34. {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/METADATA +2 -1
  35. {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/RECORD +39 -37
  36. {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/WHEEL +0 -0
  37. {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/entry_points.txt +0 -0
  38. {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/licenses/LICENSE +0 -0
  39. {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/top_level.txt +0 -0
xpk/core/scheduling.py CHANGED
@@ -16,7 +16,7 @@ limitations under the License.
16
16
 
17
17
  from enum import Enum
18
18
 
19
- from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled
19
+ from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled, has_super_slicing_enabled
20
20
  from ..utils.feature_flags import FeatureFlags
21
21
  from ..utils.topology import get_slice_topology_level
22
22
  from ..utils.console import xpk_print
@@ -33,12 +33,14 @@ from .system_characteristics import (
33
33
  from packaging.version import Version
34
34
 
35
35
  _SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
36
+ _SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.14.0')
36
37
 
37
38
 
38
39
  class WorkloadScheduling(Enum):
39
40
  UNAVAILABLE = 0
40
41
  AVAILABLE = 1
41
42
  SUB_SLICING_AVAILABLE = 2
43
+ SUPER_SLICING_AVAILABLE = 3
42
44
 
43
45
 
44
46
  def check_if_workload_can_schedule(
@@ -94,10 +96,9 @@ def check_if_workload_can_schedule(
94
96
  else:
95
97
  return WorkloadScheduling.UNAVAILABLE
96
98
 
97
- if _check_sub_slicing_availability(
99
+ if cluster_system and _check_sub_slicing_availability(
98
100
  workload_system=workload_system, cluster_system=cluster_system
99
101
  ):
100
- assert cluster_system
101
102
  if _check_workload_size_fits(
102
103
  args,
103
104
  workload_system,
@@ -107,6 +108,18 @@ def check_if_workload_can_schedule(
107
108
  else:
108
109
  return WorkloadScheduling.UNAVAILABLE
109
110
 
111
+ if cluster_system and _check_super_slicing_availability(
112
+ workload_system=workload_system, cluster_system=cluster_system
113
+ ):
114
+ if _check_workload_size_fits(
115
+ args,
116
+ workload_system,
117
+ max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
118
+ ):
119
+ return WorkloadScheduling.SUPER_SLICING_AVAILABLE
120
+ else:
121
+ return WorkloadScheduling.UNAVAILABLE
122
+
110
123
  xpk_print(
111
124
  'Workload scheduling validation failed. XPK will not create the workload'
112
125
  f' {args.workload}.'
@@ -147,11 +160,10 @@ def _check_workload_size_fits(
147
160
 
148
161
  def _check_sub_slicing_availability(
149
162
  workload_system: SystemCharacteristics,
150
- cluster_system: SystemCharacteristics | None,
163
+ cluster_system: SystemCharacteristics,
151
164
  ) -> bool:
152
165
  if (
153
166
  (not FeatureFlags.SUB_SLICING_ENABLED)
154
- or (not cluster_system)
155
167
  or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
156
168
  or (not cluster_system.supports_sub_slicing)
157
169
  or (workload_system.topology not in SUB_SLICING_TOPOLOGIES)
@@ -163,7 +175,7 @@ def _check_sub_slicing_availability(
163
175
  return False
164
176
 
165
177
  return_code, current_version = get_installed_kueue_version(
166
- dry_run_version=Version('0.13')
178
+ dry_run_version=_SUB_SLICING_MINIMUM_KUEUE_VERSION
167
179
  )
168
180
 
169
181
  return (
@@ -173,6 +185,33 @@ def _check_sub_slicing_availability(
173
185
  )
174
186
 
175
187
 
188
+ def _check_super_slicing_availability(
189
+ workload_system: SystemCharacteristics,
190
+ cluster_system: SystemCharacteristics,
191
+ ) -> bool:
192
+ # TODO: b/465447813 - Add super-slicing workload topology validation.
193
+ if (
194
+ (not FeatureFlags.SUPER_SLICING_ENABLED)
195
+ or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
196
+ or (not cluster_system.supports_super_slicing)
197
+ ):
198
+ return False
199
+
200
+ return_code, sub_slicing_enabled = has_super_slicing_enabled()
201
+ if return_code != 0 or not sub_slicing_enabled:
202
+ return False
203
+
204
+ return_code, current_version = get_installed_kueue_version(
205
+ dry_run_version=_SUPER_SLICING_MINIMUM_KUEUE_VERSION
206
+ )
207
+
208
+ return (
209
+ return_code == 0
210
+ and current_version is not None
211
+ and current_version >= _SUPER_SLICING_MINIMUM_KUEUE_VERSION
212
+ )
213
+
214
+
176
215
  def get_total_chips_requested_from_args(
177
216
  args, system: SystemCharacteristics
178
217
  ) -> int:
@@ -303,13 +342,18 @@ def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
303
342
  ]
304
343
 
305
344
 
306
- def create_placement_policy_label(system: SystemCharacteristics) -> str:
307
- name = get_placement_policy_name(system)
345
+ def create_placement_policy_label(
346
+ system: SystemCharacteristics, super_slicing: bool
347
+ ) -> str:
348
+ name = get_placement_policy_name(system, super_slicing)
308
349
  return f'cloud.google.com/placement-policy-name: {name}'
309
350
 
310
351
 
311
- def get_placement_policy_name(system: SystemCharacteristics) -> str:
312
- return f'{system.device_type}-{system.topology}-placement-policy'
352
+ def get_placement_policy_name(
353
+ system: SystemCharacteristics, super_slicing: bool
354
+ ) -> str:
355
+ super_slicing_part = '-ss' if super_slicing else ''
356
+ return f'{system.device_type}-{system.topology}{super_slicing_part}-placement-policy'
313
357
 
314
358
 
315
359
  def is_placement_policy_supported(system: SystemCharacteristics) -> bool:
@@ -65,9 +65,12 @@ def test_create_placement_policy_label_returns_valid_label():
65
65
  device_type='tpu7x',
66
66
  accelerator_type=AcceleratorType.TPU,
67
67
  supports_sub_slicing=False,
68
+ supports_super_slicing=False,
68
69
  docker_platform=DockerPlatform.ARM,
69
70
  )
70
- label = create_placement_policy_label(system_characteristics)
71
+ label = create_placement_policy_label(
72
+ system_characteristics, super_slicing=False
73
+ )
71
74
  assert (
72
75
  label
73
76
  == 'cloud.google.com/placement-policy-name: tpu7x-1x1x1-placement-policy'
@@ -85,12 +88,31 @@ def test_get_placement_policy_name_returns_valid_name():
85
88
  device_type='tpu7x',
86
89
  accelerator_type=AcceleratorType.TPU,
87
90
  supports_sub_slicing=False,
91
+ supports_super_slicing=False,
88
92
  docker_platform=DockerPlatform.ARM,
89
93
  )
90
- name = get_placement_policy_name(system_characteristics)
94
+ name = get_placement_policy_name(system_characteristics, super_slicing=False)
91
95
  assert name == 'tpu7x-1x1x1-placement-policy'
92
96
 
93
97
 
98
+ def test_get_placement_policy_name_super_slicing_returns_valid_name():
99
+ system_characteristics = SystemCharacteristics(
100
+ chips_per_vm=1,
101
+ gce_machine_type='tpu7x-standard-1t',
102
+ gke_accelerator='tpu7x',
103
+ requires_workload_policy=False,
104
+ topology='1x1x1',
105
+ vms_per_slice=1,
106
+ device_type='tpu7x',
107
+ accelerator_type=AcceleratorType.TPU,
108
+ supports_sub_slicing=False,
109
+ supports_super_slicing=False,
110
+ docker_platform=DockerPlatform.ARM,
111
+ )
112
+ name = get_placement_policy_name(system_characteristics, super_slicing=True)
113
+ assert name == 'tpu7x-1x1x1-ss-placement-policy'
114
+
115
+
94
116
  def test_is_placement_policy_supported_returns_true_for_system_characteristics_supporting_workload_policy_and_having_valid_topology():
95
117
  system_characteristics = SystemCharacteristics(
96
118
  chips_per_vm=1,
@@ -102,6 +124,7 @@ def test_is_placement_policy_supported_returns_true_for_system_characteristics_s
102
124
  device_type='tpu7x',
103
125
  accelerator_type=AcceleratorType.TPU,
104
126
  supports_sub_slicing=False,
127
+ supports_super_slicing=False,
105
128
  docker_platform=DockerPlatform.ARM,
106
129
  )
107
130
  assert is_placement_policy_supported(system_characteristics) is True
@@ -118,6 +141,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
118
141
  device_type='tpu7x',
119
142
  accelerator_type=AcceleratorType.TPU,
120
143
  supports_sub_slicing=False,
144
+ supports_super_slicing=False,
121
145
  docker_platform=DockerPlatform.ARM,
122
146
  )
123
147
  assert is_placement_policy_supported(system_characteristics) is False
@@ -134,6 +158,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
134
158
  device_type='tpu7x',
135
159
  accelerator_type=AcceleratorType.TPU,
136
160
  supports_sub_slicing=False,
161
+ supports_super_slicing=False,
137
162
  docker_platform=DockerPlatform.ARM,
138
163
  )
139
164
  assert is_placement_policy_supported(system_characteristics) is False
@@ -145,28 +170,42 @@ class SchedulingTestCase:
145
170
  num_slices: int = 1
146
171
  cluster_system: SystemCharacteristics | None = None
147
172
  resources_config_map: dict[str, str] | None = None
148
- sub_slicing_feature_enabled: bool = False
149
173
  kueue_version: str | None = None
174
+ sub_slicing_feature_enabled: bool = False
150
175
  sub_slicing_topology_set: bool = False
176
+ super_slicing_feature_enabled: bool = False
177
+ super_slicing_topology_set: bool = False
151
178
 
152
179
 
180
+ NAP_CASE = SchedulingTestCase(
181
+ workload_system=_get_system_characteristics_or_die('v6e-8'),
182
+ cluster_system=None,
183
+ resources_config_map={
184
+ 'tpu-v6e-slice': AUTOPROVISIONING_CONFIG_VALUE,
185
+ AUTOPROVISIONING_CONFIG_MAXIMUM_KEY: '10',
186
+ },
187
+ )
188
+
153
189
  SUB_SLICING_CASE = SchedulingTestCase(
154
190
  workload_system=_get_system_characteristics_or_die('v6e-8'),
155
191
  cluster_system=_get_system_characteristics_or_die('v6e-16'),
156
- resources_config_map={'v6e-16': '8'},
157
- sub_slicing_feature_enabled=True,
192
+ # 2 slices:
193
+ resources_config_map={'v6e-16': str(8 // 4 * 2)},
158
194
  kueue_version='0.13.0',
195
+ sub_slicing_feature_enabled=True,
159
196
  sub_slicing_topology_set=True,
160
197
  num_slices=1,
161
198
  )
162
199
 
163
- NAP_CASE = SchedulingTestCase(
164
- workload_system=_get_system_characteristics_or_die('v6e-8'),
165
- cluster_system=None,
166
- resources_config_map={
167
- 'tpu-v6e-slice': AUTOPROVISIONING_CONFIG_VALUE,
168
- AUTOPROVISIONING_CONFIG_MAXIMUM_KEY: '10',
169
- },
200
+ SUPER_SLICING_CASE = SchedulingTestCase(
201
+ workload_system=_get_system_characteristics_or_die('tpu7x-4x4x16'),
202
+ cluster_system=_get_system_characteristics_or_die('tpu7x-4x4x4'),
203
+ # 5 4x4x4 cubes:
204
+ resources_config_map={'tpu7x-128': str(64 // 4 * 5)},
205
+ kueue_version='0.14.0',
206
+ super_slicing_feature_enabled=True,
207
+ super_slicing_topology_set=True,
208
+ num_slices=1,
170
209
  )
171
210
 
172
211
 
@@ -283,6 +322,66 @@ NAP_CASE = SchedulingTestCase(
283
322
  ),
284
323
  WorkloadScheduling.AVAILABLE,
285
324
  ),
325
+ (
326
+ 'Correct Super-slicing',
327
+ SUPER_SLICING_CASE,
328
+ WorkloadScheduling.SUPER_SLICING_AVAILABLE,
329
+ ),
330
+ (
331
+ 'Super-slicing, but disabled flag',
332
+ dataclasses.replace(
333
+ SUPER_SLICING_CASE, super_slicing_feature_enabled=False
334
+ ),
335
+ WorkloadScheduling.UNAVAILABLE,
336
+ ),
337
+ (
338
+ 'Super-slicing, but low Kueue version',
339
+ dataclasses.replace(SUPER_SLICING_CASE, kueue_version='0.13.0'),
340
+ WorkloadScheduling.UNAVAILABLE,
341
+ ),
342
+ (
343
+ 'Super-slicing, but no super-slicing-topology',
344
+ dataclasses.replace(
345
+ SUPER_SLICING_CASE, super_slicing_topology_set=False
346
+ ),
347
+ WorkloadScheduling.UNAVAILABLE,
348
+ ),
349
+ (
350
+ 'Super-slicing, but workload too big',
351
+ dataclasses.replace(SUPER_SLICING_CASE, num_slices=100),
352
+ WorkloadScheduling.UNAVAILABLE,
353
+ ),
354
+ (
355
+ 'Super-slicing, but cluster system is incorrect',
356
+ dataclasses.replace(
357
+ SUPER_SLICING_CASE,
358
+ cluster_system=_get_system_characteristics_or_die(
359
+ 'tpu7x-4x4x8'
360
+ ),
361
+ ),
362
+ WorkloadScheduling.UNAVAILABLE,
363
+ ),
364
+ (
365
+ 'Super-slicing, but workload system is incorrect',
366
+ dataclasses.replace(
367
+ SUPER_SLICING_CASE,
368
+ workload_system=_get_system_characteristics_or_die('v6e-8'),
369
+ ),
370
+ WorkloadScheduling.UNAVAILABLE,
371
+ ),
372
+ (
373
+ (
374
+ 'Super-slicing should be ignored when a given device is already'
375
+ ' present in the cluster'
376
+ ),
377
+ dataclasses.replace(
378
+ SUPER_SLICING_CASE,
379
+ workload_system=_get_system_characteristics_or_die('tpu7x-64'),
380
+ cluster_system=_get_system_characteristics_or_die('tpu7x-64'),
381
+ resources_config_map={'tpu7x-64': '16'},
382
+ ),
383
+ WorkloadScheduling.AVAILABLE,
384
+ ),
286
385
  ],
287
386
  )
288
387
  def test_check_if_workload_can_schedule(
@@ -292,6 +391,7 @@ def test_check_if_workload_can_schedule(
292
391
  expected: WorkloadScheduling,
293
392
  ):
294
393
  FeatureFlags.SUB_SLICING_ENABLED = case.sub_slicing_feature_enabled
394
+ FeatureFlags.SUPER_SLICING_ENABLED = case.super_slicing_feature_enabled
295
395
  commands_tester.set_result_for_command(
296
396
  (
297
397
  0,
@@ -302,8 +402,13 @@ def test_check_if_workload_can_schedule(
302
402
  'kubectl get deployment',
303
403
  'image',
304
404
  )
405
+ topology_response = ''
406
+ if case.sub_slicing_topology_set:
407
+ topology_response = 'sub-slice-topology'
408
+ elif case.super_slicing_topology_set:
409
+ topology_response = 'super-slice-topology'
305
410
  commands_tester.set_result_for_command(
306
- (0, 'sub-slice-topology' if case.sub_slicing_topology_set else ''),
411
+ (0, topology_response),
307
412
  'kubectl get topology',
308
413
  )
309
414
  args = Namespace(
@@ -137,6 +137,7 @@ class SystemCharacteristics:
137
137
  device_type: A user-facing name for the specific hardware configuration
138
138
  (e.g., 'l4-1', 'h100-80gb-8').
139
139
  supports_sub_slicing: Whether the Sub-slicing feature is supported.
140
+ supports_super_slicing: Whether the Super-slicing feature is supported.
140
141
  requires_workload_policy: A boolean indicating if a GCE resource
141
142
  workload policy is required. This is automatically set to True for GPUs.
142
143
  """
@@ -149,6 +150,7 @@ class SystemCharacteristics:
149
150
  accelerator_type: AcceleratorType
150
151
  device_type: str
151
152
  supports_sub_slicing: bool
153
+ supports_super_slicing: bool
152
154
  docker_platform: DockerPlatform
153
155
  requires_workload_policy: bool = False
154
156
  gpu_config: Optional[GpuConfig] = None
@@ -239,14 +241,16 @@ def get_tpu_system_characteristics_map(
239
241
  gke_accelerator: str,
240
242
  machine_type: str,
241
243
  supported_topologies: list[str],
242
- supports_sub_slicing: bool,
243
244
  docker_platform: DockerPlatform,
244
245
  tpu_type_requires_workload_policy: bool = False,
245
246
  default_topologies: set[str] | None = None,
247
+ sub_slicing_topologies: set[str] | None = None,
248
+ super_slicing_topologies: set[str] | None = None,
246
249
  ) -> dict[str, SystemCharacteristics]:
247
250
  system_characteristics_map = {}
248
- if default_topologies is None:
249
- default_topologies = set()
251
+ default_topologies = default_topologies or set()
252
+ sub_slicing_topologies = sub_slicing_topologies or set()
253
+ super_slicing_topologies = super_slicing_topologies or set()
250
254
  for topology in supported_topologies:
251
255
  chips_per_vm = compute_chips_per_vm(topology)
252
256
  vms_per_slice = compute_vms_per_slice(topology)
@@ -262,7 +266,8 @@ def get_tpu_system_characteristics_map(
262
266
  device_type=device_type,
263
267
  requires_workload_policy=tpu_type_requires_workload_policy
264
268
  and vms_per_slice > 1,
265
- supports_sub_slicing=supports_sub_slicing,
269
+ supports_sub_slicing=topology in sub_slicing_topologies,
270
+ supports_super_slicing=topology in super_slicing_topologies,
266
271
  docker_platform=docker_platform,
267
272
  )
268
273
  system_characteristics_map[f'{prefix}-{topology}'] = system
@@ -306,6 +311,7 @@ UserFacingNameToSystemCharacteristics = {
306
311
  accelerator_type=AcceleratorType.GPU,
307
312
  device_type='l4-1',
308
313
  supports_sub_slicing=False,
314
+ supports_super_slicing=False,
309
315
  gpu_config=GpuConfig(requires_topology=False),
310
316
  docker_platform=AMD_PLATFORM,
311
317
  ),
@@ -318,6 +324,7 @@ UserFacingNameToSystemCharacteristics = {
318
324
  accelerator_type=AcceleratorType.GPU,
319
325
  device_type='l4-2',
320
326
  supports_sub_slicing=False,
327
+ supports_super_slicing=False,
321
328
  gpu_config=GpuConfig(requires_topology=False),
322
329
  docker_platform=AMD_PLATFORM,
323
330
  ),
@@ -330,6 +337,7 @@ UserFacingNameToSystemCharacteristics = {
330
337
  accelerator_type=AcceleratorType.GPU,
331
338
  device_type='l4-4',
332
339
  supports_sub_slicing=False,
340
+ supports_super_slicing=False,
333
341
  gpu_config=GpuConfig(requires_topology=False),
334
342
  docker_platform=AMD_PLATFORM,
335
343
  ),
@@ -342,6 +350,7 @@ UserFacingNameToSystemCharacteristics = {
342
350
  accelerator_type=AcceleratorType.GPU,
343
351
  device_type='l4-8',
344
352
  supports_sub_slicing=False,
353
+ supports_super_slicing=False,
345
354
  gpu_config=GpuConfig(requires_topology=False),
346
355
  docker_platform=AMD_PLATFORM,
347
356
  ),
@@ -355,6 +364,7 @@ UserFacingNameToSystemCharacteristics = {
355
364
  accelerator_type=AcceleratorType.GPU,
356
365
  device_type='a100-40gb-1',
357
366
  supports_sub_slicing=False,
367
+ supports_super_slicing=False,
358
368
  gpu_config=GpuConfig(requires_topology=False),
359
369
  docker_platform=AMD_PLATFORM,
360
370
  ),
@@ -367,6 +377,7 @@ UserFacingNameToSystemCharacteristics = {
367
377
  accelerator_type=AcceleratorType.GPU,
368
378
  device_type='a100-40gb-2',
369
379
  supports_sub_slicing=False,
380
+ supports_super_slicing=False,
370
381
  gpu_config=GpuConfig(requires_topology=False),
371
382
  docker_platform=AMD_PLATFORM,
372
383
  ),
@@ -379,6 +390,7 @@ UserFacingNameToSystemCharacteristics = {
379
390
  accelerator_type=AcceleratorType.GPU,
380
391
  device_type='a100-40gb-4',
381
392
  supports_sub_slicing=False,
393
+ supports_super_slicing=False,
382
394
  gpu_config=GpuConfig(requires_topology=False),
383
395
  docker_platform=AMD_PLATFORM,
384
396
  ),
@@ -391,6 +403,7 @@ UserFacingNameToSystemCharacteristics = {
391
403
  accelerator_type=AcceleratorType.GPU,
392
404
  device_type='a100-40gb-8',
393
405
  supports_sub_slicing=False,
406
+ supports_super_slicing=False,
394
407
  gpu_config=GpuConfig(requires_topology=False),
395
408
  docker_platform=AMD_PLATFORM,
396
409
  ),
@@ -403,6 +416,7 @@ UserFacingNameToSystemCharacteristics = {
403
416
  accelerator_type=AcceleratorType.GPU,
404
417
  device_type='gb200-4',
405
418
  supports_sub_slicing=False,
419
+ supports_super_slicing=False,
406
420
  gpu_config=GpuConfig(
407
421
  requires_topology=True,
408
422
  nccl_installer=INSTALLER_NCCL_RDMA_A4X,
@@ -421,6 +435,7 @@ UserFacingNameToSystemCharacteristics = {
421
435
  accelerator_type=AcceleratorType.GPU,
422
436
  device_type='gb200-4',
423
437
  supports_sub_slicing=False,
438
+ supports_super_slicing=False,
424
439
  gpu_config=GpuConfig(
425
440
  requires_topology=True,
426
441
  nccl_installer=INSTALLER_NCCL_RDMA_A4X,
@@ -439,6 +454,7 @@ UserFacingNameToSystemCharacteristics = {
439
454
  accelerator_type=AcceleratorType.GPU,
440
455
  device_type='b200-8',
441
456
  supports_sub_slicing=False,
457
+ supports_super_slicing=False,
442
458
  gpu_config=GpuConfig(
443
459
  requires_topology=True,
444
460
  nccl_installer=INSTALLER_NCCL_RDMA,
@@ -457,6 +473,7 @@ UserFacingNameToSystemCharacteristics = {
457
473
  accelerator_type=AcceleratorType.GPU,
458
474
  device_type='h200-141gb-8',
459
475
  supports_sub_slicing=False,
476
+ supports_super_slicing=False,
460
477
  gpu_config=GpuConfig(
461
478
  requires_topology=True,
462
479
  nccl_installer=INSTALLER_NCCL_RDMA,
@@ -476,6 +493,7 @@ UserFacingNameToSystemCharacteristics = {
476
493
  accelerator_type=AcceleratorType.GPU,
477
494
  device_type='h100-80gb-8',
478
495
  supports_sub_slicing=False,
496
+ supports_super_slicing=False,
479
497
  gpu_config=GpuConfig(
480
498
  requires_topology=True,
481
499
  nccl_installer=INSTALLER_NCCL_TCPX,
@@ -495,6 +513,7 @@ UserFacingNameToSystemCharacteristics = {
495
513
  accelerator_type=AcceleratorType.GPU,
496
514
  device_type='h100-mega-80gb-8',
497
515
  supports_sub_slicing=False,
516
+ supports_super_slicing=False,
498
517
  gpu_config=GpuConfig(
499
518
  requires_topology=True,
500
519
  nccl_installer=INSTALLER_NCCL_TCPXO,
@@ -512,7 +531,6 @@ UserFacingNameToSystemCharacteristics = {
512
531
  machine_type='tpu7x-standard-1t',
513
532
  supported_topologies=['1x1x1'],
514
533
  tpu_type_requires_workload_policy=True,
515
- supports_sub_slicing=False,
516
534
  docker_platform=AMD_PLATFORM,
517
535
  ),
518
536
  **get_tpu_system_characteristics_map(
@@ -521,9 +539,9 @@ UserFacingNameToSystemCharacteristics = {
521
539
  gke_accelerator='tpu7x',
522
540
  machine_type='tpu7x-standard-4t',
523
541
  tpu_type_requires_workload_policy=True,
524
- supports_sub_slicing=False,
525
542
  docker_platform=AMD_PLATFORM,
526
543
  supported_topologies=generate_tpu_topologies(max_cubes=144),
544
+ super_slicing_topologies=set(['4x4x4']),
527
545
  default_topologies=set([
528
546
  '12x12x12',
529
547
  '12x12x16',
@@ -630,7 +648,6 @@ UserFacingNameToSystemCharacteristics = {
630
648
  tensorcores_per_chip=1,
631
649
  gke_accelerator='tpu-v6e-slice',
632
650
  machine_type='ct6e-standard-1t',
633
- supports_sub_slicing=False,
634
651
  supported_topologies=['1x1'],
635
652
  docker_platform=AMD_PLATFORM,
636
653
  ),
@@ -639,19 +656,8 @@ UserFacingNameToSystemCharacteristics = {
639
656
  tensorcores_per_chip=1,
640
657
  gke_accelerator='tpu-v6e-slice',
641
658
  machine_type='ct6e-standard-4t',
642
- supports_sub_slicing=False,
643
- supported_topologies=[
644
- '2x2',
645
- ],
646
- docker_platform=AMD_PLATFORM,
647
- ),
648
- **get_tpu_system_characteristics_map(
649
- prefix='v6e',
650
- tensorcores_per_chip=1,
651
- gke_accelerator='tpu-v6e-slice',
652
- machine_type='ct6e-standard-4t',
653
- supports_sub_slicing=True,
654
- supported_topologies=SUB_SLICING_TOPOLOGIES,
659
+ supported_topologies=['2x2'] + SUB_SLICING_TOPOLOGIES,
660
+ sub_slicing_topologies=set(SUB_SLICING_TOPOLOGIES),
655
661
  docker_platform=AMD_PLATFORM,
656
662
  ),
657
663
  **get_tpu_system_characteristics_map(
@@ -659,7 +665,6 @@ UserFacingNameToSystemCharacteristics = {
659
665
  tensorcores_per_chip=2,
660
666
  gke_accelerator='tpu-v5p-slice',
661
667
  machine_type='ct5p-hightpu-4t',
662
- supports_sub_slicing=False,
663
668
  docker_platform=AMD_PLATFORM,
664
669
  supported_topologies=generate_tpu_topologies(max_cubes=140),
665
670
  default_topologies=set([
@@ -767,7 +772,6 @@ UserFacingNameToSystemCharacteristics = {
767
772
  gke_accelerator='tpu-v5-lite-podslice',
768
773
  machine_type='ct5lp-hightpu-4t',
769
774
  docker_platform=AMD_PLATFORM,
770
- supports_sub_slicing=False,
771
775
  supported_topologies=['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'],
772
776
  ),
773
777
  **get_tpu_system_characteristics_map(
@@ -776,7 +780,6 @@ UserFacingNameToSystemCharacteristics = {
776
780
  gke_accelerator='tpu-v4-podslice',
777
781
  machine_type='ct4p-hightpu-4t',
778
782
  docker_platform=AMD_PLATFORM,
779
- supports_sub_slicing=False,
780
783
  supported_topologies=generate_tpu_topologies(
781
784
  max_cubes=64, enforce_nondecreasing=False
782
785
  ),
@@ -807,6 +810,7 @@ UserFacingNameToSystemCharacteristics = {
807
810
  accelerator_type=AcceleratorType.CPU,
808
811
  device_type='m1-megamem-96-1',
809
812
  supports_sub_slicing=False,
813
+ supports_super_slicing=False,
810
814
  docker_platform=AMD_PLATFORM,
811
815
  ),
812
816
  # n2-standard-#vCPUs-#VMs
@@ -819,6 +823,7 @@ UserFacingNameToSystemCharacteristics = {
819
823
  accelerator_type=AcceleratorType.CPU,
820
824
  device_type='n2-standard-64-1',
821
825
  supports_sub_slicing=False,
826
+ supports_super_slicing=False,
822
827
  docker_platform=AMD_PLATFORM,
823
828
  ),
824
829
  'n2-standard-32-1': SystemCharacteristics(
@@ -830,6 +835,7 @@ UserFacingNameToSystemCharacteristics = {
830
835
  accelerator_type=AcceleratorType.CPU,
831
836
  device_type='n2-standard-32-1',
832
837
  supports_sub_slicing=False,
838
+ supports_super_slicing=False,
833
839
  docker_platform=AMD_PLATFORM,
834
840
  ),
835
841
  'n2-standard-32-2': SystemCharacteristics(
@@ -841,6 +847,7 @@ UserFacingNameToSystemCharacteristics = {
841
847
  accelerator_type=AcceleratorType.CPU,
842
848
  device_type='n2-standard-32-2',
843
849
  supports_sub_slicing=False,
850
+ supports_super_slicing=False,
844
851
  docker_platform=AMD_PLATFORM,
845
852
  ),
846
853
  'n2-standard-32-4': SystemCharacteristics(
@@ -852,6 +859,7 @@ UserFacingNameToSystemCharacteristics = {
852
859
  accelerator_type=AcceleratorType.CPU,
853
860
  device_type='n2-standard-32-4',
854
861
  supports_sub_slicing=False,
862
+ supports_super_slicing=False,
855
863
  docker_platform=AMD_PLATFORM,
856
864
  ),
857
865
  'n2-standard-32-8': SystemCharacteristics(
@@ -863,6 +871,7 @@ UserFacingNameToSystemCharacteristics = {
863
871
  accelerator_type=AcceleratorType.CPU,
864
872
  device_type='n2-standard-32-8',
865
873
  supports_sub_slicing=False,
874
+ supports_super_slicing=False,
866
875
  docker_platform=AMD_PLATFORM,
867
876
  ),
868
877
  'n2-standard-32-16': SystemCharacteristics(
@@ -874,6 +883,7 @@ UserFacingNameToSystemCharacteristics = {
874
883
  accelerator_type=AcceleratorType.CPU,
875
884
  device_type='n2-standard-32-16',
876
885
  supports_sub_slicing=False,
886
+ supports_super_slicing=False,
877
887
  docker_platform=AMD_PLATFORM,
878
888
  ),
879
889
  'n2-standard-32-32': SystemCharacteristics(
@@ -885,6 +895,7 @@ UserFacingNameToSystemCharacteristics = {
885
895
  accelerator_type=AcceleratorType.CPU,
886
896
  device_type='n2-standard-32-32',
887
897
  supports_sub_slicing=False,
898
+ supports_super_slicing=False,
888
899
  docker_platform=AMD_PLATFORM,
889
900
  ),
890
901
  'n2-standard-32-64': SystemCharacteristics(
@@ -896,6 +907,7 @@ UserFacingNameToSystemCharacteristics = {
896
907
  accelerator_type=AcceleratorType.CPU,
897
908
  device_type='n2-standard-32-64',
898
909
  supports_sub_slicing=False,
910
+ supports_super_slicing=False,
899
911
  docker_platform=AMD_PLATFORM,
900
912
  ),
901
913
  'n2-standard-32-128': SystemCharacteristics(
@@ -907,6 +919,7 @@ UserFacingNameToSystemCharacteristics = {
907
919
  accelerator_type=AcceleratorType.CPU,
908
920
  device_type='n2-standard-32-128',
909
921
  supports_sub_slicing=False,
922
+ supports_super_slicing=False,
910
923
  docker_platform=AMD_PLATFORM,
911
924
  ),
912
925
  'n2-standard-32-256': SystemCharacteristics(
@@ -918,6 +931,7 @@ UserFacingNameToSystemCharacteristics = {
918
931
  accelerator_type=AcceleratorType.CPU,
919
932
  device_type='n2-standard-32-256',
920
933
  supports_sub_slicing=False,
934
+ supports_super_slicing=False,
921
935
  docker_platform=AMD_PLATFORM,
922
936
  ),
923
937
  'n2-standard-32-512': SystemCharacteristics(
@@ -929,6 +943,7 @@ UserFacingNameToSystemCharacteristics = {
929
943
  accelerator_type=AcceleratorType.CPU,
930
944
  device_type='n2-standard-32-512',
931
945
  supports_sub_slicing=False,
946
+ supports_super_slicing=False,
932
947
  docker_platform=AMD_PLATFORM,
933
948
  ),
934
949
  'n2-standard-32-1024': SystemCharacteristics(
@@ -940,6 +955,7 @@ UserFacingNameToSystemCharacteristics = {
940
955
  accelerator_type=AcceleratorType.CPU,
941
956
  device_type='n2-standard-32-1024',
942
957
  supports_sub_slicing=False,
958
+ supports_super_slicing=False,
943
959
  docker_platform=AMD_PLATFORM,
944
960
  ),
945
961
  'n2-standard-32-2048': SystemCharacteristics(
@@ -951,6 +967,7 @@ UserFacingNameToSystemCharacteristics = {
951
967
  accelerator_type=AcceleratorType.CPU,
952
968
  device_type='n2-standard-32-2048',
953
969
  supports_sub_slicing=False,
970
+ supports_super_slicing=False,
954
971
  docker_platform=AMD_PLATFORM,
955
972
  ),
956
973
  }
@@ -983,7 +1000,7 @@ def create_accelerator_label(system: SystemCharacteristics) -> str:
983
1000
  def create_machine_label(system: SystemCharacteristics) -> str:
984
1001
  if system.accelerator_type == AcceleratorType.TPU:
985
1002
  return (
986
- f'{AcceleratorTypeToAcceleratorCharacteristics[system.accelerator_type].machine_label}:'
1003
+ f'{AcceleratorTypeToAcceleratorCharacteristics[AcceleratorType.TPU].machine_label}:'
987
1004
  f' {system.topology}'
988
1005
  )
989
1006
  return ''