xpk 0.16.1__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +48 -5
- xpk/commands/cluster_gcluster.py +3 -0
- xpk/commands/cluster_gcluster_test.py +2 -0
- xpk/commands/cluster_test.py +203 -0
- xpk/commands/common.py +6 -0
- xpk/commands/kind.py +2 -0
- xpk/commands/workload.py +35 -15
- xpk/commands/workload_test.py +1 -0
- xpk/core/capacity.py +83 -46
- xpk/core/capacity_test.py +82 -28
- xpk/core/commands.py +39 -12
- xpk/core/kueue_manager.py +42 -11
- xpk/core/kueue_manager_test.py +83 -3
- xpk/core/nap.py +5 -4
- xpk/core/nodepool.py +57 -20
- xpk/core/nodepool_test.py +152 -23
- xpk/core/pathways.py +2 -1
- xpk/core/resources.py +3 -3
- xpk/core/scheduling.py +54 -10
- xpk/core/scheduling_test.py +118 -13
- xpk/core/system_characteristics.py +41 -24
- xpk/core/system_characteristics_test.py +37 -4
- xpk/core/telemetry.py +5 -0
- xpk/core/telemetry_test.py +19 -2
- xpk/core/updates.py +1 -1
- xpk/main.py +2 -1
- xpk/parser/cluster.py +34 -2
- xpk/parser/cluster_test.py +117 -0
- xpk/parser/common.py +32 -0
- xpk/parser/common_test.py +49 -0
- xpk/templates/kueue_config.yaml.j2 +21 -5
- xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
- xpk/utils/kueue.py +6 -2
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/METADATA +2 -1
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/RECORD +39 -37
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/WHEEL +0 -0
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/top_level.txt +0 -0
xpk/core/scheduling.py
CHANGED
|
@@ -16,7 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from enum import Enum
|
|
18
18
|
|
|
19
|
-
from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled
|
|
19
|
+
from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled, has_super_slicing_enabled
|
|
20
20
|
from ..utils.feature_flags import FeatureFlags
|
|
21
21
|
from ..utils.topology import get_slice_topology_level
|
|
22
22
|
from ..utils.console import xpk_print
|
|
@@ -33,12 +33,14 @@ from .system_characteristics import (
|
|
|
33
33
|
from packaging.version import Version
|
|
34
34
|
|
|
35
35
|
_SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
|
|
36
|
+
_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.14.0')
|
|
36
37
|
|
|
37
38
|
|
|
38
39
|
class WorkloadScheduling(Enum):
|
|
39
40
|
UNAVAILABLE = 0
|
|
40
41
|
AVAILABLE = 1
|
|
41
42
|
SUB_SLICING_AVAILABLE = 2
|
|
43
|
+
SUPER_SLICING_AVAILABLE = 3
|
|
42
44
|
|
|
43
45
|
|
|
44
46
|
def check_if_workload_can_schedule(
|
|
@@ -94,10 +96,9 @@ def check_if_workload_can_schedule(
|
|
|
94
96
|
else:
|
|
95
97
|
return WorkloadScheduling.UNAVAILABLE
|
|
96
98
|
|
|
97
|
-
if _check_sub_slicing_availability(
|
|
99
|
+
if cluster_system and _check_sub_slicing_availability(
|
|
98
100
|
workload_system=workload_system, cluster_system=cluster_system
|
|
99
101
|
):
|
|
100
|
-
assert cluster_system
|
|
101
102
|
if _check_workload_size_fits(
|
|
102
103
|
args,
|
|
103
104
|
workload_system,
|
|
@@ -107,6 +108,18 @@ def check_if_workload_can_schedule(
|
|
|
107
108
|
else:
|
|
108
109
|
return WorkloadScheduling.UNAVAILABLE
|
|
109
110
|
|
|
111
|
+
if cluster_system and _check_super_slicing_availability(
|
|
112
|
+
workload_system=workload_system, cluster_system=cluster_system
|
|
113
|
+
):
|
|
114
|
+
if _check_workload_size_fits(
|
|
115
|
+
args,
|
|
116
|
+
workload_system,
|
|
117
|
+
max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
|
|
118
|
+
):
|
|
119
|
+
return WorkloadScheduling.SUPER_SLICING_AVAILABLE
|
|
120
|
+
else:
|
|
121
|
+
return WorkloadScheduling.UNAVAILABLE
|
|
122
|
+
|
|
110
123
|
xpk_print(
|
|
111
124
|
'Workload scheduling validation failed. XPK will not create the workload'
|
|
112
125
|
f' {args.workload}.'
|
|
@@ -147,11 +160,10 @@ def _check_workload_size_fits(
|
|
|
147
160
|
|
|
148
161
|
def _check_sub_slicing_availability(
|
|
149
162
|
workload_system: SystemCharacteristics,
|
|
150
|
-
cluster_system: SystemCharacteristics
|
|
163
|
+
cluster_system: SystemCharacteristics,
|
|
151
164
|
) -> bool:
|
|
152
165
|
if (
|
|
153
166
|
(not FeatureFlags.SUB_SLICING_ENABLED)
|
|
154
|
-
or (not cluster_system)
|
|
155
167
|
or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
|
|
156
168
|
or (not cluster_system.supports_sub_slicing)
|
|
157
169
|
or (workload_system.topology not in SUB_SLICING_TOPOLOGIES)
|
|
@@ -163,7 +175,7 @@ def _check_sub_slicing_availability(
|
|
|
163
175
|
return False
|
|
164
176
|
|
|
165
177
|
return_code, current_version = get_installed_kueue_version(
|
|
166
|
-
dry_run_version=
|
|
178
|
+
dry_run_version=_SUB_SLICING_MINIMUM_KUEUE_VERSION
|
|
167
179
|
)
|
|
168
180
|
|
|
169
181
|
return (
|
|
@@ -173,6 +185,33 @@ def _check_sub_slicing_availability(
|
|
|
173
185
|
)
|
|
174
186
|
|
|
175
187
|
|
|
188
|
+
def _check_super_slicing_availability(
|
|
189
|
+
workload_system: SystemCharacteristics,
|
|
190
|
+
cluster_system: SystemCharacteristics,
|
|
191
|
+
) -> bool:
|
|
192
|
+
# TODO: b/465447813 - Add super-slicing workload topology validation.
|
|
193
|
+
if (
|
|
194
|
+
(not FeatureFlags.SUPER_SLICING_ENABLED)
|
|
195
|
+
or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
|
|
196
|
+
or (not cluster_system.supports_super_slicing)
|
|
197
|
+
):
|
|
198
|
+
return False
|
|
199
|
+
|
|
200
|
+
return_code, sub_slicing_enabled = has_super_slicing_enabled()
|
|
201
|
+
if return_code != 0 or not sub_slicing_enabled:
|
|
202
|
+
return False
|
|
203
|
+
|
|
204
|
+
return_code, current_version = get_installed_kueue_version(
|
|
205
|
+
dry_run_version=_SUPER_SLICING_MINIMUM_KUEUE_VERSION
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
return (
|
|
209
|
+
return_code == 0
|
|
210
|
+
and current_version is not None
|
|
211
|
+
and current_version >= _SUPER_SLICING_MINIMUM_KUEUE_VERSION
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
|
|
176
215
|
def get_total_chips_requested_from_args(
|
|
177
216
|
args, system: SystemCharacteristics
|
|
178
217
|
) -> int:
|
|
@@ -303,13 +342,18 @@ def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
|
|
|
303
342
|
]
|
|
304
343
|
|
|
305
344
|
|
|
306
|
-
def create_placement_policy_label(
|
|
307
|
-
|
|
345
|
+
def create_placement_policy_label(
|
|
346
|
+
system: SystemCharacteristics, super_slicing: bool
|
|
347
|
+
) -> str:
|
|
348
|
+
name = get_placement_policy_name(system, super_slicing)
|
|
308
349
|
return f'cloud.google.com/placement-policy-name: {name}'
|
|
309
350
|
|
|
310
351
|
|
|
311
|
-
def get_placement_policy_name(
|
|
312
|
-
|
|
352
|
+
def get_placement_policy_name(
|
|
353
|
+
system: SystemCharacteristics, super_slicing: bool
|
|
354
|
+
) -> str:
|
|
355
|
+
super_slicing_part = '-ss' if super_slicing else ''
|
|
356
|
+
return f'{system.device_type}-{system.topology}{super_slicing_part}-placement-policy'
|
|
313
357
|
|
|
314
358
|
|
|
315
359
|
def is_placement_policy_supported(system: SystemCharacteristics) -> bool:
|
xpk/core/scheduling_test.py
CHANGED
|
@@ -65,9 +65,12 @@ def test_create_placement_policy_label_returns_valid_label():
|
|
|
65
65
|
device_type='tpu7x',
|
|
66
66
|
accelerator_type=AcceleratorType.TPU,
|
|
67
67
|
supports_sub_slicing=False,
|
|
68
|
+
supports_super_slicing=False,
|
|
68
69
|
docker_platform=DockerPlatform.ARM,
|
|
69
70
|
)
|
|
70
|
-
label = create_placement_policy_label(
|
|
71
|
+
label = create_placement_policy_label(
|
|
72
|
+
system_characteristics, super_slicing=False
|
|
73
|
+
)
|
|
71
74
|
assert (
|
|
72
75
|
label
|
|
73
76
|
== 'cloud.google.com/placement-policy-name: tpu7x-1x1x1-placement-policy'
|
|
@@ -85,12 +88,31 @@ def test_get_placement_policy_name_returns_valid_name():
|
|
|
85
88
|
device_type='tpu7x',
|
|
86
89
|
accelerator_type=AcceleratorType.TPU,
|
|
87
90
|
supports_sub_slicing=False,
|
|
91
|
+
supports_super_slicing=False,
|
|
88
92
|
docker_platform=DockerPlatform.ARM,
|
|
89
93
|
)
|
|
90
|
-
name = get_placement_policy_name(system_characteristics)
|
|
94
|
+
name = get_placement_policy_name(system_characteristics, super_slicing=False)
|
|
91
95
|
assert name == 'tpu7x-1x1x1-placement-policy'
|
|
92
96
|
|
|
93
97
|
|
|
98
|
+
def test_get_placement_policy_name_super_slicing_returns_valid_name():
|
|
99
|
+
system_characteristics = SystemCharacteristics(
|
|
100
|
+
chips_per_vm=1,
|
|
101
|
+
gce_machine_type='tpu7x-standard-1t',
|
|
102
|
+
gke_accelerator='tpu7x',
|
|
103
|
+
requires_workload_policy=False,
|
|
104
|
+
topology='1x1x1',
|
|
105
|
+
vms_per_slice=1,
|
|
106
|
+
device_type='tpu7x',
|
|
107
|
+
accelerator_type=AcceleratorType.TPU,
|
|
108
|
+
supports_sub_slicing=False,
|
|
109
|
+
supports_super_slicing=False,
|
|
110
|
+
docker_platform=DockerPlatform.ARM,
|
|
111
|
+
)
|
|
112
|
+
name = get_placement_policy_name(system_characteristics, super_slicing=True)
|
|
113
|
+
assert name == 'tpu7x-1x1x1-ss-placement-policy'
|
|
114
|
+
|
|
115
|
+
|
|
94
116
|
def test_is_placement_policy_supported_returns_true_for_system_characteristics_supporting_workload_policy_and_having_valid_topology():
|
|
95
117
|
system_characteristics = SystemCharacteristics(
|
|
96
118
|
chips_per_vm=1,
|
|
@@ -102,6 +124,7 @@ def test_is_placement_policy_supported_returns_true_for_system_characteristics_s
|
|
|
102
124
|
device_type='tpu7x',
|
|
103
125
|
accelerator_type=AcceleratorType.TPU,
|
|
104
126
|
supports_sub_slicing=False,
|
|
127
|
+
supports_super_slicing=False,
|
|
105
128
|
docker_platform=DockerPlatform.ARM,
|
|
106
129
|
)
|
|
107
130
|
assert is_placement_policy_supported(system_characteristics) is True
|
|
@@ -118,6 +141,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
|
|
|
118
141
|
device_type='tpu7x',
|
|
119
142
|
accelerator_type=AcceleratorType.TPU,
|
|
120
143
|
supports_sub_slicing=False,
|
|
144
|
+
supports_super_slicing=False,
|
|
121
145
|
docker_platform=DockerPlatform.ARM,
|
|
122
146
|
)
|
|
123
147
|
assert is_placement_policy_supported(system_characteristics) is False
|
|
@@ -134,6 +158,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
|
|
|
134
158
|
device_type='tpu7x',
|
|
135
159
|
accelerator_type=AcceleratorType.TPU,
|
|
136
160
|
supports_sub_slicing=False,
|
|
161
|
+
supports_super_slicing=False,
|
|
137
162
|
docker_platform=DockerPlatform.ARM,
|
|
138
163
|
)
|
|
139
164
|
assert is_placement_policy_supported(system_characteristics) is False
|
|
@@ -145,28 +170,42 @@ class SchedulingTestCase:
|
|
|
145
170
|
num_slices: int = 1
|
|
146
171
|
cluster_system: SystemCharacteristics | None = None
|
|
147
172
|
resources_config_map: dict[str, str] | None = None
|
|
148
|
-
sub_slicing_feature_enabled: bool = False
|
|
149
173
|
kueue_version: str | None = None
|
|
174
|
+
sub_slicing_feature_enabled: bool = False
|
|
150
175
|
sub_slicing_topology_set: bool = False
|
|
176
|
+
super_slicing_feature_enabled: bool = False
|
|
177
|
+
super_slicing_topology_set: bool = False
|
|
151
178
|
|
|
152
179
|
|
|
180
|
+
NAP_CASE = SchedulingTestCase(
|
|
181
|
+
workload_system=_get_system_characteristics_or_die('v6e-8'),
|
|
182
|
+
cluster_system=None,
|
|
183
|
+
resources_config_map={
|
|
184
|
+
'tpu-v6e-slice': AUTOPROVISIONING_CONFIG_VALUE,
|
|
185
|
+
AUTOPROVISIONING_CONFIG_MAXIMUM_KEY: '10',
|
|
186
|
+
},
|
|
187
|
+
)
|
|
188
|
+
|
|
153
189
|
SUB_SLICING_CASE = SchedulingTestCase(
|
|
154
190
|
workload_system=_get_system_characteristics_or_die('v6e-8'),
|
|
155
191
|
cluster_system=_get_system_characteristics_or_die('v6e-16'),
|
|
156
|
-
|
|
157
|
-
|
|
192
|
+
# 2 slices:
|
|
193
|
+
resources_config_map={'v6e-16': str(8 // 4 * 2)},
|
|
158
194
|
kueue_version='0.13.0',
|
|
195
|
+
sub_slicing_feature_enabled=True,
|
|
159
196
|
sub_slicing_topology_set=True,
|
|
160
197
|
num_slices=1,
|
|
161
198
|
)
|
|
162
199
|
|
|
163
|
-
|
|
164
|
-
workload_system=_get_system_characteristics_or_die('
|
|
165
|
-
cluster_system=
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
200
|
+
SUPER_SLICING_CASE = SchedulingTestCase(
|
|
201
|
+
workload_system=_get_system_characteristics_or_die('tpu7x-4x4x16'),
|
|
202
|
+
cluster_system=_get_system_characteristics_or_die('tpu7x-4x4x4'),
|
|
203
|
+
# 5 4x4x4 cubes:
|
|
204
|
+
resources_config_map={'tpu7x-128': str(64 // 4 * 5)},
|
|
205
|
+
kueue_version='0.14.0',
|
|
206
|
+
super_slicing_feature_enabled=True,
|
|
207
|
+
super_slicing_topology_set=True,
|
|
208
|
+
num_slices=1,
|
|
170
209
|
)
|
|
171
210
|
|
|
172
211
|
|
|
@@ -283,6 +322,66 @@ NAP_CASE = SchedulingTestCase(
|
|
|
283
322
|
),
|
|
284
323
|
WorkloadScheduling.AVAILABLE,
|
|
285
324
|
),
|
|
325
|
+
(
|
|
326
|
+
'Correct Super-slicing',
|
|
327
|
+
SUPER_SLICING_CASE,
|
|
328
|
+
WorkloadScheduling.SUPER_SLICING_AVAILABLE,
|
|
329
|
+
),
|
|
330
|
+
(
|
|
331
|
+
'Super-slicing, but disabled flag',
|
|
332
|
+
dataclasses.replace(
|
|
333
|
+
SUPER_SLICING_CASE, super_slicing_feature_enabled=False
|
|
334
|
+
),
|
|
335
|
+
WorkloadScheduling.UNAVAILABLE,
|
|
336
|
+
),
|
|
337
|
+
(
|
|
338
|
+
'Super-slicing, but low Kueue version',
|
|
339
|
+
dataclasses.replace(SUPER_SLICING_CASE, kueue_version='0.13.0'),
|
|
340
|
+
WorkloadScheduling.UNAVAILABLE,
|
|
341
|
+
),
|
|
342
|
+
(
|
|
343
|
+
'Super-slicing, but no super-slicing-topology',
|
|
344
|
+
dataclasses.replace(
|
|
345
|
+
SUPER_SLICING_CASE, super_slicing_topology_set=False
|
|
346
|
+
),
|
|
347
|
+
WorkloadScheduling.UNAVAILABLE,
|
|
348
|
+
),
|
|
349
|
+
(
|
|
350
|
+
'Super-slicing, but workload too big',
|
|
351
|
+
dataclasses.replace(SUPER_SLICING_CASE, num_slices=100),
|
|
352
|
+
WorkloadScheduling.UNAVAILABLE,
|
|
353
|
+
),
|
|
354
|
+
(
|
|
355
|
+
'Super-slicing, but cluster system is incorrect',
|
|
356
|
+
dataclasses.replace(
|
|
357
|
+
SUPER_SLICING_CASE,
|
|
358
|
+
cluster_system=_get_system_characteristics_or_die(
|
|
359
|
+
'tpu7x-4x4x8'
|
|
360
|
+
),
|
|
361
|
+
),
|
|
362
|
+
WorkloadScheduling.UNAVAILABLE,
|
|
363
|
+
),
|
|
364
|
+
(
|
|
365
|
+
'Super-slicing, but workload system is incorrect',
|
|
366
|
+
dataclasses.replace(
|
|
367
|
+
SUPER_SLICING_CASE,
|
|
368
|
+
workload_system=_get_system_characteristics_or_die('v6e-8'),
|
|
369
|
+
),
|
|
370
|
+
WorkloadScheduling.UNAVAILABLE,
|
|
371
|
+
),
|
|
372
|
+
(
|
|
373
|
+
(
|
|
374
|
+
'Super-slicing should be ignored when a given device is already'
|
|
375
|
+
' present in the cluster'
|
|
376
|
+
),
|
|
377
|
+
dataclasses.replace(
|
|
378
|
+
SUPER_SLICING_CASE,
|
|
379
|
+
workload_system=_get_system_characteristics_or_die('tpu7x-64'),
|
|
380
|
+
cluster_system=_get_system_characteristics_or_die('tpu7x-64'),
|
|
381
|
+
resources_config_map={'tpu7x-64': '16'},
|
|
382
|
+
),
|
|
383
|
+
WorkloadScheduling.AVAILABLE,
|
|
384
|
+
),
|
|
286
385
|
],
|
|
287
386
|
)
|
|
288
387
|
def test_check_if_workload_can_schedule(
|
|
@@ -292,6 +391,7 @@ def test_check_if_workload_can_schedule(
|
|
|
292
391
|
expected: WorkloadScheduling,
|
|
293
392
|
):
|
|
294
393
|
FeatureFlags.SUB_SLICING_ENABLED = case.sub_slicing_feature_enabled
|
|
394
|
+
FeatureFlags.SUPER_SLICING_ENABLED = case.super_slicing_feature_enabled
|
|
295
395
|
commands_tester.set_result_for_command(
|
|
296
396
|
(
|
|
297
397
|
0,
|
|
@@ -302,8 +402,13 @@ def test_check_if_workload_can_schedule(
|
|
|
302
402
|
'kubectl get deployment',
|
|
303
403
|
'image',
|
|
304
404
|
)
|
|
405
|
+
topology_response = ''
|
|
406
|
+
if case.sub_slicing_topology_set:
|
|
407
|
+
topology_response = 'sub-slice-topology'
|
|
408
|
+
elif case.super_slicing_topology_set:
|
|
409
|
+
topology_response = 'super-slice-topology'
|
|
305
410
|
commands_tester.set_result_for_command(
|
|
306
|
-
(0,
|
|
411
|
+
(0, topology_response),
|
|
307
412
|
'kubectl get topology',
|
|
308
413
|
)
|
|
309
414
|
args = Namespace(
|
|
@@ -137,6 +137,7 @@ class SystemCharacteristics:
|
|
|
137
137
|
device_type: A user-facing name for the specific hardware configuration
|
|
138
138
|
(e.g., 'l4-1', 'h100-80gb-8').
|
|
139
139
|
supports_sub_slicing: Whether the Sub-slicing feature is supported.
|
|
140
|
+
supports_super_slicing: Whether the Super-slicing feature is supported.
|
|
140
141
|
requires_workload_policy: A boolean indicating if a GCE resource
|
|
141
142
|
workload policy is required. This is automatically set to True for GPUs.
|
|
142
143
|
"""
|
|
@@ -149,6 +150,7 @@ class SystemCharacteristics:
|
|
|
149
150
|
accelerator_type: AcceleratorType
|
|
150
151
|
device_type: str
|
|
151
152
|
supports_sub_slicing: bool
|
|
153
|
+
supports_super_slicing: bool
|
|
152
154
|
docker_platform: DockerPlatform
|
|
153
155
|
requires_workload_policy: bool = False
|
|
154
156
|
gpu_config: Optional[GpuConfig] = None
|
|
@@ -239,14 +241,16 @@ def get_tpu_system_characteristics_map(
|
|
|
239
241
|
gke_accelerator: str,
|
|
240
242
|
machine_type: str,
|
|
241
243
|
supported_topologies: list[str],
|
|
242
|
-
supports_sub_slicing: bool,
|
|
243
244
|
docker_platform: DockerPlatform,
|
|
244
245
|
tpu_type_requires_workload_policy: bool = False,
|
|
245
246
|
default_topologies: set[str] | None = None,
|
|
247
|
+
sub_slicing_topologies: set[str] | None = None,
|
|
248
|
+
super_slicing_topologies: set[str] | None = None,
|
|
246
249
|
) -> dict[str, SystemCharacteristics]:
|
|
247
250
|
system_characteristics_map = {}
|
|
248
|
-
|
|
249
|
-
|
|
251
|
+
default_topologies = default_topologies or set()
|
|
252
|
+
sub_slicing_topologies = sub_slicing_topologies or set()
|
|
253
|
+
super_slicing_topologies = super_slicing_topologies or set()
|
|
250
254
|
for topology in supported_topologies:
|
|
251
255
|
chips_per_vm = compute_chips_per_vm(topology)
|
|
252
256
|
vms_per_slice = compute_vms_per_slice(topology)
|
|
@@ -262,7 +266,8 @@ def get_tpu_system_characteristics_map(
|
|
|
262
266
|
device_type=device_type,
|
|
263
267
|
requires_workload_policy=tpu_type_requires_workload_policy
|
|
264
268
|
and vms_per_slice > 1,
|
|
265
|
-
supports_sub_slicing=
|
|
269
|
+
supports_sub_slicing=topology in sub_slicing_topologies,
|
|
270
|
+
supports_super_slicing=topology in super_slicing_topologies,
|
|
266
271
|
docker_platform=docker_platform,
|
|
267
272
|
)
|
|
268
273
|
system_characteristics_map[f'{prefix}-{topology}'] = system
|
|
@@ -306,6 +311,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
306
311
|
accelerator_type=AcceleratorType.GPU,
|
|
307
312
|
device_type='l4-1',
|
|
308
313
|
supports_sub_slicing=False,
|
|
314
|
+
supports_super_slicing=False,
|
|
309
315
|
gpu_config=GpuConfig(requires_topology=False),
|
|
310
316
|
docker_platform=AMD_PLATFORM,
|
|
311
317
|
),
|
|
@@ -318,6 +324,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
318
324
|
accelerator_type=AcceleratorType.GPU,
|
|
319
325
|
device_type='l4-2',
|
|
320
326
|
supports_sub_slicing=False,
|
|
327
|
+
supports_super_slicing=False,
|
|
321
328
|
gpu_config=GpuConfig(requires_topology=False),
|
|
322
329
|
docker_platform=AMD_PLATFORM,
|
|
323
330
|
),
|
|
@@ -330,6 +337,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
330
337
|
accelerator_type=AcceleratorType.GPU,
|
|
331
338
|
device_type='l4-4',
|
|
332
339
|
supports_sub_slicing=False,
|
|
340
|
+
supports_super_slicing=False,
|
|
333
341
|
gpu_config=GpuConfig(requires_topology=False),
|
|
334
342
|
docker_platform=AMD_PLATFORM,
|
|
335
343
|
),
|
|
@@ -342,6 +350,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
342
350
|
accelerator_type=AcceleratorType.GPU,
|
|
343
351
|
device_type='l4-8',
|
|
344
352
|
supports_sub_slicing=False,
|
|
353
|
+
supports_super_slicing=False,
|
|
345
354
|
gpu_config=GpuConfig(requires_topology=False),
|
|
346
355
|
docker_platform=AMD_PLATFORM,
|
|
347
356
|
),
|
|
@@ -355,6 +364,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
355
364
|
accelerator_type=AcceleratorType.GPU,
|
|
356
365
|
device_type='a100-40gb-1',
|
|
357
366
|
supports_sub_slicing=False,
|
|
367
|
+
supports_super_slicing=False,
|
|
358
368
|
gpu_config=GpuConfig(requires_topology=False),
|
|
359
369
|
docker_platform=AMD_PLATFORM,
|
|
360
370
|
),
|
|
@@ -367,6 +377,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
367
377
|
accelerator_type=AcceleratorType.GPU,
|
|
368
378
|
device_type='a100-40gb-2',
|
|
369
379
|
supports_sub_slicing=False,
|
|
380
|
+
supports_super_slicing=False,
|
|
370
381
|
gpu_config=GpuConfig(requires_topology=False),
|
|
371
382
|
docker_platform=AMD_PLATFORM,
|
|
372
383
|
),
|
|
@@ -379,6 +390,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
379
390
|
accelerator_type=AcceleratorType.GPU,
|
|
380
391
|
device_type='a100-40gb-4',
|
|
381
392
|
supports_sub_slicing=False,
|
|
393
|
+
supports_super_slicing=False,
|
|
382
394
|
gpu_config=GpuConfig(requires_topology=False),
|
|
383
395
|
docker_platform=AMD_PLATFORM,
|
|
384
396
|
),
|
|
@@ -391,6 +403,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
391
403
|
accelerator_type=AcceleratorType.GPU,
|
|
392
404
|
device_type='a100-40gb-8',
|
|
393
405
|
supports_sub_slicing=False,
|
|
406
|
+
supports_super_slicing=False,
|
|
394
407
|
gpu_config=GpuConfig(requires_topology=False),
|
|
395
408
|
docker_platform=AMD_PLATFORM,
|
|
396
409
|
),
|
|
@@ -403,6 +416,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
403
416
|
accelerator_type=AcceleratorType.GPU,
|
|
404
417
|
device_type='gb200-4',
|
|
405
418
|
supports_sub_slicing=False,
|
|
419
|
+
supports_super_slicing=False,
|
|
406
420
|
gpu_config=GpuConfig(
|
|
407
421
|
requires_topology=True,
|
|
408
422
|
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
@@ -421,6 +435,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
421
435
|
accelerator_type=AcceleratorType.GPU,
|
|
422
436
|
device_type='gb200-4',
|
|
423
437
|
supports_sub_slicing=False,
|
|
438
|
+
supports_super_slicing=False,
|
|
424
439
|
gpu_config=GpuConfig(
|
|
425
440
|
requires_topology=True,
|
|
426
441
|
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
@@ -439,6 +454,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
439
454
|
accelerator_type=AcceleratorType.GPU,
|
|
440
455
|
device_type='b200-8',
|
|
441
456
|
supports_sub_slicing=False,
|
|
457
|
+
supports_super_slicing=False,
|
|
442
458
|
gpu_config=GpuConfig(
|
|
443
459
|
requires_topology=True,
|
|
444
460
|
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
@@ -457,6 +473,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
457
473
|
accelerator_type=AcceleratorType.GPU,
|
|
458
474
|
device_type='h200-141gb-8',
|
|
459
475
|
supports_sub_slicing=False,
|
|
476
|
+
supports_super_slicing=False,
|
|
460
477
|
gpu_config=GpuConfig(
|
|
461
478
|
requires_topology=True,
|
|
462
479
|
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
@@ -476,6 +493,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
476
493
|
accelerator_type=AcceleratorType.GPU,
|
|
477
494
|
device_type='h100-80gb-8',
|
|
478
495
|
supports_sub_slicing=False,
|
|
496
|
+
supports_super_slicing=False,
|
|
479
497
|
gpu_config=GpuConfig(
|
|
480
498
|
requires_topology=True,
|
|
481
499
|
nccl_installer=INSTALLER_NCCL_TCPX,
|
|
@@ -495,6 +513,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
495
513
|
accelerator_type=AcceleratorType.GPU,
|
|
496
514
|
device_type='h100-mega-80gb-8',
|
|
497
515
|
supports_sub_slicing=False,
|
|
516
|
+
supports_super_slicing=False,
|
|
498
517
|
gpu_config=GpuConfig(
|
|
499
518
|
requires_topology=True,
|
|
500
519
|
nccl_installer=INSTALLER_NCCL_TCPXO,
|
|
@@ -512,7 +531,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
512
531
|
machine_type='tpu7x-standard-1t',
|
|
513
532
|
supported_topologies=['1x1x1'],
|
|
514
533
|
tpu_type_requires_workload_policy=True,
|
|
515
|
-
supports_sub_slicing=False,
|
|
516
534
|
docker_platform=AMD_PLATFORM,
|
|
517
535
|
),
|
|
518
536
|
**get_tpu_system_characteristics_map(
|
|
@@ -521,9 +539,9 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
521
539
|
gke_accelerator='tpu7x',
|
|
522
540
|
machine_type='tpu7x-standard-4t',
|
|
523
541
|
tpu_type_requires_workload_policy=True,
|
|
524
|
-
supports_sub_slicing=False,
|
|
525
542
|
docker_platform=AMD_PLATFORM,
|
|
526
543
|
supported_topologies=generate_tpu_topologies(max_cubes=144),
|
|
544
|
+
super_slicing_topologies=set(['4x4x4']),
|
|
527
545
|
default_topologies=set([
|
|
528
546
|
'12x12x12',
|
|
529
547
|
'12x12x16',
|
|
@@ -630,7 +648,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
630
648
|
tensorcores_per_chip=1,
|
|
631
649
|
gke_accelerator='tpu-v6e-slice',
|
|
632
650
|
machine_type='ct6e-standard-1t',
|
|
633
|
-
supports_sub_slicing=False,
|
|
634
651
|
supported_topologies=['1x1'],
|
|
635
652
|
docker_platform=AMD_PLATFORM,
|
|
636
653
|
),
|
|
@@ -639,19 +656,8 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
639
656
|
tensorcores_per_chip=1,
|
|
640
657
|
gke_accelerator='tpu-v6e-slice',
|
|
641
658
|
machine_type='ct6e-standard-4t',
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
'2x2',
|
|
645
|
-
],
|
|
646
|
-
docker_platform=AMD_PLATFORM,
|
|
647
|
-
),
|
|
648
|
-
**get_tpu_system_characteristics_map(
|
|
649
|
-
prefix='v6e',
|
|
650
|
-
tensorcores_per_chip=1,
|
|
651
|
-
gke_accelerator='tpu-v6e-slice',
|
|
652
|
-
machine_type='ct6e-standard-4t',
|
|
653
|
-
supports_sub_slicing=True,
|
|
654
|
-
supported_topologies=SUB_SLICING_TOPOLOGIES,
|
|
659
|
+
supported_topologies=['2x2'] + SUB_SLICING_TOPOLOGIES,
|
|
660
|
+
sub_slicing_topologies=set(SUB_SLICING_TOPOLOGIES),
|
|
655
661
|
docker_platform=AMD_PLATFORM,
|
|
656
662
|
),
|
|
657
663
|
**get_tpu_system_characteristics_map(
|
|
@@ -659,7 +665,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
659
665
|
tensorcores_per_chip=2,
|
|
660
666
|
gke_accelerator='tpu-v5p-slice',
|
|
661
667
|
machine_type='ct5p-hightpu-4t',
|
|
662
|
-
supports_sub_slicing=False,
|
|
663
668
|
docker_platform=AMD_PLATFORM,
|
|
664
669
|
supported_topologies=generate_tpu_topologies(max_cubes=140),
|
|
665
670
|
default_topologies=set([
|
|
@@ -767,7 +772,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
767
772
|
gke_accelerator='tpu-v5-lite-podslice',
|
|
768
773
|
machine_type='ct5lp-hightpu-4t',
|
|
769
774
|
docker_platform=AMD_PLATFORM,
|
|
770
|
-
supports_sub_slicing=False,
|
|
771
775
|
supported_topologies=['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'],
|
|
772
776
|
),
|
|
773
777
|
**get_tpu_system_characteristics_map(
|
|
@@ -776,7 +780,6 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
776
780
|
gke_accelerator='tpu-v4-podslice',
|
|
777
781
|
machine_type='ct4p-hightpu-4t',
|
|
778
782
|
docker_platform=AMD_PLATFORM,
|
|
779
|
-
supports_sub_slicing=False,
|
|
780
783
|
supported_topologies=generate_tpu_topologies(
|
|
781
784
|
max_cubes=64, enforce_nondecreasing=False
|
|
782
785
|
),
|
|
@@ -807,6 +810,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
807
810
|
accelerator_type=AcceleratorType.CPU,
|
|
808
811
|
device_type='m1-megamem-96-1',
|
|
809
812
|
supports_sub_slicing=False,
|
|
813
|
+
supports_super_slicing=False,
|
|
810
814
|
docker_platform=AMD_PLATFORM,
|
|
811
815
|
),
|
|
812
816
|
# n2-standard-#vCPUs-#VMs
|
|
@@ -819,6 +823,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
819
823
|
accelerator_type=AcceleratorType.CPU,
|
|
820
824
|
device_type='n2-standard-64-1',
|
|
821
825
|
supports_sub_slicing=False,
|
|
826
|
+
supports_super_slicing=False,
|
|
822
827
|
docker_platform=AMD_PLATFORM,
|
|
823
828
|
),
|
|
824
829
|
'n2-standard-32-1': SystemCharacteristics(
|
|
@@ -830,6 +835,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
830
835
|
accelerator_type=AcceleratorType.CPU,
|
|
831
836
|
device_type='n2-standard-32-1',
|
|
832
837
|
supports_sub_slicing=False,
|
|
838
|
+
supports_super_slicing=False,
|
|
833
839
|
docker_platform=AMD_PLATFORM,
|
|
834
840
|
),
|
|
835
841
|
'n2-standard-32-2': SystemCharacteristics(
|
|
@@ -841,6 +847,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
841
847
|
accelerator_type=AcceleratorType.CPU,
|
|
842
848
|
device_type='n2-standard-32-2',
|
|
843
849
|
supports_sub_slicing=False,
|
|
850
|
+
supports_super_slicing=False,
|
|
844
851
|
docker_platform=AMD_PLATFORM,
|
|
845
852
|
),
|
|
846
853
|
'n2-standard-32-4': SystemCharacteristics(
|
|
@@ -852,6 +859,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
852
859
|
accelerator_type=AcceleratorType.CPU,
|
|
853
860
|
device_type='n2-standard-32-4',
|
|
854
861
|
supports_sub_slicing=False,
|
|
862
|
+
supports_super_slicing=False,
|
|
855
863
|
docker_platform=AMD_PLATFORM,
|
|
856
864
|
),
|
|
857
865
|
'n2-standard-32-8': SystemCharacteristics(
|
|
@@ -863,6 +871,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
863
871
|
accelerator_type=AcceleratorType.CPU,
|
|
864
872
|
device_type='n2-standard-32-8',
|
|
865
873
|
supports_sub_slicing=False,
|
|
874
|
+
supports_super_slicing=False,
|
|
866
875
|
docker_platform=AMD_PLATFORM,
|
|
867
876
|
),
|
|
868
877
|
'n2-standard-32-16': SystemCharacteristics(
|
|
@@ -874,6 +883,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
874
883
|
accelerator_type=AcceleratorType.CPU,
|
|
875
884
|
device_type='n2-standard-32-16',
|
|
876
885
|
supports_sub_slicing=False,
|
|
886
|
+
supports_super_slicing=False,
|
|
877
887
|
docker_platform=AMD_PLATFORM,
|
|
878
888
|
),
|
|
879
889
|
'n2-standard-32-32': SystemCharacteristics(
|
|
@@ -885,6 +895,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
885
895
|
accelerator_type=AcceleratorType.CPU,
|
|
886
896
|
device_type='n2-standard-32-32',
|
|
887
897
|
supports_sub_slicing=False,
|
|
898
|
+
supports_super_slicing=False,
|
|
888
899
|
docker_platform=AMD_PLATFORM,
|
|
889
900
|
),
|
|
890
901
|
'n2-standard-32-64': SystemCharacteristics(
|
|
@@ -896,6 +907,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
896
907
|
accelerator_type=AcceleratorType.CPU,
|
|
897
908
|
device_type='n2-standard-32-64',
|
|
898
909
|
supports_sub_slicing=False,
|
|
910
|
+
supports_super_slicing=False,
|
|
899
911
|
docker_platform=AMD_PLATFORM,
|
|
900
912
|
),
|
|
901
913
|
'n2-standard-32-128': SystemCharacteristics(
|
|
@@ -907,6 +919,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
907
919
|
accelerator_type=AcceleratorType.CPU,
|
|
908
920
|
device_type='n2-standard-32-128',
|
|
909
921
|
supports_sub_slicing=False,
|
|
922
|
+
supports_super_slicing=False,
|
|
910
923
|
docker_platform=AMD_PLATFORM,
|
|
911
924
|
),
|
|
912
925
|
'n2-standard-32-256': SystemCharacteristics(
|
|
@@ -918,6 +931,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
918
931
|
accelerator_type=AcceleratorType.CPU,
|
|
919
932
|
device_type='n2-standard-32-256',
|
|
920
933
|
supports_sub_slicing=False,
|
|
934
|
+
supports_super_slicing=False,
|
|
921
935
|
docker_platform=AMD_PLATFORM,
|
|
922
936
|
),
|
|
923
937
|
'n2-standard-32-512': SystemCharacteristics(
|
|
@@ -929,6 +943,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
929
943
|
accelerator_type=AcceleratorType.CPU,
|
|
930
944
|
device_type='n2-standard-32-512',
|
|
931
945
|
supports_sub_slicing=False,
|
|
946
|
+
supports_super_slicing=False,
|
|
932
947
|
docker_platform=AMD_PLATFORM,
|
|
933
948
|
),
|
|
934
949
|
'n2-standard-32-1024': SystemCharacteristics(
|
|
@@ -940,6 +955,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
940
955
|
accelerator_type=AcceleratorType.CPU,
|
|
941
956
|
device_type='n2-standard-32-1024',
|
|
942
957
|
supports_sub_slicing=False,
|
|
958
|
+
supports_super_slicing=False,
|
|
943
959
|
docker_platform=AMD_PLATFORM,
|
|
944
960
|
),
|
|
945
961
|
'n2-standard-32-2048': SystemCharacteristics(
|
|
@@ -951,6 +967,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
951
967
|
accelerator_type=AcceleratorType.CPU,
|
|
952
968
|
device_type='n2-standard-32-2048',
|
|
953
969
|
supports_sub_slicing=False,
|
|
970
|
+
supports_super_slicing=False,
|
|
954
971
|
docker_platform=AMD_PLATFORM,
|
|
955
972
|
),
|
|
956
973
|
}
|
|
@@ -983,7 +1000,7 @@ def create_accelerator_label(system: SystemCharacteristics) -> str:
|
|
|
983
1000
|
def create_machine_label(system: SystemCharacteristics) -> str:
|
|
984
1001
|
if system.accelerator_type == AcceleratorType.TPU:
|
|
985
1002
|
return (
|
|
986
|
-
f'{AcceleratorTypeToAcceleratorCharacteristics[
|
|
1003
|
+
f'{AcceleratorTypeToAcceleratorCharacteristics[AcceleratorType.TPU].machine_label}:'
|
|
987
1004
|
f' {system.topology}'
|
|
988
1005
|
)
|
|
989
1006
|
return ''
|