xpk 0.17.3__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +33 -43
- xpk/commands/cluster_gcluster.py +19 -14
- xpk/commands/cluster_gcluster_test.py +2 -0
- xpk/commands/cluster_test.py +1 -21
- xpk/commands/common.py +39 -6
- xpk/commands/common_test.py +170 -0
- xpk/commands/info.py +9 -5
- xpk/commands/inspector.py +33 -4
- xpk/commands/inspector_test.py +142 -0
- xpk/commands/workload.py +32 -11
- xpk/commands/workload_test.py +71 -3
- xpk/core/blueprint/blueprint_generator.py +19 -8
- xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
- xpk/core/blueprint/testing/data/a4.yaml +3 -1
- xpk/core/capacity.py +37 -17
- xpk/core/capacity_test.py +66 -1
- xpk/core/cluster.py +11 -10
- xpk/core/cluster_private.py +3 -3
- xpk/core/cluster_test.py +29 -2
- xpk/core/config.py +5 -2
- xpk/core/docker_container.py +31 -24
- xpk/core/docker_manager.py +4 -4
- xpk/core/docker_resources.py +4 -1
- xpk/core/kueue_manager.py +6 -8
- xpk/core/kueue_manager_test.py +6 -5
- xpk/core/nap.py +14 -3
- xpk/core/nodepool.py +52 -13
- xpk/core/nodepool_test.py +147 -8
- xpk/core/remote_state/fuse_remote_state.py +1 -1
- xpk/core/scheduling.py +32 -4
- xpk/core/scheduling_test.py +39 -2
- xpk/core/system_characteristics.py +44 -0
- xpk/core/system_characteristics_test.py +11 -0
- xpk/core/telemetry.py +11 -1
- xpk/core/telemetry_test.py +39 -0
- xpk/core/testing/commands_tester.py +26 -0
- xpk/core/testing/commands_tester_test.py +20 -1
- xpk/core/workload_decorators/rdma_decorator.py +9 -0
- xpk/parser/cluster.py +11 -1
- xpk/parser/cluster_test.py +59 -1
- xpk/parser/common.py +11 -17
- xpk/parser/core.py +0 -8
- xpk/parser/storage.py +3 -14
- xpk/utils/console.py +1 -1
- xpk/utils/feature_flags.py +8 -4
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/METADATA +50 -23
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/RECORD +51 -60
- xpk-1.1.0.dist-info/top_level.txt +1 -0
- integration/README.md +0 -19
- integration/__init__.py +0 -15
- integration/docker_manager_test.py +0 -102
- integration/gcluster_a3mega_test.py +0 -215
- integration/gcluster_a3ultra_test.py +0 -187
- integration/gcluster_a4_test.py +0 -187
- integration/gcluster_test.py +0 -107
- xpk/commands/kind.py +0 -265
- xpk/parser/kind.py +0 -95
- xpk/utils/user_input.py +0 -48
- xpk/utils/user_input_test.py +0 -92
- xpk-0.17.3.dist-info/top_level.txt +0 -2
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/WHEEL +0 -0
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/licenses/LICENSE +0 -0
xpk/core/nodepool_test.py
CHANGED

@@ -20,6 +20,7 @@ from xpk.core.nodepool import (
     ensure_resource_policy_exists,
     get_desired_node_pool_names,
     run_gke_node_pool_create_command,
+    _validate_reservation_count,
 )
 from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
 from xpk.core.commands import FailedCommand
@@ -103,6 +104,7 @@ def commands_tester(mocker):
   return CommandsTester(
       mocker,
       run_command_for_value_path="xpk.core.nodepool.run_command_for_value",
+      run_command_batch_path="xpk.core.commands.run_command_batch",
   )
 
 
@@ -119,7 +121,7 @@ def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_p
 
   assert len(commands_tester.commands_history) == 1
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies describe resource-policy",
+      "gcloud beta compute resource-policies describe resource-policy",
       "--project=test-project",
       "--region=us-central1",
   )
@@ -129,7 +131,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
     commands_tester: CommandsTester,
 ):
   commands_tester.set_result_for_command(
-      (1, ""), "gcloud compute resource-policies describe"
+      (1, ""), "gcloud beta compute resource-policies describe"
   )
 
   ensure_resource_policy_exists(
@@ -142,16 +144,17 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
 
   assert len(commands_tester.commands_history) == 2
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies describe"
+      "gcloud beta compute resource-policies describe"
   )
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies create workload-policy
+      "gcloud beta compute resource-policies create workload-policy"
+      " resource-policy",
       "--project=test-project",
       "--region=us-central1",
       "--accelerator-topology=2x2x1",
   )
   commands_tester.assert_command_not_run(
-      "gcloud compute resource-policies create workload-policy",
+      "gcloud beta compute resource-policies create workload-policy",
       "--accelerator-topology-mode",
   )
 
@@ -160,7 +163,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
     commands_tester: CommandsTester,
 ):
   commands_tester.set_result_for_command(
-      (1, ""), "gcloud compute resource-policies describe"
+      (1, ""), "gcloud beta compute resource-policies describe"
   )
 
   ensure_resource_policy_exists(
@@ -172,7 +175,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
   )
 
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies create workload-policy",
+      "gcloud beta compute resource-policies create workload-policy",
       "--accelerator-topology-mode",
   )
 
@@ -182,7 +185,7 @@ def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creati
 ):
   with pytest.raises(RuntimeError):
     commands_tester.set_result_for_command(
-        (1, ""), "gcloud compute resource-policies"
+        (1, ""), "gcloud beta compute resource-policies"
    )
 
    ensure_resource_policy_exists(
@@ -251,6 +254,7 @@ def test_placement_policy_created_for_gpu_with_valid_topology(
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -284,6 +288,7 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -320,6 +325,7 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
       requires_workload_policy=True,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
 
@@ -354,6 +360,7 @@ def test_placement_policy_not_created_for_non7x_tpu(
       device_type="v6e-4",
       supports_sub_slicing=True,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
   )
 
@@ -429,3 +436,135 @@ def test_display_nodepool_creation_ignores_logs_without_errors(
       mock_xpk_print.call_args_list[0].args[0]
       == "Create Nodepools returned ERROR 1"
   )
+
+
+def test_validate_reservation_count_mismatch(mock_xpk_print):
+  result = _validate_reservation_count(
+      ["res1", "res2"], num_node_pools_to_create=3
+  )
+
+  assert result == 1
+  assert mock_xpk_print.call_count == 1
+  assert (
+      "reservations (2) must match the number of NEW nodepools (3)"
+      in mock_xpk_print.call_args_list[0].args[0]
+  )
+
+
+def test_run_gke_node_pool_create_command_multiple_reservations(
+    mocker,
+    commands_tester: CommandsTester,
+):
+  mocker.patch(
+      "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+  )
+  mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
+  args = mocker.Mock(
+      num_slices=2,
+      reservation="res1,res2",
+      tpu_type="v4-8",
+      device_type=None,
+      cluster="test-cluster",
+      project="test-project",
+      zone="us-central1-a",
+      on_demand=False,
+      spot=False,
+      flex=False,
+      enable_workload_identity=False,
+      enable_gcsfuse_csi_driver=False,
+      host_maintenance_interval="AS_NEEDED",
+      custom_nodepool_arguments="",
+  )
+  system = SystemCharacteristics(
+      topology="2x2x1",
+      vms_per_slice=2,
+      gke_accelerator="tpu-v4",
+      gce_machine_type="ct4p-hightpu-4t",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="v4-8",
+      requires_workload_policy=False,
+      supports_sub_slicing=False,
+      supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
+      docker_platform=DockerPlatform.AMD,
+  )
+  commands_tester.set_result_for_command(
+      (0, ""), "gcloud beta container node-pools list"
+  )
+
+  result = run_gke_node_pool_create_command(args, system, "1.2.3")
+
+  assert result == 0
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-0", "--reservation=res1"
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res2"
+  )
+
+
+def test_run_gke_node_pool_create_command_partial_reservations(
+    mocker,
+    commands_tester: CommandsTester,
+):
+  mocker.patch(
+      "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+  )
+  mocker.patch("xpk.core.nodepool.get_node_pools_to_delete", return_value=[])
+  mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
+  args = mocker.Mock(
+      num_slices=3,
+      reservation="res1,res2",
+      tpu_type="v4-8",
+      device_type=None,
+      cluster="test-cluster",
+      project="test-project",
+      zone="us-central1-a",
+      on_demand=False,
+      spot=False,
+      flex=False,
+      enable_workload_identity=False,
+      enable_gcsfuse_csi_driver=False,
+      host_maintenance_interval="AS_NEEDED",
+      custom_nodepool_arguments="",
+  )
+  system = SystemCharacteristics(
+      topology="2x2x1",
+      vms_per_slice=2,
+      gke_accelerator="tpu-v4",
+      gce_machine_type="ct4p-hightpu-4t",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="v4-8",
+      requires_workload_policy=False,
+      supports_sub_slicing=False,
+      supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
+      docker_platform=DockerPlatform.AMD,
+  )
+  commands_tester.set_result_for_command(
+      (0, "test-cluster-np-0"), "gcloud beta container node-pools list"
+  )
+  commands_tester.set_result_for_command(
+      (0, "us-central1-a"),
+      "gcloud",
+      "node-pools describe",
+      '--format="value(locations)"',
+  )
+
+  result = run_gke_node_pool_create_command(args, system, "1.2.3")
+
+  assert result == 0
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res1"
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-2", "--reservation=res2"
+  )
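The new tests above pin down a reservation-distribution contract without showing the implementation itself: a comma-separated --reservation list is matched one-to-one, in order, against the node pools that still need to be created, so in the partial case the pre-existing test-cluster-np-0 is skipped and res1/res2 land on np-1/np-2. A minimal sketch of what _validate_reservation_count must do to satisfy these assertions; the exact message wording and argument names here are inferred from the test, not copied from the shipped code:

# Minimal sketch of _validate_reservation_count, inferred from the tests above.
from xpk.utils.console import xpk_print


def _validate_reservation_count(
    reservations: list[str], num_node_pools_to_create: int
) -> int:
  """Returns 0 when each NEW node pool gets exactly one reservation, else 1."""
  if len(reservations) != num_node_pools_to_create:
    # The test only checks that the message contains this substring.
    xpk_print(
        f'The number of reservations ({len(reservations)}) must match the'
        f' number of NEW nodepools ({num_node_pools_to_create}).'
    )
    return 1
  return 0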
xpk/core/remote_state/fuse_remote_state.py
CHANGED

@@ -56,7 +56,7 @@ class FuseStateClient(RemoteStateClient):
 
   def upload_state(self) -> None:
     xpk_print(
-        f'Uploading
+        f'Uploading dependencies from directory {self.state_dir} to bucket:'
         f' {self.bucket}. Path within bucket is: {self._get_bucket_path()}'
     )
     upload_directory_to_gcs(
xpk/core/scheduling.py
CHANGED

@@ -18,7 +18,7 @@ from enum import Enum
 
 from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled, has_super_slicing_enabled
 from ..utils.feature_flags import FeatureFlags
-from ..utils.topology import get_slice_topology_level
+from ..utils.topology import get_slice_topology_level, parse_topology
 from ..utils.console import xpk_print
 from ..utils.topology import is_topology_valid
 from ..utils.execution_context import is_dry_run
@@ -33,7 +33,11 @@ from .system_characteristics import (
 from packaging.version import Version
 
 _SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
-_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.
+_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.15.2')
+_SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)
+ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
+    'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
+)
 
 
 class WorkloadScheduling(Enum):
@@ -115,7 +119,7 @@ def check_if_workload_can_schedule(
       args,
       workload_system,
       max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
-  ):
+  ) and _check_super_slicing_topology(workload_system):
     return WorkloadScheduling.SUPER_SLICING_AVAILABLE
   else:
     return WorkloadScheduling.UNAVAILABLE
@@ -189,7 +193,6 @@ def _check_super_slicing_availability(
     workload_system: SystemCharacteristics,
     cluster_system: SystemCharacteristics,
 ) -> bool:
-  # TODO: b/465447813 - Add super-slicing workload topology validation.
   if (
       (not FeatureFlags.SUPER_SLICING_ENABLED)
       or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
@@ -212,6 +215,27 @@
   )
 
 
+def _check_super_slicing_topology(
+    workload_system: SystemCharacteristics,
+) -> bool:
+  topology = parse_topology(workload_system.topology)
+  result = (
+      all(size % 4 == 0 and size >= 4 for size in topology)
+      and len(topology) == len(_SUPER_SLICING_MAX_TOPOLOGY)
+      and topology[0] <= topology[1] <= topology[2]
+      and all(a <= b for a, b in zip(topology, _SUPER_SLICING_MAX_TOPOLOGY))
+  )
+
+  if not result:
+    xpk_print(
+        'Error: Invalid super-slicing topology. It must adhere to the format of'
+        ' 4i x 4j x 4k, where i <= j <= k, and i, j, k are integers, with a'
+        ' maximum of 16x24x24.'
+    )
+
+  return result
+
+
 def get_total_chips_requested_from_args(
     args, system: SystemCharacteristics
 ) -> int:
@@ -342,6 +366,10 @@ def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
   ]
 
 
+def create_tpu_slice_topology_annotation(workload_topology: str) -> str:
+  return f'cloud.google.com/gke-tpu-slice-topology: {workload_topology}'
+
+
 def create_placement_policy_label(
     system: SystemCharacteristics, super_slicing: bool
 ) -> str:
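The shape rule that _check_super_slicing_topology enforces is easy to exercise on its own. Below is a self-contained restatement of the predicate from the hunk above; parse_topology is assumed to split a '4x4x8'-style string on 'x' (the real helper lives in xpk.utils.topology and is not part of this diff):

# Standalone restatement of the super-slicing topology predicate above.
_SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)


def parse_topology(topology: str) -> tuple[int, ...]:
  # Assumption: mirrors xpk.utils.topology.parse_topology for 'AxBxC' strings.
  return tuple(int(dim) for dim in topology.split('x'))


def is_valid_super_slicing_topology(topology_str: str) -> bool:
  topology = parse_topology(topology_str)
  return (
      # Every axis is a positive multiple of 4 (the 4i x 4j x 4k rule).
      all(size % 4 == 0 and size >= 4 for size in topology)
      # Exactly three axes.
      and len(topology) == len(_SUPER_SLICING_MAX_TOPOLOGY)
      # Non-decreasing: i <= j <= k.
      and topology[0] <= topology[1] <= topology[2]
      # Within the 16x24x24 maximum, axis by axis.
      and all(a <= b for a, b in zip(topology, _SUPER_SLICING_MAX_TOPOLOGY))
  )


assert is_valid_super_slicing_topology('4x4x8')
assert not is_valid_super_slicing_topology('2x2x1')   # not divisible by four
assert not is_valid_super_slicing_topology('4x4x32')  # exceeds the 24 limit

The two failing shapes are exactly the new UNAVAILABLE cases added to scheduling_test.py below ('tpu7x-2x2x1' and 'tpu7x-4x4x32').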
xpk/core/scheduling_test.py
CHANGED

@@ -22,7 +22,7 @@ from pytest_mock import MockerFixture
 from xpk.core.capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
 from xpk.core.testing.commands_tester import CommandsTester
 from xpk.utils.feature_flags import FeatureFlags
-from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, get_placement_policy_name, is_placement_policy_supported
+from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, create_tpu_slice_topology_annotation, get_placement_policy_name, is_placement_policy_supported
 from .system_characteristics import SystemCharacteristics, AcceleratorType, DockerPlatform, get_system_characteristics_by_device_type
 
 
@@ -66,6 +66,7 @@ def test_create_placement_policy_label_returns_valid_label():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   label = create_placement_policy_label(
@@ -89,6 +90,7 @@ def test_get_placement_policy_name_returns_valid_name():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   name = get_placement_policy_name(system_characteristics, super_slicing=False)
@@ -107,6 +109,7 @@ def test_get_placement_policy_name_super_slicing_returns_valid_name():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   name = get_placement_policy_name(system_characteristics, super_slicing=True)
@@ -125,6 +128,7 @@ def test_is_placement_policy_supported_returns_true_for_system_characteristics_s
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is True
@@ -142,6 +146,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is False
@@ -159,6 +164,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is False
@@ -202,7 +208,7 @@ SUPER_SLICING_CASE = SchedulingTestCase(
     cluster_system=_get_system_characteristics_or_die('tpu7x-4x4x4'),
     # 5 4x4x4 cubes:
     resources_config_map={'tpu7x-128': str(64 // 4 * 5)},
-    kueue_version='0.
+    kueue_version='0.15.2',
    super_slicing_feature_enabled=True,
    super_slicing_topology_set=True,
    num_slices=1,
@@ -369,6 +375,28 @@ SUPER_SLICING_CASE = SchedulingTestCase(
         ),
         WorkloadScheduling.UNAVAILABLE,
     ),
+    (
+        'Super-slicing, but workload topology is not divisible by four',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die(
+                'tpu7x-2x2x1'
+            ),
+        ),
+        WorkloadScheduling.UNAVAILABLE,
+    ),
+    (
+        'Super-slicing, but workload topology is too big for super-slice',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die(
+                'tpu7x-4x4x32'
+            ),
+            # 10 cubes, to make sure vms fit:
+            resources_config_map={'tpu7x-128': str(64 // 4 * 10)},
+        ),
+        WorkloadScheduling.UNAVAILABLE,
+    ),
     (
         (
             'Super-slicing should be ignored when a given device is already'
@@ -426,3 +454,12 @@ def test_check_if_workload_can_schedule(
       )
       == expected
   )
+
+
+def test_create_tpu_slice_topology_annotation():
+  workload_system = _get_system_characteristics_or_die('tpu7x-4x4x8')
+
+  assert (
+      create_tpu_slice_topology_annotation(workload_system.topology)
+      == 'cloud.google.com/gke-tpu-slice-topology: 4x4x8'
+  )
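For reference, both annotation strings introduced in scheduling.py are already formatted as YAML "key: value" lines, so whatever manifest plumbing consumes them (it is not part of this diff) can splice them under a pod template's metadata.annotations verbatim. A small illustration of how the two compose; only the string handling is shown, the surrounding workload wiring is assumed:

# Illustration only: the two annotation strings from scheduling.py, composed.
ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
    'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
)


def create_tpu_slice_topology_annotation(workload_topology: str) -> str:
  return f'cloud.google.com/gke-tpu-slice-topology: {workload_topology}'


# Rendered as indented YAML lines under metadata.annotations:
for annotation in (
    ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION,
    create_tpu_slice_topology_annotation('4x4x8'),
):
  print(f'  {annotation}')
# prints:
#   alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
#   cloud.google.com/gke-tpu-slice-topology: 4x4x8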