xpk 0.16.1__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- xpk/commands/cluster.py +48 -5
- xpk/commands/cluster_gcluster.py +3 -0
- xpk/commands/cluster_gcluster_test.py +2 -0
- xpk/commands/cluster_test.py +203 -0
- xpk/commands/common.py +6 -0
- xpk/commands/kind.py +2 -0
- xpk/commands/workload.py +35 -15
- xpk/commands/workload_test.py +1 -0
- xpk/core/capacity.py +83 -46
- xpk/core/capacity_test.py +82 -28
- xpk/core/commands.py +39 -12
- xpk/core/kueue_manager.py +42 -11
- xpk/core/kueue_manager_test.py +83 -3
- xpk/core/nap.py +5 -4
- xpk/core/nodepool.py +57 -20
- xpk/core/nodepool_test.py +152 -23
- xpk/core/pathways.py +2 -1
- xpk/core/resources.py +3 -3
- xpk/core/scheduling.py +54 -10
- xpk/core/scheduling_test.py +118 -13
- xpk/core/system_characteristics.py +41 -24
- xpk/core/system_characteristics_test.py +37 -4
- xpk/core/telemetry.py +5 -0
- xpk/core/telemetry_test.py +19 -2
- xpk/core/updates.py +1 -1
- xpk/main.py +2 -1
- xpk/parser/cluster.py +34 -2
- xpk/parser/cluster_test.py +117 -0
- xpk/parser/common.py +32 -0
- xpk/parser/common_test.py +49 -0
- xpk/templates/kueue_config.yaml.j2 +21 -5
- xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
- xpk/utils/kueue.py +6 -2
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/METADATA +2 -1
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/RECORD +39 -37
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/WHEEL +0 -0
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/top_level.txt +0 -0
xpk/core/kueue_manager_test.py
CHANGED
@@ -21,7 +21,7 @@ from pytest_mock import MockerFixture
 import yaml
 from unittest.mock import MagicMock, patch

-from xpk.core.kueue_manager import KueueConfig, KueueManager, has_sub_slicing_enabled
+from xpk.core.kueue_manager import KueueConfig, KueueManager, has_sub_slicing_enabled, has_super_slicing_enabled
 from xpk.core.system_characteristics import GpuConfig, DockerPlatform, AcceleratorType, SystemCharacteristics, UserFacingNameToSystemCharacteristics
 from xpk.core.testing.commands_tester import CommandsTester
 from packaging.version import Version
@@ -35,6 +35,7 @@ TPU_SYSTEM: SystemCharacteristics = SystemCharacteristics(
     accelerator_type=AcceleratorType.TPU,
     device_type="v5p-8",
     supports_sub_slicing=False,
+    supports_super_slicing=False,
     docker_platform=DockerPlatform.ARM,
 )

@@ -44,6 +45,7 @@ KUEUE_CONFIG: KueueConfig = KueueConfig(
     cpu_limit=100,
     memory_limit="100Gi",
     configure_sub_slicing=False,
+    configure_super_slicing=False,
 )


@@ -370,6 +372,7 @@ def test_configure_generates_manifest_with_admission_checks_for_flex_single_slic
       KUEUE_CONFIG,
       num_slices=1,
       flex=True,
+      system=UserFacingNameToSystemCharacteristics["l4-1"],
   )

   kueue_manager.install_or_upgrade(kueue_config)
@@ -382,7 +385,7 @@ def test_configure_generates_manifest_with_admission_checks_for_flex_single_slic
   )
   assert (
       cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"]
-      == "
+      == "1xl4-1"
   )
   assert cluster_queue["spec"]["admissionChecks"][0] == "dws-prov"

@@ -406,6 +409,7 @@ def test_configure_generates_correct_manifest_with_gke_default_topology(
           accelerator_type=AcceleratorType.GPU,
           device_type="h100-mega-80gb-8",
           supports_sub_slicing=False,
+          supports_super_slicing=False,
           docker_platform=DockerPlatform.ARM,
           gpu_config=GpuConfig(requires_topology=True),
       ),
@@ -462,6 +466,47 @@ def test_configure_generates_correct_manifest_with_sub_slicing(
   assert actual_levels == expected_levels


+@patch("xpk.core.kueue_manager.write_tmp_file")
+def test_configure_generates_correct_manifest_with_super_slicing(
+    write_tmp_file_mock: MagicMock,
+    mock_commands: CommandsTester,
+    kueue_manager: KueueManager,
+):
+  """Test that __configure generates correct manifest with super-slicing topology."""
+  set_installed_kueue_version(mock_commands, None)
+  kueue_config = dataclasses.replace(
+      KUEUE_CONFIG,
+      configure_super_slicing=True,
+      system=UserFacingNameToSystemCharacteristics["tpu7x-4x4x4"],
+  )
+
+  kueue_manager.install_or_upgrade(kueue_config)
+
+  rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
+  manifest_docs = list(yaml.safe_load_all(rendered_manifest))
+  resource_flavor = _first(
+      doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
+  )
+  assert resource_flavor["spec"]["topologyName"] == "super-slice-topology"
+  assert resource_flavor["spec"]["nodeLabels"] == {
+      "cloud.google.com/gke-tpu-accelerator": "tpu7x",
+      "cloud.google.com/gke-tpu-partition-4x4x4-state": "HEALTHY",
+  }
+  topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
+  assert topology["metadata"]["name"] == "super-slice-topology"
+  expected_levels = [
+      "cloud.google.com/gce-topology-block",
+      "cloud.google.com/gke-tpu-partition-4x4x4-id",
+      "kubernetes.io/hostname",
+  ]
+  actual_levels = [level["nodeLabel"] for level in topology["spec"]["levels"]]
+  assert actual_levels == expected_levels
+  cluster_queue = _first(
+      doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
+  )
+  assert cluster_queue["spec"]["admissionChecks"][0] == "ss-kueue-operator"
+
+
 @patch("xpk.core.kueue_manager.write_tmp_file")
 def test_configure_generates_correct_manifest_with_pathways(
     write_tmp_file_mock: MagicMock,
@@ -549,7 +594,7 @@ def test_has_sub_slicing_enabled_returns_false_when_sub_slicing_topology_is_not_
   assert result is False


-def
+def test_has_sub_slicing_enabled_returns_true_when_sub_slicing_topology_is_present(
     mock_commands: CommandsTester,
 ):
   mock_commands.set_result_for_command(
@@ -562,6 +607,41 @@ test_has_sub_slicing_enabled_returns_true_when_sub_slicing_topology_is_not_p
   assert result is True


+def test_has_super_slicing_enabled_returns_exit_code_when_command_fails(
+    mock_commands: CommandsTester,
+):
+  mock_commands.set_result_for_command((1, ""), "kubectl get topology")
+
+  return_code, result = has_super_slicing_enabled()
+
+  assert return_code == 1
+  assert result is None
+
+
+def test_has_super_slicing_enabled_returns_false_when_super_slicing_topology_is_not_present(
+    mock_commands: CommandsTester,
+):
+  mock_commands.set_result_for_command((0, ""), "kubectl get topology")
+
+  return_code, result = has_super_slicing_enabled()
+
+  assert return_code == 0
+  assert result is False
+
+
+def test_has_super_slicing_enabled_returns_true_when_super_slicing_topology_is_present(
+    mock_commands: CommandsTester,
+):
+  mock_commands.set_result_for_command(
+      (0, "super-slice-topology"), "kubectl get topology"
+  )
+
+  return_code, result = has_super_slicing_enabled()
+
+  assert return_code == 0
+  assert result is True
+
+
 T = TypeVar("T")

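From the three new has_super_slicing_enabled tests above, the helper appears to follow the same contract as has_sub_slicing_enabled: it shells out to `kubectl get topology` and returns a (return_code, result) pair, where result is None when the command fails and otherwise reports whether the super-slicing topology exists. A minimal caller sketch under that inferred contract (the wrapper function below is illustrative, not part of the package):

# Sketch only: the (return_code, result) contract is inferred from the tests
# above, not taken from the library source.
from xpk.core.kueue_manager import has_super_slicing_enabled


def super_slicing_already_configured() -> bool:
  """Returns True only when the super-slicing topology is confirmed present."""
  return_code, enabled = has_super_slicing_enabled()
  if return_code != 0:
    # kubectl failed, so the cluster state is unknown; treat it as not configured.
    return False
  return bool(enabled)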
xpk/core/nap.py
CHANGED
@@ -147,17 +147,18 @@ def enable_autoprovisioning_on_cluster(

   for i, command in enumerate(commands):
     xpk_print(f'To complete {task_names[i]} we are executing {command}')
-
+  maybe_failure = run_commands(
       commands,
       'Update node pools with autoprovisioning support',
       task_names,
   )
-  if
+  if maybe_failure is not None:
     xpk_print(
         'Update node pools with autoprovisioning support returned ERROR:'
-        f' {
+        f' {maybe_failure.return_code}'
     )
-    return None,
+    return None, maybe_failure.return_code
+
   return autoprovisioning_config, return_code

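This hunk is the first of several call sites in this release that stop checking an integer return code from run_commands and instead check the object it returns. A condensed sketch of the new pattern as it appears in these hunks, assuming run_commands returns None on success and a FailedCommand (carrying at least a return_code) on failure; the wrapper function itself is illustrative:

from xpk.core.commands import FailedCommand, run_commands
from xpk.utils.console import xpk_print


def run_batch_or_report(
    commands: list[str], description: str, task_names: list[str]
) -> int:
  """Mirrors the new call-site pattern: report the first failing command."""
  maybe_failure: FailedCommand | None = run_commands(
      commands, description, task_names
  )
  if maybe_failure is not None:
    # maybe_failure is assumed to carry the exit code (and, per nodepool.py
    # below, the command name and a logfile path).
    xpk_print(f'{description} returned ERROR: {maybe_failure.return_code}')
    return maybe_failure.return_code
  return 0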
xpk/core/nodepool.py
CHANGED
@@ -15,6 +15,8 @@ limitations under the License.
 """

 from typing import List
+
+from ..utils.feature_flags import FeatureFlags
 from ..utils.console import ask_for_user_consent, xpk_print
 from .scheduling import get_placement_policy_name, is_placement_policy_supported
 from .capacity import (
@@ -25,14 +27,14 @@ from .capacity import (
     get_capacity_type,
     print_reservations,
 )
-from .commands import run_command_for_value, run_commands
+from .commands import run_command_for_value, run_commands, FailedCommand
 from .gcloud_context import GkeServerConfig, get_cluster_location, zone_to_region
 from .resources import (
     ConfigMapType,
     check_cluster_resources,
     update_cluster_configmap,
 )
-from .system_characteristics import AcceleratorType
+from .system_characteristics import AcceleratorType, SystemCharacteristics


 CLOUD_PLATFORM_AUTH_SCOPE_URL = (
@@ -43,7 +45,7 @@ OLDER_PATHWAYS_CPU_NP_TO_DELETE = ['cpu-rm-np', 'cpu-proxy-np', 'cpu-user-np']


 def run_gke_node_pool_create_command(
-    args, system, gke_node_pool_version
+    args, system: SystemCharacteristics, gke_node_pool_version: str
 ) -> int:
   """Run the Create GKE Node Pool request.

@@ -84,7 +86,7 @@ def run_gke_node_pool_create_command(
   else:
     max_nodes = 1000
   capacity_args, return_code = get_capacity_arguments_from_capacity_type(
-      args, capacity_type, max_nodes
+      args, capacity_type, max_nodes, system.accelerator_type
   )
   if return_code > 0:
     xpk_print('Parsing capacity arguments failed!')
@@ -200,13 +202,13 @@ def run_gke_node_pool_create_command(
     xpk_print(
         f'To complete {delete_task_names[i]} we are executing {command}'
     )
-
+  maybe_failure = run_commands(
       delete_commands,
       'Delete Nodepools',
       delete_task_names,
   )
-  if
-    xpk_print(f'Delete Nodepools returned ERROR {
+  if maybe_failure is not None:
+    xpk_print(f'Delete Nodepools returned ERROR {maybe_failure.return_code}')
     return 1

   # Enable Workload Identity on existing Nodepools
@@ -224,15 +226,15 @@ def run_gke_node_pool_create_command(
     xpk_print(
         f'To complete {update_WI_task_names[i]} we are executing {command}'
     )
-
+  maybe_failure = run_commands(
       update_WI_commands,
       'Enable Workload Identity on existing Nodepools',
       update_WI_task_names,
   )
-  if
+  if maybe_failure is not None:
     xpk_print(
         'Enable Workload Identity on existing Nodepools returned ERROR'
-        f' {
+        f' {maybe_failure.return_code}'
     )
     return 1

@@ -256,12 +258,17 @@ def run_gke_node_pool_create_command(

   placement_args = ''
   if is_placement_policy_supported(system):
-
+    super_slicing = FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing
+    placement_policy = get_placement_policy_name(
+        system,
+        super_slicing,
+    )
     ensure_resource_policy_exists(
         resource_policy_name=placement_policy,
         project=args.project,
         zone=args.zone,
         topology=system.topology,
+        super_slicing=super_slicing,
     )
     placement_args = f' --placement-policy={placement_policy}'

@@ -358,19 +365,41 @@ def run_gke_node_pool_create_command(

   for i, command in enumerate(create_commands):
     xpk_print(f'To complete {create_task_names[i]} we are executing {command}')
-
+  maybe_failure = run_commands(
       create_commands,
       'Create Nodepools',
       create_task_names,
   )
-  if
-
+  if maybe_failure is not None:
+    display_nodepool_creation_error(maybe_failure)
     return 1

   xpk_print('Create or delete node pool request complete.')
   return 0


+def display_nodepool_creation_error(maybe_failure: FailedCommand) -> None:
+  """Display nodepool creation errors to the user."""
+
+  xpk_print(f'Create Nodepools returned ERROR {maybe_failure.return_code}')
+  try:
+    with open(maybe_failure.logfile, 'r', encoding='utf-8') as f:
+      contents = f.read()
+      error_marker = 'finished with error:'
+      error = contents[contents.index(error_marker) + len(error_marker) :].strip()
+      # the longest error we're expecting to see is 256 characters + np name
+      max_error_display_length = 400
+      xpk_print(f'Nodepool creation error: {error[:max_error_display_length]}')
+      if (
+          error.find('lack of capacity') != -1
+          or error.find('Requested resource is exhausted') != -1
+      ):
+        xpk_print('NOTE: this error might be caused by a stockout')
+  except (FileNotFoundError, IOError, ValueError):
+    # silently ignore any log parsing errors
+    pass
+
+
 def get_node_pools_to_delete(
     args, system, existing_node_pool_names, desired_node_pool_names
 ) -> list:
@@ -585,18 +614,22 @@ def get_desired_node_pool_names(
   while len(result) < desired_node_pool_count:
     result.add(f'{cluster_name}-np-{i}')
     i += 1
-  return list(result)
+  return list(sorted(result))


 def ensure_resource_policy_exists(
-    resource_policy_name: str,
+    resource_policy_name: str,
+    project: str,
+    zone: str,
+    topology: str,
+    super_slicing: bool,
 ) -> None:
   return_code, _ = run_command_for_value(
       (
           'gcloud compute resource-policies describe'
-          f' {resource_policy_name}
-          f'--project={project}
-          f'--region={zone_to_region(zone)}'
+          f' {resource_policy_name}'
+          f' --project={project}'
+          f' --region={zone_to_region(zone)}'
       ),
       'Retrieve resource policy',
   )
@@ -604,11 +637,15 @@ def ensure_resource_policy_exists(
   if return_code == 0:
     return

+  # TODO: b/465696970 - Verify the flag below before launching SUPER_SLICING:
+  accelerator_topology_mode = (
+      ' --accelerator-topology-mode=PROVISION_ONLY' if super_slicing else ''
+  )
   return_code, _ = run_command_for_value(
       (
           'gcloud compute resource-policies create workload-policy'
           f' {resource_policy_name} --project={project} --region={zone_to_region(zone)} --type=HIGH_THROUGHPUT'
-          f' --accelerator-topology={topology}'
+          f' --accelerator-topology={topology}{accelerator_topology_mode}'
       ),
       'Create resource policy',
   )
xpk/core/nodepool_test.py
CHANGED
@@ -16,13 +16,23 @@ limitations under the License.

 import pytest
 from xpk.core.nodepool import (
+    display_nodepool_creation_error,
     ensure_resource_policy_exists,
     get_desired_node_pool_names,
     run_gke_node_pool_create_command,
 )
 from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
+from xpk.core.commands import FailedCommand
+from xpk.core.testing.commands_tester import CommandsTester
+

 CLUSTER_NAME = "running-cucumber"
+maybe_failure = FailedCommand(
+    return_code=1,
+    name="create-nodepool",
+    command="test-command",
+    logfile="logfile_path",
+)


 def node_pool_name(number: int) -> str:
@@ -88,61 +98,107 @@ def test_compute_desired_node_pool_names_with_unknown_node_pools():
   assert set(result) == set(expected_result)


+@pytest.fixture
+def commands_tester(mocker):
+  return CommandsTester(
+      mocker,
+      run_command_for_value_path="xpk.core.nodepool.run_command_for_value",
+  )
+
+
 def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_policy(
-
+    commands_tester: CommandsTester,
 ):
-  args = mocker.Mock(project="test-project", zone="us-central1-a")
-  mocker.patch("xpk.core.nodepool.get_cluster_location", return_value=args.zone)
-  mock = mocker.patch(
-      "xpk.core.nodepool.run_command_for_value", return_value=(0, "")
-  )
   ensure_resource_policy_exists(
       resource_policy_name="resource-policy",
       project="test-project",
       zone="us-central1-a",
       topology="2x2x1",
+      super_slicing=False,
+  )
+
+  assert len(commands_tester.commands_history) == 1
+  commands_tester.assert_command_run(
+      "gcloud compute resource-policies describe resource-policy",
+      "--project=test-project",
+      "--region=us-central1",
   )
-  mock.assert_called_once()


 def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
-
+    commands_tester: CommandsTester,
 ):
-
-
-  mock = mocker.patch(
-      "xpk.core.nodepool.run_command_for_value", side_effect=[(1, ""), (0, "")]
+  commands_tester.set_result_for_command(
+      (1, ""), "gcloud compute resource-policies describe"
   )
+
   ensure_resource_policy_exists(
       resource_policy_name="resource-policy",
       project="test-project",
       zone="us-central1-a",
       topology="2x2x1",
+      super_slicing=False,
+  )
+
+  assert len(commands_tester.commands_history) == 2
+  commands_tester.assert_command_run(
+      "gcloud compute resource-policies describe"
+  )
+  commands_tester.assert_command_run(
+      "gcloud compute resource-policies create workload-policy resource-policy",
+      "--project=test-project",
+      "--region=us-central1",
+      "--accelerator-topology=2x2x1",
+  )
+  commands_tester.assert_command_not_run(
+      "gcloud compute resource-policies create workload-policy",
+      "--accelerator-topology-mode",
+  )
+
+
+def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_for_super_slicing(
+    commands_tester: CommandsTester,
+):
+  commands_tester.set_result_for_command(
+      (1, ""), "gcloud compute resource-policies describe"
+  )
+
+  ensure_resource_policy_exists(
+      resource_policy_name="ss-resource-policy",
+      project="test-project",
+      zone="us-central1-a",
+      topology="2x2x1",
+      super_slicing=True,
+  )
+
+  commands_tester.assert_command_run(
+      "gcloud compute resource-policies create workload-policy",
+      "--accelerator-topology-mode",
   )
-  assert mock.call_count == 2
-  assert mock.call_args_list[0].args[1] == "Retrieve resource policy"


 def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creation_fails(
-
+    commands_tester: CommandsTester,
 ):
   with pytest.raises(RuntimeError):
-
-
-        "xpk.core.nodepool.get_cluster_location", return_value=args.zone
-    )
-    mocker.patch(
-        "xpk.core.nodepool.run_command_for_value",
-        side_effect=[(1, ""), (1, "")],
+    commands_tester.set_result_for_command(
+        (1, ""), "gcloud compute resource-policies"
     )
+
     ensure_resource_policy_exists(
         resource_policy_name="resource-policy",
         project="test-project",
         zone="us-central1-a",
         topology="2x2x1",
+        super_slicing=False,
     )


+@pytest.fixture
+def mock_xpk_print(mocker):
+  return mocker.patch("xpk.core.nodepool.xpk_print")
+
+
 @pytest.fixture
 def mock_nodepool_dependencies(mocker):
   """Mocks dependencies for run_gke_node_pool_create_command."""
@@ -159,7 +215,7 @@ def mock_nodepool_dependencies(mocker):
   mocker.patch(
       "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
   )
-  mocker.patch("xpk.core.nodepool.run_commands", return_value=
+  mocker.patch("xpk.core.nodepool.run_commands", return_value=None)
   mocker.patch("xpk.core.nodepool.ask_for_user_consent", return_value=True)
   mock_is_placement_policy_supported = mocker.patch(
       "xpk.core.nodepool.is_placement_policy_supported"
@@ -194,6 +250,7 @@ def test_placement_policy_created_for_gpu_with_valid_topology(
       accelerator_type=AcceleratorType.GPU,
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
+      supports_super_slicing=False,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -226,6 +283,7 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
       accelerator_type=AcceleratorType.GPU,
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
+      supports_super_slicing=False,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -261,6 +319,7 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
       device_type="tpu7x-8",
       requires_workload_policy=True,
       supports_sub_slicing=False,
+      supports_super_slicing=False,
       docker_platform=DockerPlatform.ARM,
   )

@@ -294,9 +353,79 @@ def test_placement_policy_not_created_for_non7x_tpu(
       accelerator_type=AcceleratorType.TPU,
       device_type="v6e-4",
       supports_sub_slicing=True,
+      supports_super_slicing=False,
       docker_platform=DockerPlatform.ARM,
   )

   run_gke_node_pool_create_command(args, system, "1.2.3")

   mock_ensure_resource_policy.assert_not_called()
+
+
+@pytest.mark.parametrize(
+    argnames="error_message,is_stockout",
+    argvalues=[
+        (
+            (
+                "Requested resource is exhausted: Zone 'us-central1-c' is not"
+                " available. Please try another zone."
+            ),
+            True,
+        ),
+        (
+            (
+                "TPU: the nodes (in pool test-pool) cannot be created now due"
+                " to lack of capacity in your reservation. They will be created"
+                " asynchronously once capacity is available. You can either"
+                " wait for the nodes to be up, or delete the node pool and try"
+                " re-creating it again later"
+            ),
+            True,
+        ),
+        ("Generic error message", False),
+    ],
+)
+def test_display_nodepool_creation_error_handles_error_messages(
+    mocker, mock_xpk_print, error_message, is_stockout
+):
+  """Tests that display_nodepool_creation_error surfaces errors and detects stockouts."""
+
+  log_contents = """Operation [
+...
+] finished with error: """ + error_message + "\n"
+  mocker.patch("builtins.open", mocker.mock_open(read_data=log_contents))
+  display_nodepool_creation_error(maybe_failure)
+
+  assert mock_xpk_print.call_count == 3 if is_stockout else 2
+  assert (
+      mock_xpk_print.call_args_list[0].args[0]
+      == "Create Nodepools returned ERROR 1"
+  )
+  assert (
+      mock_xpk_print.call_args_list[1].args[0]
+      == "Nodepool creation error: " + error_message
+  )
+  assert (
+      not is_stockout
+      or mock_xpk_print.call_args_list[2].args[0]
+      == "NOTE: this error might be caused by a stockout"
+  )
+
+
+def test_display_nodepool_creation_ignores_logs_without_errors(
+    mocker,
+    mock_xpk_print,
+):
+  """Tests that display_nodepool_creation_error ignores log files with no errors."""
+
+  log_contents = """Operation [
+...
+] succeeded!"""
+  mocker.patch("builtins.open", mocker.mock_open(read_data=log_contents))
+  display_nodepool_creation_error(maybe_failure)
+
+  assert mock_xpk_print.call_count == 1
+  assert (
+      mock_xpk_print.call_args_list[0].args[0]
+      == "Create Nodepools returned ERROR 1"
+  )
xpk/core/pathways.py
CHANGED
@@ -325,9 +325,10 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
   if len(workloads) == 1:
     return_code = run_command_with_updates(commands[0], 'Delete Workload')
   else:
-
+    maybe_failure = run_commands(
         commands, 'Delete Workload', task_names, batch=100
     )
+    return_code = 0 if maybe_failure is None else maybe_failure.return_code

   if return_code != 0:
     xpk_print(f'Delete Workload request returned ERROR {return_code}')
xpk/core/resources.py
CHANGED
@@ -201,15 +201,15 @@ def _create_or_update_cluster_configmap(configmap_yml: dict[str, str]) -> int:
     task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
     task_names.append(task_name)

-
+  maybe_failure = run_commands(
       commands,
       'GKE Cluster CreateOrUpdate ConfigMap(s)',
       task_names,
   )
-  if
+  if maybe_failure is not None:
     xpk_print(
         'GKE Cluster Create/Update ConfigMap(s) request returned ERROR'
-        f' {return_code}'
+        f' {maybe_failure.return_code}'
     )
     return 1
   return 0