xpk 0.16.1__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. xpk/commands/cluster.py +48 -5
  2. xpk/commands/cluster_gcluster.py +3 -0
  3. xpk/commands/cluster_gcluster_test.py +2 -0
  4. xpk/commands/cluster_test.py +203 -0
  5. xpk/commands/common.py +6 -0
  6. xpk/commands/kind.py +2 -0
  7. xpk/commands/workload.py +35 -16
  8. xpk/commands/workload_test.py +1 -0
  9. xpk/core/capacity.py +83 -46
  10. xpk/core/capacity_test.py +82 -28
  11. xpk/core/commands.py +39 -12
  12. xpk/core/kueue_manager.py +42 -11
  13. xpk/core/kueue_manager_test.py +83 -3
  14. xpk/core/nap.py +5 -4
  15. xpk/core/nodepool.py +57 -20
  16. xpk/core/nodepool_test.py +152 -23
  17. xpk/core/pathways.py +2 -1
  18. xpk/core/resources.py +3 -3
  19. xpk/core/scheduling.py +54 -10
  20. xpk/core/scheduling_test.py +118 -13
  21. xpk/core/system_characteristics.py +41 -24
  22. xpk/core/system_characteristics_test.py +37 -4
  23. xpk/core/telemetry.py +5 -0
  24. xpk/core/telemetry_test.py +19 -2
  25. xpk/core/updates.py +1 -1
  26. xpk/main.py +2 -1
  27. xpk/parser/cluster.py +34 -2
  28. xpk/parser/cluster_test.py +117 -0
  29. xpk/parser/common.py +32 -0
  30. xpk/parser/common_test.py +49 -0
  31. xpk/templates/kueue_config.yaml.j2 +21 -5
  32. xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
  33. xpk/utils/kueue.py +6 -2
  34. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/METADATA +2 -1
  35. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/RECORD +39 -37
  36. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/WHEEL +0 -0
  37. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/entry_points.txt +0 -0
  38. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/licenses/LICENSE +0 -0
  39. {xpk-0.16.1.dist-info → xpk-0.17.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue_manager_test.py CHANGED
@@ -21,7 +21,7 @@ from pytest_mock import MockerFixture
  import yaml
  from unittest.mock import MagicMock, patch

- from xpk.core.kueue_manager import KueueConfig, KueueManager, has_sub_slicing_enabled
+ from xpk.core.kueue_manager import KueueConfig, KueueManager, has_sub_slicing_enabled, has_super_slicing_enabled
  from xpk.core.system_characteristics import GpuConfig, DockerPlatform, AcceleratorType, SystemCharacteristics, UserFacingNameToSystemCharacteristics
  from xpk.core.testing.commands_tester import CommandsTester
  from packaging.version import Version
@@ -35,6 +35,7 @@ TPU_SYSTEM: SystemCharacteristics = SystemCharacteristics(
      accelerator_type=AcceleratorType.TPU,
      device_type="v5p-8",
      supports_sub_slicing=False,
+     supports_super_slicing=False,
      docker_platform=DockerPlatform.ARM,
  )

@@ -44,6 +45,7 @@ KUEUE_CONFIG: KueueConfig = KueueConfig(
      cpu_limit=100,
      memory_limit="100Gi",
      configure_sub_slicing=False,
+     configure_super_slicing=False,
  )


@@ -370,6 +372,7 @@ def test_configure_generates_manifest_with_admission_checks_for_flex_single_slic
        KUEUE_CONFIG,
        num_slices=1,
        flex=True,
+       system=UserFacingNameToSystemCharacteristics["l4-1"],
    )

    kueue_manager.install_or_upgrade(kueue_config)
@@ -382,7 +385,7 @@ def test_configure_generates_manifest_with_admission_checks_for_flex_single_slic
    )
    assert (
        cluster_queue["spec"]["resourceGroups"][0]["flavors"][0]["name"]
-       == "1xv5p-8"
+       == "1xl4-1"
    )
    assert cluster_queue["spec"]["admissionChecks"][0] == "dws-prov"

@@ -406,6 +409,7 @@ def test_configure_generates_correct_manifest_with_gke_default_topology(
          accelerator_type=AcceleratorType.GPU,
          device_type="h100-mega-80gb-8",
          supports_sub_slicing=False,
+         supports_super_slicing=False,
          docker_platform=DockerPlatform.ARM,
          gpu_config=GpuConfig(requires_topology=True),
      ),
@@ -462,6 +466,47 @@ def test_configure_generates_correct_manifest_with_sub_slicing(
    assert actual_levels == expected_levels


+ @patch("xpk.core.kueue_manager.write_tmp_file")
+ def test_configure_generates_correct_manifest_with_super_slicing(
+     write_tmp_file_mock: MagicMock,
+     mock_commands: CommandsTester,
+     kueue_manager: KueueManager,
+ ):
+   """Test that __configure generates correct manifest with super-slicing topology."""
+   set_installed_kueue_version(mock_commands, None)
+   kueue_config = dataclasses.replace(
+       KUEUE_CONFIG,
+       configure_super_slicing=True,
+       system=UserFacingNameToSystemCharacteristics["tpu7x-4x4x4"],
+   )
+
+   kueue_manager.install_or_upgrade(kueue_config)
+
+   rendered_manifest: str = write_tmp_file_mock.call_args[0][0]
+   manifest_docs = list(yaml.safe_load_all(rendered_manifest))
+   resource_flavor = _first(
+       doc for doc in manifest_docs if doc["kind"] == "ResourceFlavor"
+   )
+   assert resource_flavor["spec"]["topologyName"] == "super-slice-topology"
+   assert resource_flavor["spec"]["nodeLabels"] == {
+       "cloud.google.com/gke-tpu-accelerator": "tpu7x",
+       "cloud.google.com/gke-tpu-partition-4x4x4-state": "HEALTHY",
+   }
+   topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
+   assert topology["metadata"]["name"] == "super-slice-topology"
+   expected_levels = [
+       "cloud.google.com/gce-topology-block",
+       "cloud.google.com/gke-tpu-partition-4x4x4-id",
+       "kubernetes.io/hostname",
+   ]
+   actual_levels = [level["nodeLabel"] for level in topology["spec"]["levels"]]
+   assert actual_levels == expected_levels
+   cluster_queue = _first(
+       doc for doc in manifest_docs if doc["kind"] == "ClusterQueue"
+   )
+   assert cluster_queue["spec"]["admissionChecks"][0] == "ss-kueue-operator"
+
+
  @patch("xpk.core.kueue_manager.write_tmp_file")
  def test_configure_generates_correct_manifest_with_pathways(
      write_tmp_file_mock: MagicMock,
@@ -549,7 +594,7 @@ def test_has_sub_slicing_enabled_returns_false_when_sub_slicing_topology_is_not_
    assert result is False


- def test_has_sub_slicing_enabled_returns_true_when_sub_slicing_topology_is_not_present(
+ def test_has_sub_slicing_enabled_returns_true_when_sub_slicing_topology_is_present(
      mock_commands: CommandsTester,
  ):
    mock_commands.set_result_for_command(
@@ -562,6 +607,41 @@ def test_has_sub_slicing_enabled_returns_true_when_sub_slicing_topology_is_not_p
    assert result is True


+ def test_has_super_slicing_enabled_returns_exit_code_when_command_fails(
+     mock_commands: CommandsTester,
+ ):
+   mock_commands.set_result_for_command((1, ""), "kubectl get topology")
+
+   return_code, result = has_super_slicing_enabled()
+
+   assert return_code == 1
+   assert result is None
+
+
+ def test_has_super_slicing_enabled_returns_false_when_super_slicing_topology_is_not_present(
+     mock_commands: CommandsTester,
+ ):
+   mock_commands.set_result_for_command((0, ""), "kubectl get topology")
+
+   return_code, result = has_super_slicing_enabled()
+
+   assert return_code == 0
+   assert result is False
+
+
+ def test_has_super_slicing_enabled_returns_true_when_super_slicing_topology_is_present(
+     mock_commands: CommandsTester,
+ ):
+   mock_commands.set_result_for_command(
+       (0, "super-slice-topology"), "kubectl get topology"
+   )
+
+   return_code, result = has_super_slicing_enabled()
+
+   assert return_code == 0
+   assert result is True
+
+
  T = TypeVar("T")
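
Reviewer note: the three tests above pin down the contract of the new has_super_slicing_enabled helper. It shells out to `kubectl get topology`, propagates a failing exit code alongside a None result, and otherwise reports whether the super-slicing Topology object is present. A minimal sketch consistent with those tests (the command runner is parameterized here as an assumption; the real helper in xpk/core/kueue_manager.py uses xpk's own command plumbing and may differ):

    from typing import Optional, Tuple

    def has_super_slicing_enabled_sketch(
        run_command,  # assumption: callable taking a command string, returning (return_code, stdout)
    ) -> Tuple[int, Optional[bool]]:
      """Hypothetical reconstruction from the tests above; not the shipped code."""
      return_code, output = run_command('kubectl get topology')
      if return_code != 0:
        # Command failed: surface the exit code, with no answer either way.
        return return_code, None
      # Enabled exactly when the super-slicing Topology appears in the output.
      return 0, 'super-slice-topology' in output

    # The sketch reproduces all three cases exercised above:
    assert has_super_slicing_enabled_sketch(lambda _: (1, '')) == (1, None)
    assert has_super_slicing_enabled_sketch(lambda _: (0, '')) == (0, False)
    assert has_super_slicing_enabled_sketch(
        lambda _: (0, 'super-slice-topology')
    ) == (0, True)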
xpk/core/nap.py CHANGED
@@ -147,17 +147,18 @@ def enable_autoprovisioning_on_cluster(

    for i, command in enumerate(commands):
      xpk_print(f'To complete {task_names[i]} we are executing {command}')
-   max_return_code = run_commands(
+   maybe_failure = run_commands(
        commands,
        'Update node pools with autoprovisioning support',
        task_names,
    )
-   if max_return_code != 0:
+   if maybe_failure is not None:
      xpk_print(
          'Update node pools with autoprovisioning support returned ERROR:'
-         f' {max_return_code}'
+         f' {maybe_failure.return_code}'
      )
-     return None, max_return_code
+     return None, maybe_failure.return_code
+
    return autoprovisioning_config, return_code
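
Reviewer note: this hunk shows the call-site migration repeated throughout the release: run_commands now returns a failure object (or None on success) instead of a maximum return code, so callers check `is not None` rather than `!= 0`. Judging from the fields used across this diff and constructed verbatim in nodepool_test.py below, a plausible shape of the new type and the adapter pattern is (a sketch under those assumptions, not the actual xpk/core/commands.py definition):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class FailedCommand:
      # Field set inferred from this diff; details may differ upstream.
      return_code: int
      name: str
      command: str
      logfile: str

    def to_legacy_return_code(maybe_failure: Optional[FailedCommand]) -> int:
      """Adapter for call sites that still need an int (see pathways.py below)."""
      return 0 if maybe_failure is None else maybe_failure.return_code

    assert to_legacy_return_code(None) == 0
    assert to_legacy_return_code(
        FailedCommand(return_code=1, name='np', command='gcloud', logfile='/tmp/np.log')
    ) == 1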
xpk/core/nodepool.py CHANGED
@@ -15,6 +15,8 @@ limitations under the License.
  """

  from typing import List
+
+ from ..utils.feature_flags import FeatureFlags
  from ..utils.console import ask_for_user_consent, xpk_print
  from .scheduling import get_placement_policy_name, is_placement_policy_supported
  from .capacity import (
@@ -25,14 +27,14 @@ from .capacity import (
      get_capacity_type,
      print_reservations,
  )
- from .commands import run_command_for_value, run_commands
+ from .commands import run_command_for_value, run_commands, FailedCommand
  from .gcloud_context import GkeServerConfig, get_cluster_location, zone_to_region
  from .resources import (
      ConfigMapType,
      check_cluster_resources,
      update_cluster_configmap,
  )
- from .system_characteristics import AcceleratorType
+ from .system_characteristics import AcceleratorType, SystemCharacteristics


  CLOUD_PLATFORM_AUTH_SCOPE_URL = (
@@ -43,7 +45,7 @@ OLDER_PATHWAYS_CPU_NP_TO_DELETE = ['cpu-rm-np', 'cpu-proxy-np', 'cpu-user-np']


  def run_gke_node_pool_create_command(
-     args, system, gke_node_pool_version
+     args, system: SystemCharacteristics, gke_node_pool_version: str
  ) -> int:
    """Run the Create GKE Node Pool request.

@@ -84,7 +86,7 @@ def run_gke_node_pool_create_command(
    else:
      max_nodes = 1000
    capacity_args, return_code = get_capacity_arguments_from_capacity_type(
-       args, capacity_type, max_nodes
+       args, capacity_type, max_nodes, system.accelerator_type
    )
    if return_code > 0:
      xpk_print('Parsing capacity arguments failed!')
@@ -200,13 +202,13 @@ def run_gke_node_pool_create_command(
      xpk_print(
          f'To complete {delete_task_names[i]} we are executing {command}'
      )
-   max_return_code = run_commands(
+   maybe_failure = run_commands(
        delete_commands,
        'Delete Nodepools',
        delete_task_names,
    )
-   if max_return_code != 0:
-     xpk_print(f'Delete Nodepools returned ERROR {max_return_code}')
+   if maybe_failure is not None:
+     xpk_print(f'Delete Nodepools returned ERROR {maybe_failure.return_code}')
      return 1

    # Enable Workload Identity on existing Nodepools
@@ -224,15 +226,15 @@ def run_gke_node_pool_create_command(
      xpk_print(
          f'To complete {update_WI_task_names[i]} we are executing {command}'
      )
-   max_return_code = run_commands(
+   maybe_failure = run_commands(
        update_WI_commands,
        'Enable Workload Identity on existing Nodepools',
        update_WI_task_names,
    )
-   if max_return_code != 0:
+   if maybe_failure is not None:
      xpk_print(
          'Enable Workload Identity on existing Nodepools returned ERROR'
-         f' {max_return_code}'
+         f' {maybe_failure.return_code}'
      )
      return 1

@@ -256,12 +258,17 @@ def run_gke_node_pool_create_command(

    placement_args = ''
    if is_placement_policy_supported(system):
-     placement_policy = get_placement_policy_name(system)
+     super_slicing = FeatureFlags.SUPER_SLICING_ENABLED and args.super_slicing
+     placement_policy = get_placement_policy_name(
+         system,
+         super_slicing,
+     )
      ensure_resource_policy_exists(
          resource_policy_name=placement_policy,
          project=args.project,
          zone=args.zone,
          topology=system.topology,
+         super_slicing=super_slicing,
      )
      placement_args = f' --placement-policy={placement_policy}'

@@ -358,19 +365,41 @@ def run_gke_node_pool_create_command(

    for i, command in enumerate(create_commands):
      xpk_print(f'To complete {create_task_names[i]} we are executing {command}')
-   max_return_code = run_commands(
+   maybe_failure = run_commands(
        create_commands,
        'Create Nodepools',
        create_task_names,
    )
-   if max_return_code != 0:
-     xpk_print(f'Create Nodepools returned ERROR {max_return_code}')
+   if maybe_failure is not None:
+     display_nodepool_creation_error(maybe_failure)
      return 1

    xpk_print('Create or delete node pool request complete.')
    return 0


+ def display_nodepool_creation_error(maybe_failure: FailedCommand) -> None:
+   """Display nodepool creation errors to the user."""
+
+   xpk_print(f'Create Nodepools returned ERROR {maybe_failure.return_code}')
+   try:
+     with open(maybe_failure.logfile, 'r', encoding='utf-8') as f:
+       contents = f.read()
+     error_marker = 'finished with error:'
+     error = contents[contents.index(error_marker) + len(error_marker) :].strip()
+     # the longest error we're expecting to see is 256 characters + np name
+     max_error_display_length = 400
+     xpk_print(f'Nodepool creation error: {error[:max_error_display_length]}')
+     if (
+         error.find('lack of capacity') != -1
+         or error.find('Requested resource is exhausted') != -1
+     ):
+       xpk_print('NOTE: this error might be caused by a stockout')
+   except (FileNotFoundError, IOError, ValueError):
+     # silently ignore any log parsing errors
+     pass
+
+
  def get_node_pools_to_delete(
      args, system, existing_node_pool_names, desired_node_pool_names
  ) -> list:
@@ -585,18 +614,22 @@ def get_desired_node_pool_names(
    while len(result) < desired_node_pool_count:
      result.add(f'{cluster_name}-np-{i}')
      i += 1
-   return list(result)
+   return list(sorted(result))


  def ensure_resource_policy_exists(
-     resource_policy_name: str, project: str, zone: str, topology: str
+     resource_policy_name: str,
+     project: str,
+     zone: str,
+     topology: str,
+     super_slicing: bool,
  ) -> None:
    return_code, _ = run_command_for_value(
        (
            'gcloud compute resource-policies describe'
-           f' {resource_policy_name} '
-           f'--project={project} '
-           f'--region={zone_to_region(zone)}'
+           f' {resource_policy_name}'
+           f' --project={project}'
+           f' --region={zone_to_region(zone)}'
        ),
        'Retrieve resource policy',
    )
@@ -604,11 +637,15 @@ def ensure_resource_policy_exists(
    if return_code == 0:
      return

+   # TODO: b/465696970 - Verify the flag below before launching SUPER_SLICING:
+   accelerator_topology_mode = (
+       ' --accelerator-topology-mode=PROVISION_ONLY' if super_slicing else ''
+   )
    return_code, _ = run_command_for_value(
        (
            'gcloud compute resource-policies create workload-policy'
            f' {resource_policy_name} --project={project} --region={zone_to_region(zone)} --type=HIGH_THROUGHPUT'
-           f' --accelerator-topology={topology}'
+           f' --accelerator-topology={topology}{accelerator_topology_mode}'
        ),
        'Create resource policy',
    )
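
Reviewer note: the super-slicing branch only changes the trailing flags of the create call, and the PROVISION_ONLY flag is still marked TODO for verification upstream. A self-contained reproduction of the string assembly (helper name hypothetical; the real code inlines these f-strings in ensure_resource_policy_exists):

    def build_create_policy_command(
        resource_policy_name: str,
        project: str,
        region: str,
        topology: str,
        super_slicing: bool,
    ) -> str:
      # Mirrors the f-strings in ensure_resource_policy_exists above.
      accelerator_topology_mode = (
          ' --accelerator-topology-mode=PROVISION_ONLY' if super_slicing else ''
      )
      return (
          'gcloud compute resource-policies create workload-policy'
          f' {resource_policy_name} --project={project} --region={region}'
          ' --type=HIGH_THROUGHPUT'
          f' --accelerator-topology={topology}{accelerator_topology_mode}'
      )

    # Without super-slicing the mode flag is omitted entirely:
    assert '--accelerator-topology-mode' not in build_create_policy_command(
        'rp', 'proj', 'us-central1', '2x2x1', super_slicing=False
    )
    assert build_create_policy_command(
        'rp', 'proj', 'us-central1', '2x2x1', super_slicing=True
    ).endswith(' --accelerator-topology-mode=PROVISION_ONLY')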
xpk/core/nodepool_test.py CHANGED
@@ -16,13 +16,23 @@ limitations under the License.

  import pytest
  from xpk.core.nodepool import (
+     display_nodepool_creation_error,
      ensure_resource_policy_exists,
      get_desired_node_pool_names,
      run_gke_node_pool_create_command,
  )
  from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
+ from xpk.core.commands import FailedCommand
+ from xpk.core.testing.commands_tester import CommandsTester
+

  CLUSTER_NAME = "running-cucumber"
+ maybe_failure = FailedCommand(
+     return_code=1,
+     name="create-nodepool",
+     command="test-command",
+     logfile="logfile_path",
+ )


  def node_pool_name(number: int) -> str:
@@ -88,61 +98,107 @@ def test_compute_desired_node_pool_names_with_unknown_node_pools():
    assert set(result) == set(expected_result)


+ @pytest.fixture
+ def commands_tester(mocker):
+   return CommandsTester(
+       mocker,
+       run_command_for_value_path="xpk.core.nodepool.run_command_for_value",
+   )
+
+
  def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_policy(
-     mocker,
+     commands_tester: CommandsTester,
  ):
-   args = mocker.Mock(project="test-project", zone="us-central1-a")
-   mocker.patch("xpk.core.nodepool.get_cluster_location", return_value=args.zone)
-   mock = mocker.patch(
-       "xpk.core.nodepool.run_command_for_value", return_value=(0, "")
-   )
    ensure_resource_policy_exists(
        resource_policy_name="resource-policy",
        project="test-project",
        zone="us-central1-a",
        topology="2x2x1",
+       super_slicing=False,
+   )
+
+   assert len(commands_tester.commands_history) == 1
+   commands_tester.assert_command_run(
+       "gcloud compute resource-policies describe resource-policy",
+       "--project=test-project",
+       "--region=us-central1",
    )
-   mock.assert_called_once()


  def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
-     mocker,
+     commands_tester: CommandsTester,
  ):
-   args = mocker.Mock(project="test-project", zone="us-central1-a")
-   mocker.patch("xpk.core.nodepool.get_cluster_location", return_value=args.zone)
-   mock = mocker.patch(
-       "xpk.core.nodepool.run_command_for_value", side_effect=[(1, ""), (0, "")]
+   commands_tester.set_result_for_command(
+       (1, ""), "gcloud compute resource-policies describe"
    )
+
    ensure_resource_policy_exists(
        resource_policy_name="resource-policy",
        project="test-project",
        zone="us-central1-a",
        topology="2x2x1",
+       super_slicing=False,
+   )
+
+   assert len(commands_tester.commands_history) == 2
+   commands_tester.assert_command_run(
+       "gcloud compute resource-policies describe"
+   )
+   commands_tester.assert_command_run(
+       "gcloud compute resource-policies create workload-policy resource-policy",
+       "--project=test-project",
+       "--region=us-central1",
+       "--accelerator-topology=2x2x1",
+   )
+   commands_tester.assert_command_not_run(
+       "gcloud compute resource-policies create workload-policy",
+       "--accelerator-topology-mode",
+   )
+
+
+ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_for_super_slicing(
+     commands_tester: CommandsTester,
+ ):
+   commands_tester.set_result_for_command(
+       (1, ""), "gcloud compute resource-policies describe"
+   )
+
+   ensure_resource_policy_exists(
+       resource_policy_name="ss-resource-policy",
+       project="test-project",
+       zone="us-central1-a",
+       topology="2x2x1",
+       super_slicing=True,
+   )
+
+   commands_tester.assert_command_run(
+       "gcloud compute resource-policies create workload-policy",
+       "--accelerator-topology-mode",
    )
-   assert mock.call_count == 2
-   assert mock.call_args_list[0].args[1] == "Retrieve resource policy"


  def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creation_fails(
-     mocker,
+     commands_tester: CommandsTester,
  ):
    with pytest.raises(RuntimeError):
-     args = mocker.Mock(project="test-project", zone="us-central1-a")
-     mocker.patch(
-         "xpk.core.nodepool.get_cluster_location", return_value=args.zone
-     )
-     mocker.patch(
-         "xpk.core.nodepool.run_command_for_value",
-         side_effect=[(1, ""), (1, "")],
+     commands_tester.set_result_for_command(
+         (1, ""), "gcloud compute resource-policies"
      )
+
      ensure_resource_policy_exists(
          resource_policy_name="resource-policy",
          project="test-project",
          zone="us-central1-a",
          topology="2x2x1",
+         super_slicing=False,
      )


+ @pytest.fixture
+ def mock_xpk_print(mocker):
+   return mocker.patch("xpk.core.nodepool.xpk_print")
+
+
  @pytest.fixture
  def mock_nodepool_dependencies(mocker):
    """Mocks dependencies for run_gke_node_pool_create_command."""
@@ -159,7 +215,7 @@ def mock_nodepool_dependencies(mocker):
    mocker.patch(
        "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
    )
-   mocker.patch("xpk.core.nodepool.run_commands", return_value=0)
+   mocker.patch("xpk.core.nodepool.run_commands", return_value=None)
    mocker.patch("xpk.core.nodepool.ask_for_user_consent", return_value=True)
    mock_is_placement_policy_supported = mocker.patch(
        "xpk.core.nodepool.is_placement_policy_supported"
@@ -194,6 +250,7 @@ def test_placement_policy_created_for_gpu_with_valid_topology(
        accelerator_type=AcceleratorType.GPU,
        device_type="h100-80gb-8",
        supports_sub_slicing=False,
+       supports_super_slicing=False,
        docker_platform=DockerPlatform.ARM,
        gpu_config=GpuConfig(requires_topology=True),
    )
@@ -226,6 +283,7 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
        accelerator_type=AcceleratorType.GPU,
        device_type="h100-80gb-8",
        supports_sub_slicing=False,
+       supports_super_slicing=False,
        docker_platform=DockerPlatform.ARM,
        gpu_config=GpuConfig(requires_topology=True),
    )
@@ -261,6 +319,7 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
        device_type="tpu7x-8",
        requires_workload_policy=True,
        supports_sub_slicing=False,
+       supports_super_slicing=False,
        docker_platform=DockerPlatform.ARM,
    )

@@ -294,9 +353,79 @@ def test_placement_policy_not_created_for_non7x_tpu(
        accelerator_type=AcceleratorType.TPU,
        device_type="v6e-4",
        supports_sub_slicing=True,
+       supports_super_slicing=False,
        docker_platform=DockerPlatform.ARM,
    )

    run_gke_node_pool_create_command(args, system, "1.2.3")

    mock_ensure_resource_policy.assert_not_called()
+
+
+ @pytest.mark.parametrize(
+     argnames="error_message,is_stockout",
+     argvalues=[
+         (
+             (
+                 "Requested resource is exhausted: Zone 'us-central1-c' is not"
+                 " available. Please try another zone."
+             ),
+             True,
+         ),
+         (
+             (
+                 "TPU: the nodes (in pool test-pool) cannot be created now due"
+                 " to lack of capacity in your reservation. They will be created"
+                 " asynchronously once capacity is available. You can either"
+                 " wait for the nodes to be up, or delete the node pool and try"
+                 " re-creating it again later"
+             ),
+             True,
+         ),
+         ("Generic error message", False),
+     ],
+ )
+ def test_display_nodepool_creation_error_handles_error_messages(
+     mocker, mock_xpk_print, error_message, is_stockout
+ ):
+   """Tests that display_nodepool_creation_error surfaces errors and detects stockouts."""
+
+   log_contents = """Operation [
+ ...
+ ] finished with error: """ + error_message + "\n"
+   mocker.patch("builtins.open", mocker.mock_open(read_data=log_contents))
+   display_nodepool_creation_error(maybe_failure)
+
+   assert mock_xpk_print.call_count == 3 if is_stockout else 2
+   assert (
+       mock_xpk_print.call_args_list[0].args[0]
+       == "Create Nodepools returned ERROR 1"
+   )
+   assert (
+       mock_xpk_print.call_args_list[1].args[0]
+       == "Nodepool creation error: " + error_message
+   )
+   assert (
+       not is_stockout
+       or mock_xpk_print.call_args_list[2].args[0]
+       == "NOTE: this error might be caused by a stockout"
+   )
+
+
+ def test_display_nodepool_creation_ignores_logs_without_errors(
+     mocker,
+     mock_xpk_print,
+ ):
+   """Tests that display_nodepool_creation_error ignores log files with no errors."""
+
+   log_contents = """Operation [
+ ...
+ ] succeeded!"""
+   mocker.patch("builtins.open", mocker.mock_open(read_data=log_contents))
+   display_nodepool_creation_error(maybe_failure)
+
+   assert mock_xpk_print.call_count == 1
+   assert (
+       mock_xpk_print.call_args_list[0].args[0]
+       == "Create Nodepools returned ERROR 1"
+   )
xpk/core/pathways.py CHANGED
@@ -325,9 +325,10 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
    if len(workloads) == 1:
      return_code = run_command_with_updates(commands[0], 'Delete Workload')
    else:
-     return_code = run_commands(
+     maybe_failure = run_commands(
          commands, 'Delete Workload', task_names, batch=100
      )
+     return_code = 0 if maybe_failure is None else maybe_failure.return_code

    if return_code != 0:
      xpk_print(f'Delete Workload request returned ERROR {return_code}')
xpk/core/resources.py CHANGED
@@ -201,15 +201,15 @@ def _create_or_update_cluster_configmap(configmap_yml: dict[str, str]) -> int:
      task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
      task_names.append(task_name)

-   return_code = run_commands(
+   maybe_failure = run_commands(
        commands,
        'GKE Cluster CreateOrUpdate ConfigMap(s)',
        task_names,
    )
-   if return_code != 0:
+   if maybe_failure is not None:
      xpk_print(
          'GKE Cluster Create/Update ConfigMap(s) request returned ERROR'
-         f' {return_code}'
+         f' {maybe_failure.return_code}'
      )
      return 1
    return 0