xpk 0.17.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. xpk/commands/cluster.py +4 -35
  2. xpk/commands/cluster_gcluster.py +1 -13
  3. xpk/commands/cluster_gcluster_test.py +2 -10
  4. xpk/commands/cluster_test.py +0 -4
  5. xpk/commands/workload.py +10 -3
  6. xpk/commands/workload_test.py +1 -0
  7. xpk/core/cluster.py +10 -9
  8. xpk/core/config.py +5 -17
  9. xpk/core/kueue_manager_test.py +2 -0
  10. xpk/core/nodepool.py +6 -0
  11. xpk/core/nodepool_test.py +4 -0
  12. xpk/core/scheduling.py +28 -3
  13. xpk/core/scheduling_test.py +38 -1
  14. xpk/core/system_characteristics.py +39 -16
  15. xpk/core/system_characteristics_test.py +11 -0
  16. xpk/core/workload_decorators/rdma_decorator.py +0 -15
  17. xpk/core/workload_decorators/tcpx_decorator.py +0 -8
  18. xpk/core/workload_decorators/tcpx_decorator_test.py +0 -78
  19. xpk/core/workload_decorators/tcpxo_decorator.py +0 -16
  20. xpk/parser/common.py +0 -17
  21. xpk/parser/core.py +0 -39
  22. xpk/parser/storage.py +0 -11
  23. xpk/utils/feature_flags.py +1 -1
  24. xpk/utils/validation.py +0 -8
  25. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/METADATA +15 -4
  26. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/RECORD +30 -41
  27. xpk/commands/batch.py +0 -144
  28. xpk/commands/job.py +0 -244
  29. xpk/commands/kind.py +0 -286
  30. xpk/commands/kjob_common.py +0 -60
  31. xpk/commands/run.py +0 -140
  32. xpk/commands/shell.py +0 -142
  33. xpk/parser/batch.py +0 -43
  34. xpk/parser/job.py +0 -147
  35. xpk/parser/kind.py +0 -95
  36. xpk/parser/run.py +0 -47
  37. xpk/parser/shell.py +0 -59
  38. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/WHEEL +0 -0
  39. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/entry_points.txt +0 -0
  40. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/licenses/LICENSE +0 -0
  41. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py CHANGED
@@ -49,7 +49,6 @@ from ..core.gcloud_context import (
     zone_to_region,
 )
 from ..core.jobset import update_jobset_resources_if_necessary
-from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
 from ..core.kueue_manager import (KueueConfig, KueueManager)
 from ..core.nap import enable_autoprovisioning_on_cluster
 from ..core.network import (
@@ -98,7 +97,6 @@ def cluster_adapt(args) -> None:
   if should_validate_dependencies(args):
     validate_dependencies_list([
         SystemDependency.KUBECTL,
-        SystemDependency.KJOB,
         SystemDependency.GCLOUD,
     ])
   args.enable_pathways = False
@@ -188,7 +186,6 @@ def cluster_adapt(args) -> None:
   if install_kueue_code != 0:
     xpk_exit(install_kueue_code)

-  install_kjob(args)
   if system.accelerator_type == AcceleratorType.GPU:
     prepare_gpus(system)

@@ -308,7 +305,6 @@ def cluster_create(args) -> None:
   if should_validate_dependencies(args):
     validate_dependencies_list([
         SystemDependency.KUBECTL,
-        SystemDependency.KJOB,
         SystemDependency.GCLOUD,
     ])

@@ -455,8 +451,6 @@ def cluster_create(args) -> None:
   if install_kueue_code != 0:
     xpk_exit(install_kueue_code)

-  install_kjob(args)
-
   if system.accelerator_type == AcceleratorType.GPU:
     prepare_gpus(system)

@@ -1239,29 +1233,20 @@ def run_gke_cluster_create_command(
       ' --autoscaling-profile=optimize-utilization'
       ' --labels=gke_product_type=xpk'
       f' --release-channel={release_channel.value.lower()}'
+      ' --enable-ip-alias'
+      ' --enable-dataplane-v2'
+      ' --enable-multi-networking'
   )

   if args.gke_version:
     command += ' --no-enable-autoupgrade'

-  enable_ip_alias = False
-
   if args.private or args.authorized_networks is not None:
-    enable_ip_alias = True
     command += ' --enable-master-authorized-networks --enable-private-nodes'

-  if system.accelerator_type == AcceleratorType.GPU:
-    enable_ip_alias = True
-    command += ' --enable-dataplane-v2 --enable-multi-networking'
-  else:
+  if system.accelerator_type != AcceleratorType.GPU:
     command += ' --location-policy=BALANCED --scopes=storage-full,gke-default'

-  if args.enable_pathways:
-    enable_ip_alias = True
-
-  if enable_ip_alias:
-    command += ' --enable-ip-alias'
-
   if args.enable_ray_cluster:
     command += ' --addons RayOperator'

@@ -1343,22 +1328,6 @@ def install_storage_csis(args):
     xpk_exit(update_cluster_command_code)


-def install_kjob(args):
-  xpk_print('Verifying kjob installation')
-  err_code = verify_kjob_installed()
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  xpk_print('Applying kjob CDRs')
-  err_code = apply_kjob_crds()
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  err_code = prepare_kjob(args)
-  if err_code > 0:
-    xpk_exit(err_code)
-
-
 def _install_kueue(
     args,
     system: SystemCharacteristics,
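
Note: the run_gke_cluster_create_command hunk above changes behavior as well as shape: --enable-ip-alias, --enable-dataplane-v2, and --enable-multi-networking are now passed unconditionally, where previously the latter two were GPU-only and ip-alias depended on private networking, GPU, or Pathways. A minimal sketch of the old vs. new logic, reduced to pure functions (parameter names are mine, not xpk's):

def old_network_flags(is_gpu: bool, is_private: bool, pathways: bool) -> set:
  flags = set()
  if is_gpu:
    flags |= {'--enable-dataplane-v2', '--enable-multi-networking'}
  if is_gpu or is_private or pathways:
    flags.add('--enable-ip-alias')
  return flags

def new_network_flags(is_gpu: bool, is_private: bool, pathways: bool) -> set:
  # All three flags are now unconditional in run_gke_cluster_create_command.
  return {
      '--enable-ip-alias',
      '--enable-dataplane-v2',
      '--enable-multi-networking',
  }

# A plain TPU cluster previously received none of these flags:
assert old_network_flags(False, False, False) == set()
assert new_network_flags(False, False, False) == {
    '--enable-ip-alias',
    '--enable-dataplane-v2',
    '--enable-multi-networking',
}
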
xpk/commands/cluster_gcluster.py CHANGED
@@ -38,7 +38,6 @@ from ..core.commands import run_command_for_value
 from ..core.docker_manager import DockerManager
 from ..core.gcloud_context import zone_to_region
 from ..core.gcluster_manager import GclusterManager
-from ..core.kjob import apply_kjob_crds, prepare_kjob
 from ..core.remote_state.fuse_remote_state import FuseStateClient
 from ..core.remote_state.remote_state_client import RemoteStateClient
 from ..utils.console import xpk_exit, xpk_print
@@ -112,18 +111,7 @@ def cluster_create(
   get_cluster_credentials(args)

   err_code = __install_kueue(args)
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  err_code = apply_kjob_crds()
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  err_code = prepare_kjob(args)
-  if err_code > 0:
-    xpk_exit(err_code)
-
-  xpk_exit(0)
+  xpk_exit(err_code)


 def __install_kueue(args) -> int:
xpk/commands/cluster_gcluster_test.py CHANGED
@@ -46,8 +46,6 @@ def mock_cluster_create_deps(request):
   """Mocks dependencies for cluster_create."""
   with (
       patch("xpk.commands.cluster_gcluster.xpk_exit") as mock_exit,
-      patch("xpk.commands.cluster_gcluster.prepare_kjob") as mock_prep_kjob,
-      patch("xpk.commands.cluster_gcluster.apply_kjob_crds") as mock_apply_kjob,
       patch(
           "xpk.commands.cluster_gcluster.get_cluster_credentials"
       ) as mock_get_creds,
@@ -68,8 +66,6 @@ def mock_cluster_create_deps(request):
   ):
     yield {
         "xpk_exit": mock_exit,
-        "prepare_kjob": mock_prep_kjob,
-        "apply_kjob_crds": mock_apply_kjob,
         "get_cluster_credentials": mock_get_creds,
         "generate_blueprint": mock_gen_bp,
         "prepare_gcluster_manager": mock_prep_gcm,
@@ -85,9 +81,6 @@ def test_install_kueue_standard(
     mock_get_total_chips, mock_args, mock_cluster_create_deps
 ):
   """Tests __install_kueue for a standard installation."""
-  mock_cluster_create_deps["prepare_kjob"].return_value = 0
-  mock_cluster_create_deps["apply_kjob_crds"].return_value = 0
-
   mock_system = SystemCharacteristics(
       topology="N/A",
       vms_per_slice=1,
@@ -98,6 +91,7 @@ def test_install_kueue_standard(
       device_type="h100-mega-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -138,9 +132,6 @@ def test_install_kueue_with_autoprovisioning(
     mock_enable_autoprovisioning, mock_args, mock_cluster_create_deps
 ):
   """Tests __install_kueue with autoprovisioning enabled."""
-  mock_cluster_create_deps["prepare_kjob"].return_value = 0
-  mock_cluster_create_deps["apply_kjob_crds"].return_value = 0
-
   mock_args.enable_autoprovisioning = True
   mock_system = SystemCharacteristics(
       topology="N/A",
@@ -152,6 +143,7 @@ def test_install_kueue_with_autoprovisioning(
       device_type="h100-mega-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
xpk/commands/cluster_test.py CHANGED
@@ -56,7 +56,6 @@ class _ClusterCreateMocks:
   create_cluster_configmaps: MagicMock
   set_jobset_on_cluster: MagicMock
   get_cluster_location: MagicMock
-  install_kjob: MagicMock
   xpk_exit: MagicMock
   update_jobset_resources_if_necessary: MagicMock
   _install_kueue: MagicMock
@@ -204,9 +203,6 @@ def cluster_create_mocks(mocker) -> _ClusterCreateMocks:
           'xpk.commands.cluster.get_cluster_location',
           return_value='us-central1',
       ),
-      install_kjob=mocker.patch(
-          'xpk.commands.cluster.install_kjob', return_value=0
-      ),
       xpk_exit=mocker.patch('xpk.commands.cluster.xpk_exit'),
       update_jobset_resources_if_necessary=mocker.patch(
           'xpk.commands.cluster.update_jobset_resources_if_necessary',
xpk/commands/workload.py CHANGED
@@ -57,6 +57,7 @@ from ..core.scheduling import (
     WorkloadScheduling,
     check_if_workload_can_schedule,
     create_tpu_machine_type,
+    create_tpu_slice_topology_annotation,
     create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
@@ -132,7 +133,7 @@ spec:
       annotations:
         {storage_annotations}
         {sub_slicing_annotations}
-        {annotations_machine_label}
+        {tpu_slice_topology_annotation}
     spec:
       schedulerName: {args.scheduler}
       imagePullSecrets:
@@ -518,6 +519,8 @@ def workload_create(args) -> None:
       workload_system, super_slicing=False
   )

+  # TODO(b/466943057): Add ANP label for NAP (if not possible, use CCC)
+
   # Create the workload file based on accelerator type or workload type.
   if workload_system.accelerator_type == AcceleratorType.GPU:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -640,7 +643,11 @@ def workload_create(args) -> None:
       else create_machine_label(workload_system)
   )
   node_selector_machine_label = machine_label if not use_super_slicing else ''
-  annotations_machine_label = machine_label if use_super_slicing else ''
+  tpu_slice_topology_annotation = (
+      create_tpu_slice_topology_annotation(workload_system.topology)
+      if use_super_slicing
+      else ''
+  )

   yml_string = WORKLOAD_CREATE_YAML.format(
       args=args,
@@ -657,7 +664,7 @@ def workload_create(args) -> None:
       ),
       placement_policy_label=placement_policy_label,
       node_selector_machine_label=node_selector_machine_label,
-      annotations_machine_label=annotations_machine_label,
+      tpu_slice_topology_annotation=tpu_slice_topology_annotation,
       local_queue_name=LOCAL_QUEUE_NAME,
       autoprovisioning_args=autoprovisioning_args,
       volumes=get_volumes(args, workload_system),
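
Note: together these workload.py hunks replace the machine label in the pod template annotations with an explicit TPU slice topology annotation for super-slicing workloads. A small illustration of the rendered value, reusing the helper exactly as it appears in scheduling.py below (the topology value is assumed):

# Illustration: the value substituted for {tpu_slice_topology_annotation}
# when super-slicing is in use. Helper copied from scheduling.py.
def create_tpu_slice_topology_annotation(workload_topology: str) -> str:
  return f'cloud.google.com/gke-tpu-slice-topology: {workload_topology}'

use_super_slicing = True
annotation = (
    create_tpu_slice_topology_annotation('4x4x8') if use_super_slicing else ''
)
print(annotation)  # cloud.google.com/gke-tpu-slice-topology: 4x4x8
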
xpk/commands/workload_test.py CHANGED
@@ -36,6 +36,7 @@ SYSTEM_CHARACTERISTICS = SystemCharacteristics(
     supports_sub_slicing=True,
     supports_super_slicing=False,
     requires_workload_policy=False,
+    supports_accelerator_network_profile=False,
     docker_platform=DockerPlatform.AMD,
 )

xpk/core/cluster.py CHANGED
@@ -391,14 +391,13 @@ def project_id_to_project_number(project_id: str) -> str:


 def setup_k8s_env(args) -> k8s_client.ApiClient:
-  if not getattr(args, 'kind_cluster', False):
-    add_zone_and_project(args)
-    get_cluster_credentials(args)
-    args.project_number = (
-        project_id_to_project_number(args.project)
-        if not args.dry_run
-        else abs(hash(args.project) % (10**12))  # 12 digit hash
-    )
+  add_zone_and_project(args)
+  get_cluster_credentials(args)
+  args.project_number = (
+      project_id_to_project_number(args.project)
+      if not args.dry_run
+      else abs(hash(args.project) % (10**12))  # 12 digit hash
+  )

   config.load_kube_config()
   return k8s_client.ApiClient()
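
Note: with the kind-cluster branch removed, the dry-run fallback for the project number always applies. One caveat about that expression: Python randomizes str hashing per process (PYTHONHASHSEED), so the placeholder differs between runs, and % (10**12) bounds it to at most 12 digits rather than guaranteeing exactly 12. A quick illustration (project ID assumed):

# Illustration of the dry-run placeholder: always below 10**12, i.e. at
# most 12 digits, and different per interpreter run because str hashing
# is randomized (PYTHONHASHSEED).
project = 'my-example-project'  # assumed project ID
project_number = abs(hash(project) % (10**12))
assert 0 <= project_number < 10**12
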
@@ -717,8 +716,10 @@ def get_cluster_credentials(args) -> int:
       location=location,
       dns_endpoint=True,
   )
+  if return_code != 0:
+    return return_code

-  if return_code != 0 or not _are_credentials_valid():
+  if not _are_credentials_valid():
     xpk_print('Detected error. Retrying without --dns-endpoint flag...')
     return_code = _get_credentials(
         project=args.project,
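
Note: this hunk changes behavior, not just shape: a nonzero return code from the --dns-endpoint attempt now propagates immediately, and only credentials that were fetched successfully but fail validation trigger the retry without the flag. A reduced control-flow sketch, not xpk code (fetch and valid stand in for _get_credentials and _are_credentials_valid):

def old_flow(fetch, valid) -> int:
  rc = fetch(dns_endpoint=True)
  if rc != 0 or not valid():  # a failed fetch also triggered the retry
    rc = fetch(dns_endpoint=False)
  return rc

def new_flow(fetch, valid) -> int:
  rc = fetch(dns_endpoint=True)
  if rc != 0:  # a failed fetch now returns the error immediately
    return rc
  if not valid():  # only invalid credentials trigger the retry
    rc = fetch(dns_endpoint=False)
  return rc
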
xpk/core/config.py CHANGED
@@ -19,6 +19,7 @@ import os
 import ruamel.yaml
 from abc import ABC, abstractmethod
 from ..utils import file
+from ..utils.execution_context import is_dry_run
 from ..utils.console import xpk_print
 from setuptools_scm import get_version as setuptools_get_version
 from importlib.metadata import version, PackageNotFoundError
@@ -53,14 +54,6 @@ PROJECT_KEY = 'project-id'
 CLIENT_ID_KEY = 'client-id'
 SEND_TELEMETRY_KEY = 'send-telemetry'
 ZONE_KEY = 'zone'
-KJOB_BATCH_IMAGE = 'batch-image'
-KJOB_BATCH_WORKING_DIRECTORY = 'batch-working-directory'
-KJOB_SHELL_IMAGE = 'shell-image'
-KJOB_SHELL_INTERACTIVE_COMMAND = 'shell-interactive-command'
-KJOB_SHELL_WORKING_DIRECTORY = 'shell-working-directory'
-CONFIGS_KEY = 'configs'
-GKE_ENDPOINT_KEY = 'gke-endpoint'
-DEPENDENCIES_KEY = 'deps-verified-version'

 DEFAULT_KEYS = [
     CFG_BUCKET_KEY,
@@ -69,13 +62,6 @@ DEFAULT_KEYS = [
     CLIENT_ID_KEY,
     SEND_TELEMETRY_KEY,
     ZONE_KEY,
-    GKE_ENDPOINT_KEY,
-    DEPENDENCIES_KEY,
-    KJOB_BATCH_IMAGE,
-    KJOB_BATCH_WORKING_DIRECTORY,
-    KJOB_SHELL_IMAGE,
-    KJOB_SHELL_INTERACTIVE_COMMAND,
-    KJOB_SHELL_WORKING_DIRECTORY,
 ]
 VERTEX_TENSORBOARD_FEATURE_FLAG = XPK_CURRENT_VERSION >= '0.4.0'

@@ -111,8 +97,7 @@ class FileSystemConfig(Config):
     self._allowed_keys = DEFAULT_KEYS

   def _open_configs(self) -> dict | None:
-    dir_path = '/'.join(self._config.split('/')[:-1])
-    file.ensure_directory_exists(dir_path)
+    file.ensure_directory_exists(os.path.dirname(self._config))

     if not os.path.exists(self._config):
       return None
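
Note: the os.path.dirname rewrite matches the old split/join for typical config paths; the one divergence is a file directly under the root, where dirname returns '/' instead of ''. A quick comparison (the paths are illustrative):

import os

for p in ('/home/user/.config/xpk/config.yaml', 'config.yaml', '/config.yaml'):
  old = '/'.join(p.split('/')[:-1])
  new = os.path.dirname(p)
  print(f'{p!r}: old={old!r} new={new!r}')
# '/home/user/.config/xpk/config.yaml': old='/home/user/.config/xpk' new='/home/user/.config/xpk'
# 'config.yaml': old='' new=''
# '/config.yaml': old='' new='/'  <- the only divergence
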
@@ -122,6 +107,9 @@ class FileSystemConfig(Config):
     return config_yaml

   def _save_configs(self, config_yaml: dict) -> None:
+    if is_dry_run():
+      return None
+
     with open(self._config, encoding='utf-8', mode='w') as stream:
       yaml.dump(config_yaml, stream)

xpk/core/kueue_manager_test.py CHANGED
@@ -36,6 +36,7 @@ TPU_SYSTEM: SystemCharacteristics = SystemCharacteristics(
     device_type="v5p-8",
     supports_sub_slicing=False,
     supports_super_slicing=False,
+    supports_accelerator_network_profile=False,
     docker_platform=DockerPlatform.ARM,
 )

@@ -411,6 +412,7 @@ def test_configure_generates_correct_manifest_with_gke_default_topology(
         supports_sub_slicing=False,
         supports_super_slicing=False,
         docker_platform=DockerPlatform.ARM,
+        supports_accelerator_network_profile=True,
         gpu_config=GpuConfig(requires_topology=True),
     ),
 )
xpk/core/nodepool.py CHANGED
@@ -289,6 +289,12 @@ def run_gke_node_pool_create_command(
       f'{placement_args}'
       ' --enable-gvnic'
   )
+
+  if system.supports_accelerator_network_profile:
+    command += (
+        ' --accelerator-network-profile=auto'
+        ' --node-labels=cloud.google.com/gke-networking-dra-driver=true'
+    )
   if system.accelerator_type == AcceleratorType.TPU:
     command += f' --node-version={gke_node_pool_version}'
     if capacity_type == CapacityType.FLEX_START:
xpk/core/nodepool_test.py CHANGED
@@ -251,6 +251,7 @@ def test_placement_policy_created_for_gpu_with_valid_topology(
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -284,6 +285,7 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -320,6 +322,7 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
       requires_workload_policy=True,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )

@@ -354,6 +357,7 @@ def test_placement_policy_not_created_for_non7x_tpu(
       device_type="v6e-4",
       supports_sub_slicing=True,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
   )

xpk/core/scheduling.py CHANGED
@@ -18,7 +18,7 @@ from enum import Enum

 from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled, has_super_slicing_enabled
 from ..utils.feature_flags import FeatureFlags
-from ..utils.topology import get_slice_topology_level
+from ..utils.topology import get_slice_topology_level, parse_topology
 from ..utils.console import xpk_print
 from ..utils.topology import is_topology_valid
 from ..utils.execution_context import is_dry_run
@@ -34,6 +34,7 @@ from packaging.version import Version

 _SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
 _SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.14.0')
+_SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)


 class WorkloadScheduling(Enum):
@@ -115,7 +116,7 @@ def check_if_workload_can_schedule(
         args,
         workload_system,
         max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
-    ):
+    ) and _check_super_slicing_topology(workload_system):
       return WorkloadScheduling.SUPER_SLICING_AVAILABLE
     else:
       return WorkloadScheduling.UNAVAILABLE
@@ -189,7 +190,6 @@ def _check_super_slicing_availability(
     workload_system: SystemCharacteristics,
     cluster_system: SystemCharacteristics,
 ) -> bool:
-  # TODO: b/465447813 - Add super-slicing workload topology validation.
   if (
       (not FeatureFlags.SUPER_SLICING_ENABLED)
       or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
@@ -212,6 +212,27 @@
   )


+def _check_super_slicing_topology(
+    workload_system: SystemCharacteristics,
+) -> bool:
+  topology = parse_topology(workload_system.topology)
+  result = (
+      all(size % 4 == 0 and size >= 4 for size in topology)
+      and len(topology) == len(_SUPER_SLICING_MAX_TOPOLOGY)
+      and topology[0] <= topology[1] <= topology[2]
+      and all(a <= b for a, b in zip(topology, _SUPER_SLICING_MAX_TOPOLOGY))
+  )
+
+  if not result:
+    xpk_print(
+        'Error: Invalid super-slicing topology. It must adhere to the format of'
+        ' 4i x 4j x 4k, where i <= j <= k, and i, j, k are integers, with a'
+        ' maximum of 16x24x24.'
+    )
+
+  return result
+
+
 def get_total_chips_requested_from_args(
     args, system: SystemCharacteristics
 ) -> int:
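
Note: the new _check_super_slicing_topology encodes the rule spelled out in its error message: exactly three dimensions, each a positive multiple of 4, non-decreasing, and capped at 16x24x24. A standalone restatement with worked cases, for illustration only (parse_topology is assumed to turn 'AxBxC' into integers, as the usage implies):

MAX_TOPOLOGY = (16, 24, 24)

def topology_ok(topology_str: str) -> bool:
  dims = tuple(int(d) for d in topology_str.split('x'))
  return (
      len(dims) == 3
      and all(d % 4 == 0 and d >= 4 for d in dims)
      and dims[0] <= dims[1] <= dims[2]
      and all(d <= m for d, m in zip(dims, MAX_TOPOLOGY))
  )

assert topology_ok('4x4x8')       # multiples of 4, non-decreasing, in bounds
assert not topology_ok('2x2x1')   # not divisible by 4
assert not topology_ok('4x4x32')  # 32 exceeds the 24 cap on the last axis
assert not topology_ok('8x4x4')   # dimensions must be non-decreasing

The first two failing cases mirror the new test cases in scheduling_test.py below ('tpu7x-2x2x1' and 'tpu7x-4x4x32').
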
@@ -342,6 +363,10 @@ def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
   ]


+def create_tpu_slice_topology_annotation(workload_topology: str) -> str:
+  return f'cloud.google.com/gke-tpu-slice-topology: {workload_topology}'
+
+
 def create_placement_policy_label(
     system: SystemCharacteristics, super_slicing: bool
 ) -> str:
xpk/core/scheduling_test.py CHANGED
@@ -22,7 +22,7 @@ from pytest_mock import MockerFixture
 from xpk.core.capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
 from xpk.core.testing.commands_tester import CommandsTester
 from xpk.utils.feature_flags import FeatureFlags
-from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, get_placement_policy_name, is_placement_policy_supported
+from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, create_tpu_slice_topology_annotation, get_placement_policy_name, is_placement_policy_supported
 from .system_characteristics import SystemCharacteristics, AcceleratorType, DockerPlatform, get_system_characteristics_by_device_type


@@ -66,6 +66,7 @@ def test_create_placement_policy_label_returns_valid_label():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   label = create_placement_policy_label(
@@ -89,6 +90,7 @@ def test_get_placement_policy_name_returns_valid_name():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   name = get_placement_policy_name(system_characteristics, super_slicing=False)
@@ -107,6 +109,7 @@ def test_get_placement_policy_name_super_slicing_returns_valid_name():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   name = get_placement_policy_name(system_characteristics, super_slicing=True)
@@ -125,6 +128,7 @@ def test_is_placement_policy_supported_returns_true_for_system_characteristics_s
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is True
@@ -142,6 +146,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is False
@@ -159,6 +164,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is False
@@ -369,6 +375,28 @@ SUPER_SLICING_CASE = SchedulingTestCase(
         ),
         WorkloadScheduling.UNAVAILABLE,
     ),
+    (
+        'Super-slicing, but workload topology is not divisible by four',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die(
+                'tpu7x-2x2x1'
+            ),
+        ),
+        WorkloadScheduling.UNAVAILABLE,
+    ),
+    (
+        'Super-slicing, but workload topology is too big for super-slice',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die(
+                'tpu7x-4x4x32'
+            ),
+            # 10 cubes, to make sure vms fit:
+            resources_config_map={'tpu7x-128': str(64 // 4 * 10)},
+        ),
+        WorkloadScheduling.UNAVAILABLE,
+    ),
     (
         (
             'Super-slicing should be ignored when a given device is already'
@@ -426,3 +454,12 @@ def test_check_if_workload_can_schedule(
       )
       == expected
   )
+
+
+def test_create_tpu_slice_topology_annotation():
+  workload_system = _get_system_characteristics_or_die('tpu7x-4x4x8')
+
+  assert (
+      create_tpu_slice_topology_annotation(workload_system.topology)
+      == 'cloud.google.com/gke-tpu-slice-topology: 4x4x8'
+  )