xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. integration/README.md +19 -0
  2. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  3. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  4. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  5. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  6. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  7. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  8. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  9. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  10. xpk/blueprints/a4/storage_crd.yaml +52 -0
  11. xpk/commands/cluster.py +33 -12
  12. xpk/commands/cluster_gcluster_test.py +5 -1
  13. xpk/commands/cluster_test.py +125 -0
  14. xpk/commands/config.py +3 -3
  15. xpk/commands/inspector.py +5 -3
  16. xpk/commands/kind.py +2 -0
  17. xpk/commands/managed_ml_diagnostics.py +249 -0
  18. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  19. xpk/commands/workload.py +124 -139
  20. xpk/commands/workload_test.py +160 -118
  21. xpk/core/blueprint/blueprint_generator.py +3 -0
  22. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  23. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  24. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  25. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  26. xpk/core/capacity.py +2 -0
  27. xpk/core/cluster.py +18 -47
  28. xpk/core/cluster_test.py +76 -1
  29. xpk/core/config.py +81 -7
  30. xpk/core/config_test.py +67 -11
  31. xpk/core/docker_container.py +3 -1
  32. xpk/core/docker_image.py +10 -6
  33. xpk/core/docker_resources.py +1 -10
  34. xpk/core/kjob.py +17 -16
  35. xpk/core/kueue_manager.py +13 -19
  36. xpk/core/kueue_manager_test.py +27 -1
  37. xpk/core/nap.py +13 -14
  38. xpk/core/nodepool.py +17 -15
  39. xpk/core/nodepool_test.py +25 -4
  40. xpk/core/pathways.py +23 -0
  41. xpk/core/pathways_test.py +57 -0
  42. xpk/core/resources.py +84 -27
  43. xpk/core/scheduling.py +128 -132
  44. xpk/core/scheduling_test.py +215 -2
  45. xpk/core/system_characteristics.py +179 -0
  46. xpk/core/system_characteristics_test.py +49 -1
  47. xpk/core/telemetry.py +4 -4
  48. xpk/core/telemetry_test.py +9 -9
  49. xpk/core/vertex.py +4 -3
  50. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  51. xpk/main.py +2 -0
  52. xpk/parser/cluster.py +22 -88
  53. xpk/parser/cluster_test.py +41 -0
  54. xpk/parser/common.py +84 -0
  55. xpk/parser/storage.py +10 -0
  56. xpk/parser/storage_test.py +47 -0
  57. xpk/parser/workload.py +14 -41
  58. xpk/parser/workload_test.py +2 -48
  59. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  60. xpk/utils/feature_flags.py +3 -0
  61. xpk/utils/validation.py +2 -2
  62. xpk-0.16.0.dist-info/METADATA +127 -0
  63. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
  64. xpk-0.15.0.dist-info/METADATA +0 -1666
  65. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/core/pathways_test.py ADDED
@@ -0,0 +1,57 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import pytest
+from unittest.mock import MagicMock
+from xpk.core.testing.commands_tester import CommandsTester
+from .pathways import get_pathways_machine_types
+
+
+@pytest.fixture(autouse=True)
+def commands_tester(mocker: MagicMock):
+  return CommandsTester(
+      mocker,
+      run_command_with_updates_path=(
+          "xpk.core.pathways.run_command_with_updates"
+      ),
+      run_command_for_value_path="xpk.core.pathways.run_command_for_value",
+  )
+
+
+def test_get_pathways_machine_types_when_command_fails_returns_failed_exit_code(
+    commands_tester: CommandsTester,
+):
+  commands_tester.set_result_for_command(
+      (1, ""), "gcloud compute machine-types list"
+  )
+  return_code, machine_types = get_pathways_machine_types(
+      project="gke-project", zone="us-central1-a"
+  )
+  assert return_code == 1
+  assert machine_types == []
+
+
+def test_get_pathways_machine_types_when_command_suceeds_returns_machine_types(
+    commands_tester: CommandsTester,
+):
+  commands_tester.set_result_for_command(
+      (0, "abc\ncba"), "gcloud compute machine-types list"
+  )
+  return_code, machine_types = get_pathways_machine_types(
+      project="gke-project", zone="us-central1-a"
+  )
+  assert return_code == 0
+  assert machine_types == ["abc", "cba"]
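The new test module exercises `get_pathways_machine_types` from `xpk/core/pathways.py` (that file's +23 lines are not shown in this excerpt). Going only by the assertions above, the helper wraps `gcloud compute machine-types list` and returns the command's exit code together with one machine type per output line. A hypothetical caller, sketched from those assertions:

```python
# Hypothetical usage inferred from the assertions above; the helper's actual body
# lives in xpk/core/pathways.py, which is not included in this excerpt.
from xpk.core.pathways import get_pathways_machine_types

return_code, machine_types = get_pathways_machine_types(
    project='gke-project', zone='us-central1-a'
)
if return_code != 0:
  # The helper appears to surface the gcloud exit code rather than raising.
  raise SystemExit(return_code)
# One entry per line of `gcloud compute machine-types list` output.
print(machine_types)
```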
xpk/core/resources.py CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
 """
 
 from dataclasses import dataclass
+import os
 
 from ..utils.console import xpk_print
 from ..utils.file import write_tmp_file
@@ -30,9 +31,13 @@ from .capacity import (
 from .commands import run_command_for_value, run_commands
 from .config import XPK_CURRENT_VERSION
 from .system_characteristics import AcceleratorType, get_system_characteristics_by_device_type, SystemCharacteristics
+from enum import Enum
+
+
+class ConfigMapType(Enum):
+  RESOURCES = 'resources-configmap'
+  METADATA = 'metadata-configmap'
 
-CLUSTER_RESOURCES_CONFIGMAP = 'resources-configmap'
-CLUSTER_METADATA_CONFIGMAP = 'metadata-configmap'
 
 CLUSTER_CONFIGMAP_YAML = """kind: ConfigMap
 apiVersion: v1
@@ -50,7 +55,15 @@ class AutoprovisioningConfig:
   maximum_chips: int
 
 
-def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
+def get_config_map_name(
+    cluster_name: str, config_map_type: ConfigMapType
+) -> str:
+  return f'{cluster_name}-{config_map_type.value}'
+
+
+def get_cluster_configmap(
+    cluster_name: str, config_map_type: ConfigMapType
+) -> dict[str, str] | None:
   """Run the Get GKE Cluster ConfigMap request.
 
   Args:
@@ -59,15 +72,17 @@ def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
   Returns:
     key:value pairs stored in cluster ConfigMap.
   """
+  config_map_name = get_config_map_name(cluster_name, config_map_type)
   command = (
       'kubectl get configmap'
-      f' {configmap_name} -o=custom-columns="ConfigData:data" --no-headers=true'
+      f' {config_map_name} -o=custom-columns="ConfigData:data"'
+      ' --no-headers=true'
   )
 
   return_code, return_value = run_command_for_value(
      command,
      'GKE Cluster Get ConfigMap',
-      dry_run_return_val='map[]',
+      dry_run_return_val=_get_dry_run_config_map_value(config_map_type),
   )
   if return_code != 0:
     xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
@@ -89,9 +104,18 @@ def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
   return config_map
 
 
+def _get_dry_run_config_map_value(config_map_type: ConfigMapType) -> str:
+  default_value = 'map[]'
+
+  if config_map_type == ConfigMapType.RESOURCES:
+    return os.getenv('DRY_RUN_RESOURCES_CONFIG_MAP', default_value)
+
+  return default_value
+
+
 def create_cluster_configmaps(
     args,
-    system,
+    system: SystemCharacteristics,
     tensorboard_config: dict,
     autoprovisioning_config: AutoprovisioningConfig | None,
 ) -> int:
@@ -127,9 +151,11 @@
   resources_data = (
       f'{device_type}: "{int(args.num_slices) * system.vms_per_slice}"'
   )
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
+  resources_configmap_name = get_config_map_name(
+      args.cluster, ConfigMapType.RESOURCES
+  )
   resources_yml = CLUSTER_CONFIGMAP_YAML.format(
-      args=args, name=resources_configmap_name, data=resources_data
+      name=resources_configmap_name, data=resources_data
   )
   configmap_yml[resources_configmap_name] = resources_yml
 
@@ -148,15 +174,17 @@
   # Reservation ID if applicable.
   if capacity_type == CapacityType.RESERVATION:
     metadata += f'\n {RESERVATION_CONFIG_KEY}: {args.reservation}'
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
+  metadata_configmap_name = get_config_map_name(
+      args.cluster, ConfigMapType.METADATA
+  )
   metadata_yml = CLUSTER_CONFIGMAP_YAML.format(
-      args=args, name=metadata_configmap_name, data=metadata
+      name=metadata_configmap_name, data=metadata
   )
   configmap_yml[metadata_configmap_name] = metadata_yml
-  return create_or_update_cluster_configmap(configmap_yml)
+  return _create_or_update_cluster_configmap(configmap_yml)
 
 
-def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
+def _create_or_update_cluster_configmap(configmap_yml: dict[str, str]) -> int:
   """
   Args:
     configmap_yml: dict containing ConfigMap name and yml string.
@@ -187,7 +215,18 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
   return 0
 
 
-def check_cluster_resources(args, system) -> tuple[bool, bool]:
+def update_cluster_configmap(
+    cluster_name: str, config_map_type: ConfigMapType, data: str
+) -> int:
+  config_map_name = get_config_map_name(cluster_name, config_map_type)
+  yaml = CLUSTER_CONFIGMAP_YAML.format(name=config_map_name, data=data)
+  config_map_dict = {config_map_name: yaml}
+  return _create_or_update_cluster_configmap(config_map_dict)
+
+
+def check_cluster_resources(
+    args, system: SystemCharacteristics
+) -> tuple[bool, bool]:
   """Check if cluster has resources of a specified device_type/gke_accelerator.
   This check will be skipped if <args.cluster>-<_CLUSTER_RESOURCES_CONFIGMAP> ConfigMap doesn't exist for the cluster.
 
@@ -200,8 +239,9 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
     True if resources in the cluster should be checked, False otherwise.
     True if device_type/gke_accelerator exists in the cluster, False otherwise.
   """
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  resources_config_map = get_cluster_configmap(resources_configmap_name)
+  resources_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
   if resources_config_map is None:
     xpk_print(
         f'No ConfigMap exist for cluster with the name {resources_config_map}.'
@@ -216,20 +256,35 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
 
 
 def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
-  """Get systemCharcteristics based on the cluster resources configMap
+  """Get SystemCharcteristics based on the cluster resources configMap.
+
   Args:
     args: user provided arguments for running the command.
 
   Returns:
-    returns system characteristics
+    returns system characteristics, or None if not found.
+  """
+  resources_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
+  return get_cluster_system_characteristics_from_config_map(
+      resources_config_map
+  )
+
+
+def get_cluster_system_characteristics_from_config_map(
+    resources_config_map: dict[str, str] | None,
+) -> SystemCharacteristics | None:
+  """Get SystemCharcteristics based on the cluster resources configMap.
+
+  Returns:
+    returns system characteristics, or None if not found.
   """
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(resources_configmap_name)
 
-  if cluster_config_map is None:
+  if resources_config_map is None:
     return None
 
-  for key in cluster_config_map:
+  for key in resources_config_map:
     system, result_code = get_system_characteristics_by_device_type(key)
     if result_code == 0:
       return system
@@ -238,20 +293,22 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
 
 
 def get_cluster_capacity_type(args) -> CapacityType | None:
-  """Get systemCharcteristics based on the cluster resources configMap
+  """Get CapacityType based on the cluster metadata configMap.
+
   Args:
     args: user provided arguments for running the command.
 
   Returns:
-    returns system characteristics
+    returns CapacityType, or None if not found.
   """
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
+  metadata_configmap_name = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
 
-  if cluster_config_map is None:
+  if metadata_configmap_name is None:
     return None
 
-  capacityValue = cluster_config_map.get('capacity_type')
+  capacityValue = metadata_configmap_name.get('capacity_type')
   if capacityValue is not None:
     return CapacityType[capacityValue.upper()]
 
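In `resources.py`, the `CLUSTER_RESOURCES_CONFIGMAP` and `CLUSTER_METADATA_CONFIGMAP` string constants are replaced by a `ConfigMapType` enum, ConfigMap names are derived in one place by `get_config_map_name`, `create_or_update_cluster_configmap` becomes private, and a public `update_cluster_configmap` helper is added. A minimal sketch of how the new pieces compose (the cluster name and metadata payload below are invented example values):

```python
# Minimal usage sketch of the refactored helpers; the cluster name and the
# metadata payload are invented example values, not taken from this diff.
from xpk.core.resources import (
    ConfigMapType,
    get_config_map_name,
    get_cluster_configmap,
    update_cluster_configmap,
)

# Names are now derived in one place as f'{cluster_name}-{config_map_type.value}'.
assert (
    get_config_map_name('demo-cluster', ConfigMapType.RESOURCES)
    == 'demo-cluster-resources-configmap'
)

# Reads take the cluster name and ConfigMap type instead of a preformatted name.
resources = get_cluster_configmap('demo-cluster', ConfigMapType.RESOURCES)

# New public helper: rewrite a single ConfigMap from a data string.
update_cluster_configmap('demo-cluster', ConfigMapType.METADATA, 'capacity_type: spot')
```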
xpk/core/scheduling.py CHANGED
@@ -14,61 +14,63 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from enum import Enum
+
+from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled
+from ..utils.feature_flags import FeatureFlags
 from ..utils.topology import get_slice_topology_level
 from ..utils.console import xpk_print
 from ..utils.topology import is_topology_valid
 from ..utils.execution_context import is_dry_run
 from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
-from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
 from .system_characteristics import (
+    SUB_SLICING_TOPOLOGIES,
     AcceleratorType,
-    AcceleratorTypeToAcceleratorCharacteristics,
     SystemCharacteristics,
+    create_accelerator_label,
+    create_machine_label,
 )
+from packaging.version import Version
 
+_SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
 
-def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
-  """Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster).
 
-  Args:
-    args: user provided arguments for running the command.
-    system: system characteristics
+class WorkloadScheduling(Enum):
+  UNAVAILABLE = 0
+  AVAILABLE = 1
+  SUB_SLICING_AVAILABLE = 2
+
+
+def check_if_workload_can_schedule(
+    args,
+    workload_system: SystemCharacteristics,
+    cluster_system: SystemCharacteristics | None,
+    resources_config_map: dict[str, str] | None,
+) -> WorkloadScheduling:
+  """Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster).
 
   Returns:
-    returns true if workload can schedule, otherwise returns false.
+    returns WorkloadScheduling describing scheduling option.
   """
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(resources_configmap_name)
+  if is_dry_run() and not cluster_system:
+    xpk_print('Skipping workload scheduling validation in dry run.')
+    return WorkloadScheduling.AVAILABLE
 
-  # Prevents workload creation failure for existing clusters with no ConfigMap
-  if cluster_config_map is None:
+  if resources_config_map is None:
     xpk_print(
-        'No ConfigMap exist for cluster with the name'
-        f' {resources_configmap_name}.'
+        "Skipping workload scheduling validation, because there's no Resources"
+        ' ConfigMap in the cluster.'
     )
-    return True
-
-  if is_dry_run():
-    return True
+    return WorkloadScheduling.AVAILABLE
 
-  # Check for gke accelerator type:
-  missing_gke_accelerator_type = False
-  if not cluster_config_map.get(system.gke_accelerator):
-    xpk_print(
-        f'GKE Accelerator Type Check: {args.workload} is requesting'
-        f' {system.gke_accelerator} but cluster only contains'
-        f' {cluster_config_map.keys()}. '
-    )
-    missing_gke_accelerator_type = True
-  elif (
-      cluster_config_map[system.gke_accelerator]
-      == AUTOPROVISIONING_CONFIG_VALUE
-  ):
+  if _is_cluster_set_up_for_nap(workload_system, resources_config_map):
    # Run total chip check when in autoprovisioning mode.
    max_chips_in_cluster = int(
-        cluster_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY]
+        resources_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY]
+    )
+    num_chips_in_workload = get_total_chips_requested_from_args(
+        args, workload_system
    )
-    num_chips_in_workload = get_total_chips_requested_from_args(args, system)
 
    if num_chips_in_workload > max_chips_in_cluster:
      xpk_print(
@@ -77,44 +79,100 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
          ' Resize the cluster to support more chips with'
          ' `xpk cluster create --autoprovisioning-max-chips=X ...`'
      )
-      return False
-  return True
+      return WorkloadScheduling.UNAVAILABLE
+    return WorkloadScheduling.AVAILABLE
+
+  if workload_system.device_type in resources_config_map:
+    if _check_workload_size_fits(
+        args,
+        workload_system,
+        max_vm_in_cluster=int(
+            resources_config_map[workload_system.device_type]
+        ),
+    ):
+      return WorkloadScheduling.AVAILABLE
+    else:
+      return WorkloadScheduling.UNAVAILABLE
+
+  if _check_sub_slicing_availability(
+      workload_system=workload_system, cluster_system=cluster_system
+  ):
+    assert cluster_system
+    if _check_workload_size_fits(
+        args,
+        workload_system,
+        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
+    ):
+      return WorkloadScheduling.SUB_SLICING_AVAILABLE
+    else:
+      return WorkloadScheduling.UNAVAILABLE
+
+  xpk_print(
+      'Workload scheduling validation failed. XPK will not create the workload'
+      f' {args.workload}.'
+  )
+  return WorkloadScheduling.UNAVAILABLE
+
+
+def _is_cluster_set_up_for_nap(
+    workload_system: SystemCharacteristics, resources_config_map: dict[str, str]
+) -> bool:
+  return (
+      resources_config_map.get(workload_system.gke_accelerator, None)
+      == AUTOPROVISIONING_CONFIG_VALUE
+  )
 
-  # Check for device type
-  missing_device_type = False
-  device_type = system.device_type
-  if device_type not in cluster_config_map:
-    xpk_print(
-        f'Device Type Check: {args.workload} is requesting {device_type} but '
-        f'cluster only contains {cluster_config_map.keys()}. '
-    )
-    missing_device_type = True
 
-  if missing_device_type and missing_gke_accelerator_type:
+def _check_workload_size_fits(
+    args,
+    workload_system: SystemCharacteristics,
+    max_vm_in_cluster: int,
+) -> bool:
+  if workload_system.accelerator_type == AcceleratorType.GPU:
+    vm_required_by_workload = args.num_nodes
+  else:
+    vm_required_by_workload = args.num_slices * workload_system.vms_per_slice
+
+  if vm_required_by_workload > max_vm_in_cluster:
    xpk_print(
-        'Both Device Type and GKE Accelerator Type checks failed.'
-        f' XPK will not create the workload {args.workload}.'
+        f'{args.workload} is requesting {args.num_slices} slice/slices of'
+        f' {workload_system.device_type}, which is'
+        f' {vm_required_by_workload} VMs, but the cluster only contains'
+        f' {max_vm_in_cluster} VMs of {workload_system.device_type}. XPK will'
+        ' not create this workload.'
    )
    return False
-  else:
-    # Check if the size of the workload will fit in the cluster.
-    max_vm_in_cluster = int(cluster_config_map[device_type])
-    if system.accelerator_type == AcceleratorType.GPU:
-      vm_required_by_workload = args.num_nodes
-    else:
-      vm_required_by_workload = args.num_slices * system.vms_per_slice
-    if vm_required_by_workload > max_vm_in_cluster:
-      xpk_print(
-          f'{args.workload} is requesting {args.num_slices} slice/slices of'
-          f' {device_type}, which is {vm_required_by_workload} VMs, but the'
-          f' cluster only contains {max_vm_in_cluster} VMs of {device_type}.'
-          ' XPK will not create this workload.'
-      )
-      return False
-
   return True
 
 
+def _check_sub_slicing_availability(
+    workload_system: SystemCharacteristics,
+    cluster_system: SystemCharacteristics | None,
+) -> bool:
+  if (
+      (not FeatureFlags.SUB_SLICING_ENABLED)
+      or (not cluster_system)
+      or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
+      or (not cluster_system.supports_sub_slicing)
+      or (workload_system.topology not in SUB_SLICING_TOPOLOGIES)
+  ):
+    return False
+
+  return_code, sub_slicing_enabled = has_sub_slicing_enabled()
+  if return_code != 0 or not sub_slicing_enabled:
+    return False
+
+  return_code, current_version = get_installed_kueue_version(
+      dry_run_version=Version('0.13')
+  )
+
+  return (
+      return_code == 0
+      and current_version is not None
+      and current_version >= _SUB_SLICING_MINIMUM_KUEUE_VERSION
+  )
+
+
 def get_total_chips_requested_from_args(
     args, system: SystemCharacteristics
 ) -> int:
@@ -135,7 +193,7 @@ def get_total_chips_requested_from_args(
   return int(num_chips)
 
 
-def get_cpu_affinity(accelerator_type) -> str:
+def get_cpu_affinity(accelerator_type: AcceleratorType) -> str:
   """Generate affinity rules for CPU nodepools, so that workload pods are
   not scheduled on the default pool machines.
   Args:
@@ -199,10 +257,8 @@ def get_gpu_scheduler(
   """
   gpu_scheduler = gpu_scheduler_yaml.format(
       scheduler_name=args.scheduler,
-      accelerator_label=create_accelerator_label(
-          system.accelerator_type, system
-      ),
-      machine_label=create_machine_label(system.accelerator_type, system),
+      accelerator_label=create_accelerator_label(system),
+      machine_label=create_machine_label(system),
       node_pool_name=f'{args.cluster}-np-0',
       autoprovisioning_args=autoprovisioning_args,
   )
@@ -217,74 +273,14 @@
   return gpu_scheduler, return_code
 
 
-def create_accelerator_label(accelerator_type, system) -> str:
-  """Generates accelerator label.
-
-  Args:
-    accelerator_type: type of accelerator.
-    system: system characteristics.
-
-  Returns:
-    The accelerator label.
-  """
-  if accelerator_type == AcceleratorType.CPU:
-    return ''
-  return (
-      f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].accelerator_label}:'
-      f' {system.gke_accelerator}'
-  )
-
-
-def create_tpu_machine_type(accelerator_type, system) -> str:
-  """Generates TPU machine type..
-
-  Args:
-    accelerator_type: type of accelerator.
-    system: system characteristics.
-
-  Returns:
-    The accelerator label.
-  """
-  if accelerator_type == AcceleratorType.TPU:
+def create_tpu_machine_type(system: SystemCharacteristics) -> str:
+  if system.accelerator_type == AcceleratorType.TPU:
    return f'{system.gce_machine_type}'
   return ''
 
 
-def create_machine_label(
-    accelerator_type, system, autoprovisioning_enabled: bool = False
-) -> str:
-  """Generates machine label.
-
-  Args:
-    accelerator_type: type of accelerator.
-    system: system characteristics.
-    autoprovisioning_enabled: describes autoprovisioning enablement.
-
-  Returns:
-    The machine label.
-  """
-  if accelerator_type == AcceleratorType.TPU and not autoprovisioning_enabled:
-    return (
-        f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].machine_label}:'
-        f' {system.topology}'
-    )
-  return ''
-
-
-def create_tpu_topology(
-    accelerator_type, system, autoprovisioning_enabled: bool = False
-) -> str:
-  """Generates TPU topology.
-
-  Args:
-    accelerator_type: type of accelerator.
-    system: system characteristics.
-    autoprovisioning_enabled: describes autoprovisioning enablement.
-
-  Returns:
-    The machine label.
-  """
-  if accelerator_type == AcceleratorType.TPU and not autoprovisioning_enabled:
+def create_tpu_topology(system: SystemCharacteristics) -> str:
+  if system.accelerator_type == AcceleratorType.TPU:
    return f'{system.topology}'
   return ''
 
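In `scheduling.py`, `check_if_workload_can_schedule` now receives the workload and cluster `SystemCharacteristics` plus the resources ConfigMap explicitly, and returns a `WorkloadScheduling` enum instead of a bool; `SUB_SLICING_AVAILABLE` indicates the workload only fits via sub-slicing (feature flag on, Kueue >= 0.13.0). How callers are rewired lives in other files of this release (for example `xpk/commands/workload.py`, not shown here); the following is only a plausible sketch of the new call pattern, reusing the `resources.py` helpers added above:

```python
# Plausible caller sketch, not taken from this diff. `args` stands for the parsed
# CLI namespace xpk passes around; the device-type lookup is illustrative.
from xpk.core.resources import (
    ConfigMapType,
    get_cluster_configmap,
    get_cluster_system_characteristics_from_config_map,
)
from xpk.core.scheduling import WorkloadScheduling, check_if_workload_can_schedule
from xpk.core.system_characteristics import get_system_characteristics_by_device_type

# Resources ConfigMap and cluster SystemCharacteristics come from the new
# resources.py helpers shown earlier in this diff.
resources_config_map = get_cluster_configmap(args.cluster, ConfigMapType.RESOURCES)
cluster_system = get_cluster_system_characteristics_from_config_map(resources_config_map)
workload_system, _ = get_system_characteristics_by_device_type(args.device_type)

scheduling = check_if_workload_can_schedule(
    args, workload_system, cluster_system, resources_config_map
)
if scheduling == WorkloadScheduling.UNAVAILABLE:
  raise SystemExit(1)  # abort, as callers previously did on a False return
use_sub_slicing = scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
```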