xpk-0.14.2-py3-none-any.whl → xpk-0.14.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +57 -22
- xpk/commands/cluster_gcluster_test.py +2 -2
- xpk/commands/cluster_test.py +197 -25
- xpk/commands/inspector.py +20 -7
- xpk/commands/kind.py +1 -1
- xpk/commands/workload.py +42 -4
- xpk/commands/workload_test.py +88 -5
- xpk/core/blueprint/blueprint_definitions.py +16 -1
- xpk/core/blueprint/blueprint_generator.py +11 -11
- xpk/core/capacity.py +17 -0
- xpk/core/capacity_test.py +50 -0
- xpk/core/config.py +1 -1
- xpk/core/docker_container.py +4 -4
- xpk/core/docker_resources.py +11 -11
- xpk/core/kjob.py +3 -5
- xpk/core/kueue_manager.py +21 -10
- xpk/core/kueue_manager_test.py +379 -536
- xpk/core/nap.py +1 -1
- xpk/core/nodepool.py +9 -9
- xpk/core/nodepool_test.py +4 -4
- xpk/core/pathways.py +1 -1
- xpk/core/resources.py +1 -1
- xpk/core/scheduling.py +7 -13
- xpk/core/system_characteristics.py +42 -35
- xpk/core/system_characteristics_test.py +3 -3
- xpk/core/testing/__init__.py +15 -0
- xpk/core/testing/commands_tester.py +131 -0
- xpk/core/testing/commands_tester_test.py +129 -0
- xpk/core/updates.py +57 -0
- xpk/core/updates_test.py +80 -0
- xpk/main.py +7 -4
- xpk/parser/common.py +8 -0
- xpk/utils/execution_context.py +20 -2
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/METADATA +1 -3
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/RECORD +39 -33
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/WHEEL +0 -0
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.2.dist-info → xpk-0.14.3.dist-info}/top_level.txt +0 -0
xpk/commands/workload_test.py
CHANGED
@@ -17,8 +17,9 @@ limitations under the License.
 import dataclasses
 from unittest.mock import MagicMock, patch
 import pytest
-from ..core.system_characteristics import SystemCharacteristics
-from .workload import _validate_sub_slicing_topology
+from ..core.system_characteristics import SystemCharacteristics, AcceleratorType
+from .workload import _validate_sub_slicing_topology, _validate_sub_slicing_availability
+from packaging.version import Version


 SYSTEM_CHARACTERISTICS = SystemCharacteristics(
@@ -27,7 +28,7 @@ SYSTEM_CHARACTERISTICS = SystemCharacteristics(
     gke_accelerator='nvidia-l4',
     gce_machine_type='g2-standard-12',
     chips_per_vm=1,
-    accelerator_type=
+    accelerator_type=AcceleratorType.TPU,
     device_type='l4-1',
     supports_sub_slicing=True,
     requires_workload_policy=False,
@@ -40,7 +41,7 @@ def xpk_print(mocker):


 def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
-    xpk_print,
+    xpk_print: MagicMock,
 ):
   with pytest.raises(SystemExit):
     _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '2x1')
@@ -50,7 +51,9 @@ def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
   )


-def test_validate_sub_slicing_topology_exits_for_too_large_topology(
+def test_validate_sub_slicing_topology_exits_for_too_large_topology(
+    xpk_print: MagicMock,
+):
   with pytest.raises(SystemExit):
     _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '16x16')

@@ -64,6 +67,86 @@ def test_validate_sub_slicing_topology_does_nothing_for_supported_topology():
   _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4')


+def test_validate_sub_slicing_availability_exits_when_getting_topologies_fails(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(1, None),
+  )
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_availability()
+
+  assert (
+      'Unable to validate sub-slicing support'
+      in xpk_print.mock_calls[0].args[0]
+  )
+
+
+def test_validate_sub_slicing_availability_exits_when_subslicing_topology_is_not_defined(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(0, False),
+  )
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_availability()
+
+  assert (
+      'Cluster has not been not set up for Sub-slicing.'
+      in xpk_print.mock_calls[0].args[0]
+  )
+
+
+def test_validate_sub_slicing_availability_exits_when_kueue_version_cannot_be_determined(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(0, True),
+  )
+  mocker.patch(
+      'xpk.commands.workload.KueueManager.get_installed_kueue_version',
+      return_value=(1, None),
+  )
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_availability()
+
+  assert 'Unable to validate sub-slicing' in xpk_print.mock_calls[0].args[0]
+
+
+def test_validate_sub_slicing_availability_exits_when_kueue_version_does_not_meet_minimum_requirements(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(0, True),
+  )
+  mocker.patch(
+      'xpk.commands.workload.KueueManager.get_installed_kueue_version',
+      return_value=(0, Version('0.0.0')),
+  )
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_availability()
+
+  assert 'The minimal required version is' in xpk_print.mock_calls[0].args[0]
+
+
+def test_validate_sub_slicing_availability_does_nothing_when_cluster_is_correctly_configured_for_subslicing(
+    mocker,
+):
+  mocker.patch(
+      'xpk.commands.workload.has_sub_slicing_enabled',
+      return_value=(0, True),
+  )
+  mocker.patch(
+      'xpk.commands.workload.KueueManager.get_installed_kueue_version',
+      return_value=(0, Version('0.13.0')),
+  )
+  _validate_sub_slicing_availability()
+
+
 @patch('xpk.commands.common.xpk_print')
 def test_validate_sub_slicing_topology_fails_for_unsupported_system(
     common_xpk_print: MagicMock,
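The new tests pin down the behavior of _validate_sub_slicing_availability without showing its body. The following is a minimal sketch inferred from the mocked call sites and asserted messages; it is not the actual implementation in xpk/commands/workload.py, and the constant name below is assumed (the asserted message strings, including the "has not been not" double negative, are copied verbatim from the tests):

from packaging.version import Version

# These names exist per the mock targets above; the exact import paths
# inside workload.py are assumed.
from ..core.kueue_manager import KueueManager, has_sub_slicing_enabled
from ..utils.console import xpk_print, xpk_exit

SUB_SLICING_MIN_KUEUE_VERSION = Version('0.13.0')  # assumed constant name


def _validate_sub_slicing_availability() -> None:
  # Exit when the topology lookup itself fails.
  return_code, enabled = has_sub_slicing_enabled()
  if return_code != 0:
    xpk_print('Unable to validate sub-slicing support.')
    xpk_exit(return_code)
  # Exit when no sub-slicing topology is defined on the cluster.
  if not enabled:
    xpk_print('Cluster has not been not set up for Sub-slicing.')
    xpk_exit(1)
  # Exit when the installed Kueue version cannot be determined.
  return_code, version = KueueManager().get_installed_kueue_version()
  if return_code != 0 or version is None:
    xpk_print('Unable to validate sub-slicing support.')
    xpk_exit(1)
  # Exit when Kueue predates the first sub-slicing-capable release.
  if version < SUB_SLICING_MIN_KUEUE_VERSION:
    xpk_print(
        'Installed Kueue is too old.'
        f' The minimal required version is {SUB_SLICING_MIN_KUEUE_VERSION}.'
    )
    xpk_exit(1)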
xpk/core/blueprint/blueprint_definitions.py
CHANGED

@@ -36,6 +36,21 @@ class DeploymentModule:
   settings: Optional[dict[str, Any]] = None
   use: Optional[list[str]] = None

+  def update_settings(self, additionalSettings: dict[str, Any]) -> None:
+    if self.settings is None:
+      self.settings = dict()
+    self.settings.update(additionalSettings)
+
+  def set_setting(self, key: str, value: Any) -> None:
+    if self.settings is None:
+      self.settings = dict()
+    self.settings[key] = value
+
+  def append_use(self, use: str) -> None:
+    if self.use is None:
+      self.use = list()
+    self.use.append(use)
+

 @dataclass
 class DeploymentGroup:
@@ -59,6 +74,6 @@ class Blueprint:
   blueprint_name: Optional[str]
   toolkit_modules_url: str
   toolkit_modules_version: str
-  vars: dict[str, str | list[str]] | None
+  vars: dict[str, str | list[str] | dict[str, str]] | None
   terraform_providers: Optional[dict[str, Any]] = None
   validators: Optional[list[Any]] = None
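The three helpers exist because settings and use default to None, so callers previously had to guard against that before mutating. A standalone illustration of the lazy-initialization behavior; the id and source constructor fields are assumptions based on how modules are built elsewhere in this diff, not taken from the dataclass definition:

# Illustration only; DeploymentModule fields other than `settings` and
# `use` (here `id` and `source`) are assumed.
from xpk.core.blueprint.blueprint_definitions import DeploymentModule

pool = DeploymentModule(id='gpu-pool', source='modules/compute/gke-node-pool')

pool.update_settings({'static_node_count': 4})  # settings was None; created lazily
pool.set_setting('placement_policy', 'compact-policy')
pool.append_use('group-placement-0')            # use was None; created lazily

assert pool.settings == {
    'static_node_count': 4,
    'placement_policy': 'compact-policy',
}
assert pool.use == ['group-placement-0']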
xpk/core/blueprint/blueprint_generator.py
CHANGED

@@ -211,9 +211,9 @@ class BlueprintGenerator:
         outputs=["instructions"],
     )
     if capacity_type == CapacityType.FLEX_START:
-      a3_megagpu_pool_0.
+      a3_megagpu_pool_0.update_settings(self.get_dws_flex_start())
     else:
-      a3_megagpu_pool_0.
+      a3_megagpu_pool_0.update_settings({"static_node_count": num_nodes})

     set_placement_policy = capacity_type != CapacityType.SPOT
     workload = DeploymentModule(
@@ -252,8 +252,8 @@ class BlueprintGenerator:

     print(reservation_placement_policy)
     if reservation_placement_policy is not None:
-      a3_megagpu_pool_0.
-          reservation_placement_policy
+      a3_megagpu_pool_0.set_setting(
+          "placement_policy", reservation_placement_policy
       )

     primary_group = DeploymentGroup(
@@ -268,7 +268,7 @@ class BlueprintGenerator:
         ],
     )
     if set_placement_policy and reservation_placement_policy is None:
-      a3_megagpu_pool_0.
+      a3_megagpu_pool_0.append_use(group_placement_0.id)
       primary_group.modules.append(group_placement_0)
     a3_mega_blueprint = Blueprint(
         terraform_backend_defaults=self._getblock_terraform_backend(
@@ -580,9 +580,9 @@ class BlueprintGenerator:
         outputs=["instructions"],
     )
     if capacity_type == CapacityType.FLEX_START:
-      gpu_pool.
+      gpu_pool.update_settings(self.get_dws_flex_start())
     else:
-      gpu_pool.
+      gpu_pool.update_settings({"static_node_count": num_nodes})

     workload_manager_install_id = "workload-manager-install"
     workload_manager_install = DeploymentModule(
@@ -855,9 +855,9 @@ class BlueprintGenerator:
         outputs=["instructions"],
     )
     if capacity_type == CapacityType.FLEX_START:
-      gpu_pool.
+      gpu_pool.update_settings(self.get_dws_flex_start())
     else:
-      gpu_pool.
+      gpu_pool.update_settings({"static_node_count": num_nodes})

     workload_manager_install_id = "workload-manager-install"
     workload_manager_install = DeploymentModule(
@@ -956,7 +956,7 @@ class BlueprintGenerator:
     )

   def _getblock_terraform_backend(
-      self, gcs_bucket: str, cluster_name: str, prefix: str = ""
+      self, gcs_bucket: str | None, cluster_name: str, prefix: str = ""
   ) -> dict | None:
     if gcs_bucket is None:
       return None
@@ -986,7 +986,7 @@ class BlueprintGenerator:
       yaml_parser.dump(xpk_blueprint, blueprint_file)
     return blueprint_path

-  def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
+  def _get_blueprint_path(self, blueprint_name, prefix: str = "") -> str:
     blueprint_path = os.path.join(
         self._get_storage_path(prefix), f"{blueprint_name}.yaml"
     )
xpk/core/capacity.py
CHANGED
@@ -152,6 +152,23 @@ def get_reservation_placement_policy(
   return output.strip()


+def get_reservation_deployment_type(
+    reservation: str, zone: str, project: str
+) -> str:
+  """Get reservation deployment type."""
+  command = (
+      f'gcloud beta compute reservations describe {reservation}'
+      f' --project={project} --zone={zone} --format="value(deploymentType)"'
+  )
+  return_code, output = run_command_for_value(
+      command, 'Get reservation deployment type', dry_run_return_val='DENSE'
+  )
+  if return_code != 0:
+    xpk_print(f'Get reservation deployment type ERROR {return_code}')
+    xpk_exit(1)
+  return output.strip()
+
+
 def verify_reservation_exists(args) -> int:
   """Verify the reservation exists.

xpk/core/capacity_test.py
ADDED

@@ -0,0 +1,50 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import pytest
+from unittest.mock import MagicMock, patch
+from .capacity import get_reservation_deployment_type
+
+
+@patch('xpk.core.capacity.xpk_print')
+def test_get_reservation_deployment_type_exits_with_command_fails(
+    xpk_print: MagicMock, mocker
+):
+  mocker.patch(
+      target='xpk.core.capacity.run_command_for_value', return_value=(1, '')
+  )
+  with pytest.raises(SystemExit):
+    get_reservation_deployment_type(
+        reservation='reservation', zone='zone', project='project'
+    )
+
+  assert (
+      'Get reservation deployment type ERROR 1'
+      in xpk_print.mock_calls[0].args[0]
+  )
+
+
+def test_get_reservation_deployment_type_returns_deployment_type_when_command_succeeds(
+    mocker,
+):
+  mocker.patch(
+      target='xpk.core.capacity.run_command_for_value',
+      return_value=(0, 'DENSE'),
+  )
+  result = get_reservation_deployment_type(
+      reservation='reservation', zone='zone', project='project'
+  )
+  assert result == 'DENSE'
xpk/core/config.py
CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print

 # This is the version for XPK PyPI package
-__version__ = 'v0.14.2'
+__version__ = 'v0.14.3'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

xpk/core/docker_container.py
CHANGED
@@ -97,7 +97,7 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
   )

   gpu_workload_terminate_command = ''
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     gpu_workload_terminate_command = (
         'echo Main app is done > /usr/share/workload/workload_terminated; '
     )
@@ -105,7 +105,7 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
   tpu_stacktrace_terminate_command = ''
   if (
       not args.use_pathways
-      and system.accelerator_type == AcceleratorType['TPU']
+      and system.accelerator_type == AcceleratorType.TPU
       and args.deploy_stacktrace_sidecar
   ):
     tpu_stacktrace_terminate_command = (
@@ -193,7 +193,7 @@ def get_user_workload_container(args, system: SystemCharacteristics):
   ].resource_type
   if (
       not args.use_pathways
-      and system.accelerator_type == AcceleratorType['TPU']
+      and system.accelerator_type == AcceleratorType.TPU
       and args.deploy_stacktrace_sidecar
   ):
     xpk_print(
@@ -219,7 +219,7 @@ def get_main_container_docker_image(args, system: SystemCharacteristics) -> str:
     Workload docker image as a YAML string
   """

-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     return 'gpu-image'

   return f'{args.docker_name}'
xpk/core/docker_resources.py
CHANGED
@@ -43,10 +43,10 @@ def get_main_container_resources(
     return resources_yaml

   gpu_resources_yaml = """nvidia.com/gpu: {system.chips_per_vm}"""
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     return gpu_resources_yaml.format(system=system)

-  if system.accelerator_type == AcceleratorType['CPU']:
+  if system.accelerator_type == AcceleratorType.CPU:
     # CPUs don't have chips, but have a subresource called vCPUs.
     # system.chips_per_vm is used as a proxy for vCPUs.
     # Some vCPUs get used in hosting system pods of the workloads,
@@ -67,10 +67,10 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
   str:
     YAML with the env config for the main container, as a YAML string.
   """
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     return get_gpu_env(args, system)

-  if system.accelerator_type == AcceleratorType['CPU']:
+  if system.accelerator_type == AcceleratorType.CPU:
     return get_cpu_env(args, system)

   return format_env_dict(args.env, system)
@@ -176,7 +176,7 @@ def get_cpu_env(args, system) -> str:


 def format_env_dict(env, system: SystemCharacteristics) -> str:
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     # For GPUs, it has two more spaces ahead of name and value respectively
     env_format = '''
   - name: {key}
@@ -265,7 +265,7 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
         driver: {driver}"""

   if (
-      system.accelerator_type == AcceleratorType['TPU']
+      system.accelerator_type == AcceleratorType.TPU
       and args.deploy_stacktrace_sidecar
   ):
     volumes += """
@@ -317,7 +317,7 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
     name: shared-tmp
 """
   elif (
-      system.accelerator_type == AcceleratorType['TPU']
+      system.accelerator_type == AcceleratorType.TPU
       and args.deploy_stacktrace_sidecar
   ):
     volume_mount_yaml += """- name: tpu-stack-trace
@@ -325,7 +325,7 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
   - name: shared-data
     mountPath: /shared-volume
 """
-  elif system.accelerator_type == AcceleratorType['GPU']:
+  elif system.accelerator_type == AcceleratorType.GPU:
     volume_mount_yaml = ''

   storages: list[Storage] = (
@@ -379,7 +379,7 @@ def add_container_ports(args, system: SystemCharacteristics) -> str:
     return ''

   gpu_port_yaml = """- containerPort: 6002"""
-  if system.accelerator_type == AcceleratorType['GPU']:
+  if system.accelerator_type == AcceleratorType.GPU:
     return gpu_port_yaml
   return port_yaml

@@ -394,7 +394,7 @@ def add_jax_coordinator_port(system) -> str:
   str:
     jax coordinator port as a YAML string
   """
-  if system.accelerator_type == AcceleratorType['CPU']:
+  if system.accelerator_type == AcceleratorType.CPU:
     return '- containerPort: 1234'
   return ''

@@ -411,6 +411,6 @@ def add_image_pull_policy_for_pw_or_gpu(args, system: SystemCharacteristics):
   """
   yaml = """imagePullPolicy: Always"""

-  if args.use_pathways or system.accelerator_type == AcceleratorType['GPU']:
+  if args.use_pathways or system.accelerator_type == AcceleratorType.GPU:
     return yaml.format(args=args)
   return ''
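Several of the YAML snippets above (e.g. gpu_resources_yaml) are filled in with str.format(system=system), which resolves dotted attribute lookups inside replacement fields. A self-contained illustration of that standard-library pattern, using a stand-in class rather than the real SystemCharacteristics:

# str.format supports attribute access in replacement fields, which is
# how templates like 'nvidia.com/gpu: {system.chips_per_vm}' are filled.
from dataclasses import dataclass


@dataclass
class FakeSystem:  # stand-in for SystemCharacteristics
  chips_per_vm: int


template = 'nvidia.com/gpu: {system.chips_per_vm}'
print(template.format(system=FakeSystem(chips_per_vm=8)))
# prints: nvidia.com/gpu: 8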
xpk/core/kjob.py
CHANGED
@@ -296,15 +296,13 @@ def create_job_template_instance(
   working_directory = JobTemplateDefaults.WORKING_DIRECTORY.value
   resources = (
       job_resources_template.format(gpu_per_node=system.chips_per_vm)
-      if system is not None
-      and system.accelerator_type == AcceleratorType["GPU"]
+      if system is not None and system.accelerator_type == AcceleratorType.GPU
       else ""
   )

   node_selector = (
       job_node_selector_template.format(gpu_name=system.gke_accelerator)
-      if system is not None
-      and system.accelerator_type == AcceleratorType["GPU"]
+      if system is not None and system.accelerator_type == AcceleratorType.GPU
       else ""
   )
   yml_string = job_template_yaml.format(
@@ -319,7 +317,7 @@ def create_job_template_instance(
       priority=args.priority if hasattr(args, "priority") else "medium",
       service_account=service_account,
   )
-  if system is not None and system.accelerator_type == AcceleratorType["GPU"]:
+  if system is not None and system.accelerator_type == AcceleratorType.GPU:
     yml_string = decorate_job_template_with_gpu(yml_string, system.device_type)

   return run_kubectl_apply(
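This file shows the pattern behind the repeated AcceleratorType edits across the docker_* files above: AcceleratorType["GPU"] is rewritten to AcceleratorType.GPU. For a Python Enum the two forms return the same member, so the change is purely stylistic. A demonstration with a stand-in enum; the real AcceleratorType lives in xpk/core/system_characteristics.py and its member values are assumed here:

from enum import Enum


class AcceleratorType(Enum):  # stand-in; member values are assumed
  TPU = 1
  GPU = 2
  CPU = 3


# Name indexing and attribute access yield the identical member object,
# so AcceleratorType["GPU"] -> AcceleratorType.GPU cannot change behavior;
# the attribute form is shorter and verifiable by static analysis.
assert AcceleratorType['GPU'] is AcceleratorType.GPU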
xpk/core/kueue_manager.py
CHANGED
@@ -40,6 +40,7 @@ from ..core.commands import (
 from ..utils.file import write_tmp_file
 from ..utils.console import xpk_print, xpk_exit
 from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
+from packaging.version import Version

 WAIT_FOR_KUEUE_TIMEOUT = "10m"
 CLUSTER_QUEUE_NAME = "cluster-queue"
@@ -51,7 +52,7 @@ KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
 KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
 MEMORY_SIZE_PER_VM = 1.2
 MIN_MEMORY_LIMIT_SIZE = 4096
-KUEUE_VERSION = "v0.12.2"
+KUEUE_VERSION = Version("v0.12.2")


 @dataclass
@@ -78,7 +79,7 @@ class KueueManager:

   def __init__(
       self,
-      kueue_version: str = KUEUE_VERSION,
+      kueue_version: Version = KUEUE_VERSION,
       template_path=TEMPLATE_PATH,
   ):
     self.kueue_version = kueue_version
@@ -111,9 +112,9 @@ class KueueManager:
         )
         return 0
       else:
-        xpk_print(f"Upgrading Kueue to version {self.kueue_version}...")
+        xpk_print(f"Upgrading Kueue to version v{self.kueue_version}...")
     else:
-      xpk_print(f"Installing Kueue version {self.kueue_version}...")
+      xpk_print(f"Installing Kueue version v{self.kueue_version}...")

     install_return_code = self.__install(tolerations)
     if install_return_code != 0:
@@ -121,7 +122,7 @@ class KueueManager:

     return self.__configure(kueue_config)

-  def get_installed_kueue_version(self) -> tuple[int, str | None]:
+  def get_installed_kueue_version(self) -> tuple[int, Version | None]:
     command = (
         "kubectl get deployment kueue-controller-manager -n kueue-system -o"
         " jsonpath='{.spec.template.spec.containers[0].image}'"
@@ -130,15 +131,14 @@ class KueueManager:
     return_code, val = run_command_for_value(
         command,
         task,
-        dry_run_return_val=""
-v0.12.1""",
+        dry_run_return_val="",
     )
     if return_code != 0:
       return return_code, None
     version_tag = val.split(":")
     if len(version_tag) == 1:
       return 1, None
-    return return_code, version_tag[-1]
+    return return_code, Version(version_tag[-1])

   def __install(
       self,
@@ -162,7 +162,7 @@ class KueueManager:
     return self.__wait_for_kueue_available()

   def __install_kueue_crs(self) -> int:
-    manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/{self.kueue_version}/manifests.yaml"
+    manifest_url = f"https://github.com/kubernetes-sigs/kueue/releases/download/v{self.kueue_version}/manifests.yaml"
     install_command = (
         f"kubectl apply --server-side --force-conflicts -f {manifest_url}"
     )
@@ -199,7 +199,7 @@ class KueueManager:
       0 if successful and 1 otherwise.
     """
     command = (
-        "kubectl wait deploy/kueue-controller-manager -
+        "kubectl wait deploy/kueue-controller-manager -n kueue-system"
         f" --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}"
     )
     task = "Wait for Kueue to be available"
@@ -421,3 +421,14 @@ class KueueManager:
     if return_code != 0:
       xpk_print(f"{task} returned ERROR {return_code}")
     return return_code
+
+
+def has_sub_slicing_enabled() -> tuple[int, bool | None]:
+  return_code, value = run_command_for_value(
+      command="kubectl get topology", task="Get defined topologies"
+  )
+
+  if return_code != 0:
+    return return_code, None
+
+  return return_code, SUB_SLICE_TOPOLOGY_NAME in value
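Storing KUEUE_VERSION as a packaging.version.Version rather than a string is what makes the minimum-version check behind sub-slicing validation well-defined: PEP 440 parsing tolerates a leading "v" and compares numerically, while the normalized form drops the "v", which is why the code above re-adds the prefix by hand in the log messages and the manifest URL. A short demonstration:

from packaging.version import Version

assert Version('v0.12.2') == Version('0.12.2')  # leading 'v' is normalized away
assert str(Version('v0.12.2')) == '0.12.2'      # so the 'v' must be re-added by hand

# Numeric ordering, where plain string comparison goes wrong:
assert '0.9.0' > '0.12.2'                       # lexicographic accident
assert Version('0.9.0') < Version('0.12.2')     # correct semantic ordering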