xpk 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +33 -12
- xpk/commands/cluster_gcluster_test.py +5 -1
- xpk/commands/cluster_test.py +125 -0
- xpk/commands/config.py +3 -3
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +2 -0
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/workload.py +125 -139
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +3 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +2 -0
- xpk/core/cluster.py +18 -47
- xpk/core/cluster_test.py +76 -1
- xpk/core/config.py +81 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/kjob.py +17 -16
- xpk/core/kueue_manager.py +13 -19
- xpk/core/kueue_manager_test.py +27 -1
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +17 -15
- xpk/core/nodepool_test.py +25 -4
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +128 -132
- xpk/core/scheduling_test.py +215 -2
- xpk/core/system_characteristics.py +179 -0
- xpk/core/system_characteristics_test.py +49 -1
- xpk/core/telemetry.py +4 -4
- xpk/core/telemetry_test.py +9 -9
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +2 -0
- xpk/parser/cluster.py +22 -88
- xpk/parser/cluster_test.py +41 -0
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -41
- xpk/parser/workload_test.py +2 -48
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/utils/feature_flags.py +3 -0
- xpk/utils/validation.py +2 -2
- xpk-0.16.1.dist-info/METADATA +127 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/RECORD +67 -48
- xpk-0.15.0.dist-info/METADATA +0 -1666
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/WHEEL +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/top_level.txt +0 -0
xpk/core/scheduling_test.py
CHANGED
|
@@ -14,8 +14,32 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from
|
|
18
|
-
from
|
|
17
|
+
from argparse import Namespace
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
import dataclasses
|
|
20
|
+
import pytest
|
|
21
|
+
from pytest_mock import MockerFixture
|
|
22
|
+
from xpk.core.capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
|
|
23
|
+
from xpk.core.testing.commands_tester import CommandsTester
|
|
24
|
+
from xpk.utils.feature_flags import FeatureFlags
|
|
25
|
+
from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, get_placement_policy_name, is_placement_policy_supported
|
|
26
|
+
from .system_characteristics import SystemCharacteristics, AcceleratorType, DockerPlatform, get_system_characteristics_by_device_type
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _get_system_characteristics_or_die(
    device_type: str,
) -> SystemCharacteristics:
  """Return the system characteristics for `device_type` or fail loudly.

  Takes the first element of the value returned by
  `get_system_characteristics_by_device_type` and asserts that it is truthy.

  Args:
    device_type: device type string to look up, e.g. 'v6e-8'.

  Returns:
    The looked-up SystemCharacteristics.

  Raises:
    AssertionError: if the lookup yields a falsy first element.
  """
  system = get_system_characteristics_by_device_type(device_type)[0]
  # This helper is called while the module-level test cases are being built,
  # so name the offending device type to make a collection failure readable.
  assert system, f'No system characteristics found for {device_type!r}'
  return system
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@pytest.fixture(autouse=True)
def commands_tester(mocker: MockerFixture) -> CommandsTester:
  """Build the CommandsTester used by the tests in this module.

  The fixture is autouse, so it is set up for every test here even when the
  test does not request it by name. The tester is pointed at
  `run_command_for_value` as referenced from `xpk.core.kueue_manager`.
  """
  tester = CommandsTester(
      mocker=mocker,
      run_command_for_value_path='xpk.core.kueue_manager.run_command_for_value',
  )
  return tester
|
|
19
43
|
|
|
20
44
|
|
|
21
45
|
def test_create_sub_slicing_annotations_returns_valid_annotations():
|
|
@@ -41,6 +65,7 @@ def test_create_placement_policy_label_returns_valid_label():
|
|
|
41
65
|
device_type='tpu7x',
|
|
42
66
|
accelerator_type=AcceleratorType.TPU,
|
|
43
67
|
supports_sub_slicing=False,
|
|
68
|
+
docker_platform=DockerPlatform.ARM,
|
|
44
69
|
)
|
|
45
70
|
label = create_placement_policy_label(system_characteristics)
|
|
46
71
|
assert (
|
|
@@ -60,6 +85,7 @@ def test_get_placement_policy_name_returns_valid_name():
|
|
|
60
85
|
device_type='tpu7x',
|
|
61
86
|
accelerator_type=AcceleratorType.TPU,
|
|
62
87
|
supports_sub_slicing=False,
|
|
88
|
+
docker_platform=DockerPlatform.ARM,
|
|
63
89
|
)
|
|
64
90
|
name = get_placement_policy_name(system_characteristics)
|
|
65
91
|
assert name == 'tpu7x-1x1x1-placement-policy'
|
|
@@ -76,6 +102,7 @@ def test_is_placement_policy_supported_returns_true_for_system_characteristics_s
|
|
|
76
102
|
device_type='tpu7x',
|
|
77
103
|
accelerator_type=AcceleratorType.TPU,
|
|
78
104
|
supports_sub_slicing=False,
|
|
105
|
+
docker_platform=DockerPlatform.ARM,
|
|
79
106
|
)
|
|
80
107
|
assert is_placement_policy_supported(system_characteristics) is True
|
|
81
108
|
|
|
@@ -91,6 +118,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
|
|
|
91
118
|
device_type='tpu7x',
|
|
92
119
|
accelerator_type=AcceleratorType.TPU,
|
|
93
120
|
supports_sub_slicing=False,
|
|
121
|
+
docker_platform=DockerPlatform.ARM,
|
|
94
122
|
)
|
|
95
123
|
assert is_placement_policy_supported(system_characteristics) is False
|
|
96
124
|
|
|
@@ -106,5 +134,190 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
|
|
|
106
134
|
device_type='tpu7x',
|
|
107
135
|
accelerator_type=AcceleratorType.TPU,
|
|
108
136
|
supports_sub_slicing=False,
|
|
137
|
+
docker_platform=DockerPlatform.ARM,
|
|
109
138
|
)
|
|
110
139
|
assert is_placement_policy_supported(system_characteristics) is False
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
@dataclass(frozen=True)
class SchedulingTestCase:
  """Immutable inputs for one `check_if_workload_can_schedule` scenario.

  Instances are frozen; variants of a baseline case are derived with
  `dataclasses.replace`.
  """

  # System requested by the workload; passed as `workload_system`.
  workload_system: SystemCharacteristics
  # Value placed in `args.num_slices`.
  num_slices: int = 1
  # System of the cluster, if known; passed as `cluster_system`.
  cluster_system: SystemCharacteristics | None = None
  # Contents of the resources config map; passed as `resources_config_map`.
  resources_config_map: dict[str, str] | None = None
  # Value assigned to `FeatureFlags.SUB_SLICING_ENABLED` for the test.
  sub_slicing_feature_enabled: bool = False
  # Kueue version reported by the faked 'kubectl get deployment' command;
  # None makes that command return an empty string.
  kueue_version: str | None = None
  # Whether the faked 'kubectl get topology' command returns
  # 'sub-slice-topology' (True) or an empty string (False).
  sub_slicing_topology_set: bool = False
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Baseline scenario that the parametrized test expects to resolve to
# WorkloadScheduling.SUB_SLICING_AVAILABLE: a 'v6e-8' workload on a 'v6e-16'
# cluster with the feature flag on, Kueue 0.13.0 and the topology present.
# Negative sub-slicing cases are derived from it via dataclasses.replace.
SUB_SLICING_CASE = SchedulingTestCase(
    workload_system=_get_system_characteristics_or_die('v6e-8'),
    num_slices=1,
    cluster_system=_get_system_characteristics_or_die('v6e-16'),
    resources_config_map={'v6e-16': '8'},
    sub_slicing_feature_enabled=True,
    kueue_version='0.13.0',
    sub_slicing_topology_set=True,
)

# Baseline scenario with no cluster system and a config map that carries the
# autoprovisioning marker value plus a maximum of '10'; the parametrized test
# expects WorkloadScheduling.AVAILABLE for it.
# NOTE(review): "NAP" presumably stands for node auto-provisioning - confirm.
NAP_CASE = SchedulingTestCase(
    workload_system=_get_system_characteristics_or_die('v6e-8'),
    cluster_system=None,
    resources_config_map={
        'tpu-v6e-slice': AUTOPROVISIONING_CONFIG_VALUE,
        AUTOPROVISIONING_CONFIG_MAXIMUM_KEY: '10',
    },
)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@pytest.mark.parametrize(
    'title, case, expected',
    [
        (
            'No resources config map',
            SchedulingTestCase(
                workload_system=_get_system_characteristics_or_die('v6e-8'),
                resources_config_map=None,
            ),
            WorkloadScheduling.AVAILABLE,
        ),
        (
            'Cluster system matches and workload fits',
            SchedulingTestCase(
                workload_system=_get_system_characteristics_or_die('v6e-8'),
                resources_config_map={'v6e-8': '8'},
                num_slices=2,
            ),
            WorkloadScheduling.AVAILABLE,
        ),
        (
            'Cluster system does not match',
            SchedulingTestCase(
                workload_system=_get_system_characteristics_or_die('v6e-8'),
                resources_config_map={'tpu7x-32': '16'},
            ),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            'Workload does not fit',
            SchedulingTestCase(
                workload_system=_get_system_characteristics_or_die('v6e-8'),
                resources_config_map={'v6e-8': '8'},
                num_slices=100,
            ),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            'Correct NAP',
            NAP_CASE,
            WorkloadScheduling.AVAILABLE,
        ),
        (
            'NAP, too big workload',
            dataclasses.replace(NAP_CASE, num_slices=100),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            'Correct Sub-slicing',
            SUB_SLICING_CASE,
            WorkloadScheduling.SUB_SLICING_AVAILABLE,
        ),
        (
            'Sub-slicing, but disabled flag',
            dataclasses.replace(
                SUB_SLICING_CASE, sub_slicing_feature_enabled=False
            ),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            'Sub-slicing, but low Kueue version',
            dataclasses.replace(SUB_SLICING_CASE, kueue_version='0.12.0'),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            'Sub-slicing, but no sub-slicing-topology',
            dataclasses.replace(
                SUB_SLICING_CASE, sub_slicing_topology_set=False
            ),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            'Sub-slicing, but workload too big',
            dataclasses.replace(SUB_SLICING_CASE, num_slices=100),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            'Sub-slicing, but cluster system is incorrect',
            dataclasses.replace(
                SUB_SLICING_CASE,
                cluster_system=_get_system_characteristics_or_die('tpu7x-16'),
            ),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            'Sub-slicing, but workload system is incorrect',
            dataclasses.replace(
                SUB_SLICING_CASE,
                workload_system=_get_system_characteristics_or_die('tpu7x-8'),
            ),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            'Sub-slicing, but workload topology is incorrect',
            dataclasses.replace(
                SUB_SLICING_CASE,
                workload_system=_get_system_characteristics_or_die('v6e-2x2'),
            ),
            WorkloadScheduling.UNAVAILABLE,
        ),
        (
            (
                'Sub-slicing should be ignored when a given device is already'
                ' present in the cluster'
            ),
            dataclasses.replace(
                SUB_SLICING_CASE,
                workload_system=_get_system_characteristics_or_die('v6e-8'),
                cluster_system=_get_system_characteristics_or_die('v6e-8'),
                resources_config_map={'v6e-8': '4'},
            ),
            WorkloadScheduling.AVAILABLE,
        ),
    ],
)
def test_check_if_workload_can_schedule(
    commands_tester: CommandsTester,
    title: str,
    case: SchedulingTestCase,
    expected: WorkloadScheduling,
    monkeypatch: pytest.MonkeyPatch,
):
  """Check the scheduling verdict for each parametrized scenario.

  For every case the test sets the sub-slicing feature flag, fakes the results
  of the 'kubectl get deployment' and 'kubectl get topology' commands, and then
  compares the value returned by `check_if_workload_can_schedule` with
  `expected`.

  Args:
    commands_tester: fixture used to register the faked command results.
    title: human-readable scenario name; not read by the test body.
    case: inputs for the scenario.
    expected: scheduling verdict the call must return.
    monkeypatch: pytest fixture used to set the feature flag reversibly.
  """
  # Set the flag through monkeypatch instead of a plain assignment so the
  # class attribute is restored on teardown and does not leak into other tests.
  monkeypatch.setattr(
      FeatureFlags, 'SUB_SLICING_ENABLED', case.sub_slicing_feature_enabled
  )

  # An absent Kueue version is represented by empty command output.
  kueue_image = (
      f'registry.k8s.io/kueue/kueue:v{case.kueue_version}'
      if case.kueue_version
      else ''
  )
  commands_tester.set_result_for_command(
      (0, kueue_image),
      'kubectl get deployment',
      'image',
  )
  commands_tester.set_result_for_command(
      (0, 'sub-slice-topology' if case.sub_slicing_topology_set else ''),
      'kubectl get topology',
  )
  args = Namespace(
      cluster='test-cluster',
      workload='test-workload',
      num_slices=case.num_slices,
  )

  assert (
      check_if_workload_can_schedule(
          args,
          workload_system=case.workload_system,
          cluster_system=case.cluster_system,
          resources_config_map=case.resources_config_map,
      )
      == expected
  )
|