xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. integration/README.md +19 -0
  2. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  3. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  4. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  5. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  6. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  7. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  8. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  9. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  10. xpk/blueprints/a4/storage_crd.yaml +52 -0
  11. xpk/commands/cluster.py +33 -12
  12. xpk/commands/cluster_gcluster_test.py +5 -1
  13. xpk/commands/cluster_test.py +125 -0
  14. xpk/commands/config.py +3 -3
  15. xpk/commands/inspector.py +5 -3
  16. xpk/commands/kind.py +2 -0
  17. xpk/commands/managed_ml_diagnostics.py +249 -0
  18. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  19. xpk/commands/workload.py +124 -139
  20. xpk/commands/workload_test.py +160 -118
  21. xpk/core/blueprint/blueprint_generator.py +3 -0
  22. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  23. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  24. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  25. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  26. xpk/core/capacity.py +2 -0
  27. xpk/core/cluster.py +18 -47
  28. xpk/core/cluster_test.py +76 -1
  29. xpk/core/config.py +81 -7
  30. xpk/core/config_test.py +67 -11
  31. xpk/core/docker_container.py +3 -1
  32. xpk/core/docker_image.py +10 -6
  33. xpk/core/docker_resources.py +1 -10
  34. xpk/core/kjob.py +17 -16
  35. xpk/core/kueue_manager.py +13 -19
  36. xpk/core/kueue_manager_test.py +27 -1
  37. xpk/core/nap.py +13 -14
  38. xpk/core/nodepool.py +17 -15
  39. xpk/core/nodepool_test.py +25 -4
  40. xpk/core/pathways.py +23 -0
  41. xpk/core/pathways_test.py +57 -0
  42. xpk/core/resources.py +84 -27
  43. xpk/core/scheduling.py +128 -132
  44. xpk/core/scheduling_test.py +215 -2
  45. xpk/core/system_characteristics.py +179 -0
  46. xpk/core/system_characteristics_test.py +49 -1
  47. xpk/core/telemetry.py +4 -4
  48. xpk/core/telemetry_test.py +9 -9
  49. xpk/core/vertex.py +4 -3
  50. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  51. xpk/main.py +2 -0
  52. xpk/parser/cluster.py +22 -88
  53. xpk/parser/cluster_test.py +41 -0
  54. xpk/parser/common.py +84 -0
  55. xpk/parser/storage.py +10 -0
  56. xpk/parser/storage_test.py +47 -0
  57. xpk/parser/workload.py +14 -41
  58. xpk/parser/workload_test.py +2 -48
  59. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  60. xpk/utils/feature_flags.py +3 -0
  61. xpk/utils/validation.py +2 -2
  62. xpk-0.16.0.dist-info/METADATA +127 -0
  63. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
  64. xpk-0.15.0.dist-info/METADATA +0 -1666
  65. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
@@ -14,8 +14,32 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
- from .scheduling import create_sub_slicing_annotations, create_placement_policy_label, get_placement_policy_name, is_placement_policy_supported
18
- from .system_characteristics import SystemCharacteristics, AcceleratorType
17
+ from argparse import Namespace
18
+ from dataclasses import dataclass
19
+ import dataclasses
20
+ import pytest
21
+ from pytest_mock import MockerFixture
22
+ from xpk.core.capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
23
+ from xpk.core.testing.commands_tester import CommandsTester
24
+ from xpk.utils.feature_flags import FeatureFlags
25
+ from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, get_placement_policy_name, is_placement_policy_supported
26
+ from .system_characteristics import SystemCharacteristics, AcceleratorType, DockerPlatform, get_system_characteristics_by_device_type
27
+
28
+
29
+ def _get_system_characteristics_or_die(
30
+ device_type: str,
31
+ ) -> SystemCharacteristics:
32
+ system = get_system_characteristics_by_device_type(device_type)[0]
33
+ assert system
34
+ return system
35
+
36
+
37
+ @pytest.fixture(autouse=True)
38
+ def commands_tester(mocker: MockerFixture) -> CommandsTester:
39
+ return CommandsTester(
40
+ mocker=mocker,
41
+ run_command_for_value_path='xpk.core.kueue_manager.run_command_for_value',
42
+ )
19
43
 
20
44
 
21
45
  def test_create_sub_slicing_annotations_returns_valid_annotations():
@@ -41,6 +65,7 @@ def test_create_placement_policy_label_returns_valid_label():
41
65
  device_type='tpu7x',
42
66
  accelerator_type=AcceleratorType.TPU,
43
67
  supports_sub_slicing=False,
68
+ docker_platform=DockerPlatform.ARM,
44
69
  )
45
70
  label = create_placement_policy_label(system_characteristics)
46
71
  assert (
@@ -60,6 +85,7 @@ def test_get_placement_policy_name_returns_valid_name():
60
85
  device_type='tpu7x',
61
86
  accelerator_type=AcceleratorType.TPU,
62
87
  supports_sub_slicing=False,
88
+ docker_platform=DockerPlatform.ARM,
63
89
  )
64
90
  name = get_placement_policy_name(system_characteristics)
65
91
  assert name == 'tpu7x-1x1x1-placement-policy'
@@ -76,6 +102,7 @@ def test_is_placement_policy_supported_returns_true_for_system_characteristics_s
76
102
  device_type='tpu7x',
77
103
  accelerator_type=AcceleratorType.TPU,
78
104
  supports_sub_slicing=False,
105
+ docker_platform=DockerPlatform.ARM,
79
106
  )
80
107
  assert is_placement_policy_supported(system_characteristics) is True
81
108
 
@@ -91,6 +118,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
91
118
  device_type='tpu7x',
92
119
  accelerator_type=AcceleratorType.TPU,
93
120
  supports_sub_slicing=False,
121
+ docker_platform=DockerPlatform.ARM,
94
122
  )
95
123
  assert is_placement_policy_supported(system_characteristics) is False
96
124
 
@@ -106,5 +134,190 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
106
134
  device_type='tpu7x',
107
135
  accelerator_type=AcceleratorType.TPU,
108
136
  supports_sub_slicing=False,
137
+ docker_platform=DockerPlatform.ARM,
109
138
  )
110
139
  assert is_placement_policy_supported(system_characteristics) is False
140
+
141
+
142
+ @dataclass(frozen=True)
143
+ class SchedulingTestCase:
144
+ workload_system: SystemCharacteristics
145
+ num_slices: int = 1
146
+ cluster_system: SystemCharacteristics | None = None
147
+ resources_config_map: dict[str, str] | None = None
148
+ sub_slicing_feature_enabled: bool = False
149
+ kueue_version: str | None = None
150
+ sub_slicing_topology_set: bool = False
151
+
152
+
153
+ SUB_SLICING_CASE = SchedulingTestCase(
154
+ workload_system=_get_system_characteristics_or_die('v6e-8'),
155
+ cluster_system=_get_system_characteristics_or_die('v6e-16'),
156
+ resources_config_map={'v6e-16': '8'},
157
+ sub_slicing_feature_enabled=True,
158
+ kueue_version='0.13.0',
159
+ sub_slicing_topology_set=True,
160
+ num_slices=1,
161
+ )
162
+
163
+ NAP_CASE = SchedulingTestCase(
164
+ workload_system=_get_system_characteristics_or_die('v6e-8'),
165
+ cluster_system=None,
166
+ resources_config_map={
167
+ 'tpu-v6e-slice': AUTOPROVISIONING_CONFIG_VALUE,
168
+ AUTOPROVISIONING_CONFIG_MAXIMUM_KEY: '10',
169
+ },
170
+ )
171
+
172
+
173
+ @pytest.mark.parametrize(
174
+ 'title, case, expected',
175
+ [
176
+ (
177
+ 'No resources config map',
178
+ SchedulingTestCase(
179
+ workload_system=_get_system_characteristics_or_die('v6e-8'),
180
+ resources_config_map=None,
181
+ ),
182
+ WorkloadScheduling.AVAILABLE,
183
+ ),
184
+ (
185
+ 'Cluster system matches and workload fits',
186
+ SchedulingTestCase(
187
+ workload_system=_get_system_characteristics_or_die('v6e-8'),
188
+ resources_config_map={'v6e-8': '8'},
189
+ num_slices=2,
190
+ ),
191
+ WorkloadScheduling.AVAILABLE,
192
+ ),
193
+ (
194
+ 'Cluster system does not match',
195
+ SchedulingTestCase(
196
+ workload_system=_get_system_characteristics_or_die('v6e-8'),
197
+ resources_config_map={'tpu7x-32': '16'},
198
+ ),
199
+ WorkloadScheduling.UNAVAILABLE,
200
+ ),
201
+ (
202
+ 'Workload does not fit',
203
+ SchedulingTestCase(
204
+ workload_system=_get_system_characteristics_or_die('v6e-8'),
205
+ resources_config_map={'v6e-8': '8'},
206
+ num_slices=100,
207
+ ),
208
+ WorkloadScheduling.UNAVAILABLE,
209
+ ),
210
+ (
211
+ 'Correct NAP',
212
+ NAP_CASE,
213
+ WorkloadScheduling.AVAILABLE,
214
+ ),
215
+ (
216
+ 'NAP, too big workload',
217
+ dataclasses.replace(NAP_CASE, num_slices=100),
218
+ WorkloadScheduling.UNAVAILABLE,
219
+ ),
220
+ (
221
+ 'Correct Sub-slicing',
222
+ SUB_SLICING_CASE,
223
+ WorkloadScheduling.SUB_SLICING_AVAILABLE,
224
+ ),
225
+ (
226
+ 'Sub-slicing, but disabled flag',
227
+ dataclasses.replace(
228
+ SUB_SLICING_CASE, sub_slicing_feature_enabled=False
229
+ ),
230
+ WorkloadScheduling.UNAVAILABLE,
231
+ ),
232
+ (
233
+ 'Sub-slicing, but low Kueue version',
234
+ dataclasses.replace(SUB_SLICING_CASE, kueue_version='0.12.0'),
235
+ WorkloadScheduling.UNAVAILABLE,
236
+ ),
237
+ (
238
+ 'Sub-slicing, but no sub-slicing-topology',
239
+ dataclasses.replace(
240
+ SUB_SLICING_CASE, sub_slicing_topology_set=False
241
+ ),
242
+ WorkloadScheduling.UNAVAILABLE,
243
+ ),
244
+ (
245
+ 'Sub-slicing, but workload too big',
246
+ dataclasses.replace(SUB_SLICING_CASE, num_slices=100),
247
+ WorkloadScheduling.UNAVAILABLE,
248
+ ),
249
+ (
250
+ 'Sub-slicing, but cluster system is incorrect',
251
+ dataclasses.replace(
252
+ SUB_SLICING_CASE,
253
+ cluster_system=_get_system_characteristics_or_die('tpu7x-16'),
254
+ ),
255
+ WorkloadScheduling.UNAVAILABLE,
256
+ ),
257
+ (
258
+ 'Sub-slicing, but workload system is incorrect',
259
+ dataclasses.replace(
260
+ SUB_SLICING_CASE,
261
+ workload_system=_get_system_characteristics_or_die('tpu7x-8'),
262
+ ),
263
+ WorkloadScheduling.UNAVAILABLE,
264
+ ),
265
+ (
266
+ 'Sub-slicing, but workload topology is incorrect',
267
+ dataclasses.replace(
268
+ SUB_SLICING_CASE,
269
+ workload_system=_get_system_characteristics_or_die('v6e-2x2'),
270
+ ),
271
+ WorkloadScheduling.UNAVAILABLE,
272
+ ),
273
+ (
274
+ (
275
+ 'Sub-slicing should be ignored when a given device is already'
276
+ ' present in the cluster'
277
+ ),
278
+ dataclasses.replace(
279
+ SUB_SLICING_CASE,
280
+ workload_system=_get_system_characteristics_or_die('v6e-8'),
281
+ cluster_system=_get_system_characteristics_or_die('v6e-8'),
282
+ resources_config_map={'v6e-8': '4'},
283
+ ),
284
+ WorkloadScheduling.AVAILABLE,
285
+ ),
286
+ ],
287
+ )
288
+ def test_check_if_workload_can_schedule(
289
+ commands_tester: CommandsTester,
290
+ title: str,
291
+ case: SchedulingTestCase,
292
+ expected: WorkloadScheduling,
293
+ ):
294
+ FeatureFlags.SUB_SLICING_ENABLED = case.sub_slicing_feature_enabled
295
+ commands_tester.set_result_for_command(
296
+ (
297
+ 0,
298
+ f'registry.k8s.io/kueue/kueue:v{case.kueue_version}'
299
+ if case.kueue_version
300
+ else '',
301
+ ),
302
+ 'kubectl get deployment',
303
+ 'image',
304
+ )
305
+ commands_tester.set_result_for_command(
306
+ (0, 'sub-slice-topology' if case.sub_slicing_topology_set else ''),
307
+ 'kubectl get topology',
308
+ )
309
+ args = Namespace(
310
+ cluster='test-cluster',
311
+ workload='test-workload',
312
+ num_slices=case.num_slices,
313
+ )
314
+
315
+ assert (
316
+ check_if_workload_can_schedule(
317
+ args,
318
+ workload_system=case.workload_system,
319
+ cluster_system=case.cluster_system,
320
+ resources_config_map=case.resources_config_map,
321
+ )
322
+ == expected
323
+ )