xpk 0.16.1__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. xpk/commands/cluster.py +48 -5
  2. xpk/commands/cluster_gcluster.py +3 -0
  3. xpk/commands/cluster_gcluster_test.py +2 -0
  4. xpk/commands/cluster_test.py +203 -0
  5. xpk/commands/common.py +6 -0
  6. xpk/commands/kind.py +2 -0
  7. xpk/commands/workload.py +35 -15
  8. xpk/commands/workload_test.py +1 -0
  9. xpk/core/capacity.py +83 -46
  10. xpk/core/capacity_test.py +82 -28
  11. xpk/core/commands.py +39 -12
  12. xpk/core/kueue_manager.py +42 -11
  13. xpk/core/kueue_manager_test.py +83 -3
  14. xpk/core/nap.py +5 -4
  15. xpk/core/nodepool.py +57 -20
  16. xpk/core/nodepool_test.py +152 -23
  17. xpk/core/pathways.py +2 -1
  18. xpk/core/resources.py +3 -3
  19. xpk/core/scheduling.py +54 -10
  20. xpk/core/scheduling_test.py +118 -13
  21. xpk/core/system_characteristics.py +41 -24
  22. xpk/core/system_characteristics_test.py +37 -4
  23. xpk/core/telemetry.py +5 -0
  24. xpk/core/telemetry_test.py +19 -2
  25. xpk/core/updates.py +1 -1
  26. xpk/main.py +2 -1
  27. xpk/parser/cluster.py +34 -2
  28. xpk/parser/cluster_test.py +117 -0
  29. xpk/parser/common.py +32 -0
  30. xpk/parser/common_test.py +49 -0
  31. xpk/templates/kueue_config.yaml.j2 +21 -5
  32. xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
  33. xpk/utils/kueue.py +6 -2
  34. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/METADATA +2 -1
  35. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/RECORD +39 -37
  36. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/WHEEL +0 -0
  37. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/entry_points.txt +0 -0
  38. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/licenses/LICENSE +0 -0
  39. {xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/top_level.txt +0 -0
@@ -32,7 +32,6 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
       gke_accelerator="test",
       machine_type="test",
       supported_topologies=["1x1"],
-      supports_sub_slicing=False,
       docker_platform=DockerPlatform.AMD,
       tpu_type_requires_workload_policy=False,
   )
@@ -46,6 +45,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
       accelerator_type=AcceleratorType.TPU,
       device_type="test-1",
       supports_sub_slicing=False,
+      supports_super_slicing=False,
       docker_platform=DockerPlatform.AMD,
       requires_workload_policy=False,
   )
@@ -62,7 +62,6 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
       gke_accelerator="test",
       machine_type="test",
       supported_topologies=["2x2"],
-      supports_sub_slicing=False,
       docker_platform=DockerPlatform.AMD,
       tpu_type_requires_workload_policy=True,
   )
@@ -76,6 +75,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
       accelerator_type=AcceleratorType.TPU,
       device_type="test-8",
       supports_sub_slicing=False,
+      supports_super_slicing=False,
       docker_platform=DockerPlatform.AMD,
       requires_workload_policy=False,
   )
@@ -92,7 +92,6 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
       gke_accelerator="test",
       machine_type="test",
       supported_topologies=["2x2x2"],
-      supports_sub_slicing=False,
       docker_platform=DockerPlatform.AMD,
       tpu_type_requires_workload_policy=True,
   )
@@ -106,6 +105,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
       accelerator_type=AcceleratorType.TPU,
       device_type="test-16",
       supports_sub_slicing=False,
+      supports_super_slicing=False,
       docker_platform=DockerPlatform.AMD,
       requires_workload_policy=True,
   )
@@ -115,6 +115,38 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
   }


+def test_get_tpu_system_characteristics_map_sets_sub_slicing_support():
+  result = get_tpu_system_characteristics_map(
+      prefix="test",
+      tensorcores_per_chip=2,
+      gke_accelerator="test",
+      machine_type="test",
+      supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
+      docker_platform=DockerPlatform.AMD,
+      sub_slicing_topologies=set(["4x4x8", "4x4x16"]),
+  )
+
+  assert result["test-4x4x4"].supports_sub_slicing is False
+  assert result["test-4x4x8"].supports_sub_slicing is True
+  assert result["test-4x4x16"].supports_sub_slicing is True
+
+
+def test_get_tpu_system_characteristics_map_sets_super_slicing_support():
+  result = get_tpu_system_characteristics_map(
+      prefix="test",
+      tensorcores_per_chip=2,
+      gke_accelerator="test",
+      machine_type="test",
+      supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
+      docker_platform=DockerPlatform.AMD,
+      super_slicing_topologies=set(["4x4x8", "4x4x16"]),
+  )
+
+  assert result["test-4x4x4"].supports_super_slicing is False
+  assert result["test-4x4x8"].supports_super_slicing is True
+  assert result["test-4x4x16"].supports_super_slicing is True
+
+
 def test_get_tpu_system_characteristics_map_prefers_default_topologies():
   result = get_tpu_system_characteristics_map(
       prefix="test",
@@ -122,7 +154,6 @@ def test_get_tpu_system_characteristics_map_prefers_default_topologies():
       gke_accelerator="test",
       machine_type="test",
       supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
-      supports_sub_slicing=False,
       docker_platform=DockerPlatform.AMD,
       default_topologies=set(["4x8x16"]),
   )
@@ -174,6 +205,7 @@ def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
       accelerator_type=AcceleratorType.GPU,
       device_type="l4-1",
       supports_sub_slicing=False,
+      supports_super_slicing=False,
       docker_platform=DockerPlatform.AMD,
       gpu_config=GpuConfig(requires_topology=False),
   )
@@ -192,5 +224,6 @@ def test_system_characteristics_post_init_throws_for_gpu_without_config():
       accelerator_type=AcceleratorType.GPU,
       device_type="l4-1",
       supports_sub_slicing=False,
+      supports_super_slicing=False,
       docker_platform=DockerPlatform.AMD,
   )
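The new tests above pin down how slicing support is expected to be derived per topology: an entry gets supports_sub_slicing / supports_super_slicing set only when its topology appears in the corresponding sub_slicing_topologies / super_slicing_topologies argument. The sketch below illustrates that lookup under simplified, assumed field and key names; it is not the actual get_tpu_system_characteristics_map implementation.

# Illustrative sketch only: a stripped-down per-topology flag derivation that
# matches the behaviour the tests above assert. Field and key names are
# simplified assumptions, not xpk's real data model.
from dataclasses import dataclass


@dataclass
class _Entry:
  supports_sub_slicing: bool
  supports_super_slicing: bool


def _build_map(prefix, topologies, sub_slicing=frozenset(), super_slicing=frozenset()):
  return {
      f"{prefix}-{topology}": _Entry(
          supports_sub_slicing=topology in sub_slicing,
          supports_super_slicing=topology in super_slicing,
      )
      for topology in topologies
  }


entries = _build_map("test", ["4x4x4", "4x4x8"], sub_slicing={"4x4x8"})
assert entries["test-4x4x4"].supports_sub_slicing is False
assert entries["test-4x4x8"].supports_sub_slicing is True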
xpk/core/telemetry.py CHANGED
@@ -124,6 +124,7 @@ class MetricsEventMetadataKey(Enum):
   EXIT_CODE = "XPK_EXIT_CODE"
   RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
   RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
+  LATENCY_SECONDS = "XPK_LATENCY_SECONDS"


 @dataclass
@@ -190,10 +191,14 @@ def _generate_payload(events: list[_MetricsEvent]) -> str:
   base_concord_event = _get_base_concord_event()
   base_event_metadata = _get_base_event_metadata()
   serialized_events = []
+  first_time = events[0].time if len(events) > 0 else 0
   for event in events:
     metadata = {
         **base_event_metadata,
         **event.metadata,
+        MetricsEventMetadataKey.LATENCY_SECONDS: str(
+            int(event.time - first_time)
+        ),
     }
     serialized_events.append({
         "event_time_ms": int(event.time * 1000),
xpk/core/telemetry_test.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import itertools
 import pytest
 import json
 from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
@@ -26,7 +27,7 @@ from pytest_mock import MockerFixture
 @pytest.fixture(autouse=True)
 def setup_mocks(mocker: MockerFixture):
   mocker.patch('xpk.core.telemetry._get_session_id', return_value='321231')
-  mocker.patch('time.time', return_value=0)
+  mocker.patch('time.time', side_effect=itertools.count())
   mocker.patch('platform.python_version', return_value='99.99.99')
   mocker.patch('os.path.basename', return_value='xpk.py')
   mocker.patch('os.path.abspath', return_value='/home/xpk_user')
@@ -76,6 +77,7 @@ def test_metrics_collector_logs_start_event_correctly():
           {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
           {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
           {'key': 'XPK_COMMAND', 'value': 'test'},
+          {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
       ],
       'event_name': 'start',
       'event_type': 'commands',
@@ -106,6 +108,7 @@ def test_metrics_collector_logs_complete_event_correctly():
           {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
           {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
           {'key': 'XPK_EXIT_CODE', 'value': '2'},
+          {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
       ],
       'event_name': 'complete',
       'event_type': 'commands',
@@ -129,6 +132,7 @@ def test_metrics_collector_logs_custom_event_correctly():
           {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
           {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
           {'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
+          {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
       ],
       'event_name': 'test',
       'event_type': 'custom',
@@ -136,6 +140,19 @@ def test_metrics_collector_logs_custom_event_correctly():
   }


+def test_metrics_collector_computest_latency_correctly():
+  MetricsCollector.log_start(command='test')
+  MetricsCollector.log_complete(exit_code=0)
+  payload = json.loads(MetricsCollector.flush())
+  extension_json = json.loads(payload['log_event'][1]['source_extension_json'])
+  latency = (
+      el['value']
+      for el in extension_json['event_metadata']
+      if el['key'] == 'XPK_LATENCY_SECONDS'
+  )
+  assert next(latency, None) == '1'
+
+
 def test_metrics_collector_logs_correct_envelope():
   MetricsCollector.log_start(command='test')
   MetricsCollector.log_custom(
@@ -145,7 +162,7 @@ def test_metrics_collector_logs_correct_envelope():
   payload = json.loads(MetricsCollector.flush())
   assert payload['client_info'] == {'client_type': 'XPK'}
   assert payload['log_source_name'] == 'CONCORD'
-  assert payload['request_time_ms'] == 0
+  assert payload['request_time_ms'] == 3000
   assert len(payload['log_event']) == 3
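Taken together, the telemetry changes above do two things: _generate_payload now stamps every event with XPK_LATENCY_SECONDS, measured in whole seconds from the first event of the flushed batch, and the test fixture stops freezing the clock, so consecutive time.time() calls return 0, 1, 2, ... (which is why the new latency test expects '1' and the envelope test now expects request_time_ms == 3000). A standalone sketch of both pieces, with a simplified event shape and no xpk imports:

# Standalone sketch: a ticking mock clock plus the per-event latency
# bookkeeping added in _generate_payload. Event shape is simplified here.
import itertools
import time
from unittest import mock


def latency_seconds(events):
  # Latency is measured against the first event in the batch, as in the diff.
  first_time = events[0]["time"] if len(events) > 0 else 0
  return [str(int(event["time"] - first_time)) for event in events]


with mock.patch("time.time", side_effect=itertools.count()):
  events = [{"name": "start", "time": time.time()},
            {"name": "complete", "time": time.time()}]

assert latency_seconds(events) == ["0", "1"]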
xpk/core/updates.py CHANGED
@@ -28,7 +28,7 @@ def get_latest_xpk_version() -> tuple[int, Version | None]:
     return 0, Version(__version__)

   return_code, result = run_command_for_value(
-      command="pip index versions xpk --json",
+      command="pip index versions xpk --json --no-input",
       task="Retrieve latest XPK version",
       quiet=True,
   )
xpk/main.py CHANGED
@@ -36,6 +36,7 @@ import argcomplete
 import sys

 from .parser.core import set_parser
+from .parser.common import extract_command_path
 from .core.updates import print_xpk_hello
 from .core.config import set_config, FileSystemConfig
 from .core.telemetry import MetricsCollector, send_clearcut_payload, should_send_telemetry
@@ -78,7 +79,7 @@ def main() -> None:
           or ('force' in main_args and main_args.force)
       ),
   )
-  MetricsCollector.log_start(main_args.xpk_subcommands)
+  MetricsCollector.log_start(command=extract_command_path(parser, main_args))
   print_xpk_hello()
   main_args.func(main_args)
   xpk_print('XPK Done.', flush=True)
xpk/parser/cluster.py CHANGED
@@ -132,6 +132,10 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):

   if FeatureFlags.SUB_SLICING_ENABLED:
     add_cluster_create_sub_slicing_arguments(cluster_create_optional_arguments)
+  if FeatureFlags.SUPER_SLICING_ENABLED:
+    add_cluster_create_super_slicing_arguments(
+        cluster_create_optional_arguments
+    )

   autoprovisioning_arguments = cluster_create_parser.add_argument_group(
       'Autoprovisioning Arguments',
@@ -205,6 +209,10 @@ def set_cluster_create_pathways_parser(
     add_cluster_create_sub_slicing_arguments(
         cluster_create_pathways_optional_arguments
     )
+  if FeatureFlags.SUPER_SLICING_ENABLED:
+    add_cluster_create_super_slicing_arguments(
+        cluster_create_pathways_optional_arguments
+    )

   autoprovisioning_arguments = (
       cluster_create_pathways_parser.add_argument_group(
@@ -330,7 +338,7 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
   add_resource_limits(cluster_create_resource_limits)

   cluster_create_ray_parser.set_defaults(
-      func=cluster_create_ray_cluster, sub_slicing=False
+      func=cluster_create_ray_cluster, sub_slicing=False, super_slicing=False
   )


@@ -596,7 +604,10 @@ def add_shared_cluster_create_optional_arguments(
   parser_or_group.add_argument(
       '--num-slices',
       type=int,
-      default=1,
+      # removing default in case of super slicing because
+      # --num-slices must be equal to --num-cubes if both are set
+      # it will default to 1 during validation
+      default=1 if not FeatureFlags.SUPER_SLICING_ENABLED else None,
       help='The number of slices to run the job on, defaults to 1.',
       required=False,
   )
@@ -910,3 +921,24 @@
       action='store_true',
       help='Whether to set up cluster to support sub-slicing',
   )
+
+
+def add_cluster_create_super_slicing_arguments(
+    parser_or_group: ParserOrArgumentGroup,
+):
+  parser_or_group.add_argument(
+      '--super-slicing',
+      action='store_true',
+      help='Whether to set up cluster to support super-slicing',
+  )
+  parser_or_group.add_argument(
+      '--num-cubes',
+      type=int,
+      # default value is set during validation because it needs to be compared
+      # against --num-slices
+      help=(
+          'Total number of cubes to create within a cluster, defaults to 1. Can'
+          ' only be used with --super-slicing.'
+      ),
+      required=False,
+  )
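The comments in the hunk above refer to a validation step that lives outside this file: with super slicing enabled, --num-slices loses its argparse default so it can later be reconciled against --num-cubes. A hypothetical sketch of that reconciliation, with an invented helper name; the real xpk validation code is not shown in this diff.

# Hypothetical helper illustrating the rule stated in the parser comments:
# both flags default to 1, and explicit values must agree when both are given.
def resolve_slice_counts(num_slices, num_cubes):
  if num_slices is not None and num_cubes is not None and num_slices != num_cubes:
    raise ValueError("--num-slices must be equal to --num-cubes if both are set")
  return (1 if num_slices is None else num_slices,
          1 if num_cubes is None else num_cubes)


assert resolve_slice_counts(None, None) == (1, 1)
assert resolve_slice_counts(2, 2) == (2, 2)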
xpk/parser/cluster_test.py CHANGED
@@ -144,3 +144,120 @@ def test_cluster_create_enable_lustre_legacy_port_can_be_set():
   ])

   assert args.enable_legacy_lustre_port is True
+
+
+def test_cluster_create_super_slicing_is_hidden_with_flag_off():
+  FeatureFlags.SUPER_SLICING_ENABLED = False
+  parser = argparse.ArgumentParser()
+
+  set_cluster_create_parser(parser)
+  help_str = parser.format_help()
+
+  assert "--super-slicing" not in help_str
+
+
+def test_cluster_create_super_slicing_is_shown_with_flag_on():
+  FeatureFlags.SUPER_SLICING_ENABLED = True
+  parser = argparse.ArgumentParser()
+
+  set_cluster_create_parser(parser)
+  help_str = parser.format_help()
+
+  assert "--super-slicing" in help_str
+
+
+def test_cluster_create_super_slicing_is_false_by_default():
+  FeatureFlags.SUPER_SLICING_ENABLED = True
+  parser = argparse.ArgumentParser()
+
+  set_cluster_create_parser(parser)
+  args = parser.parse_args(
+      ["--cluster", "test-cluster", "--tpu-type", "tpu7x-2"]
+  )
+
+  assert args.super_slicing is False
+
+
+def test_cluster_create_super_slicing_can_be_set():
+  FeatureFlags.SUPER_SLICING_ENABLED = True
+  parser = argparse.ArgumentParser()
+
+  set_cluster_create_parser(parser)
+  args = parser.parse_args(
+      ["--cluster", "test-cluster", "--tpu-type", "tpu7x-2", "--super-slicing"],
+  )
+
+  assert args.super_slicing is True
+
+
+def test_cluster_create_num_cubes_is_hidden_with_flag_off():
+  FeatureFlags.SUPER_SLICING_ENABLED = False
+  parser = argparse.ArgumentParser()
+
+  set_cluster_create_parser(parser)
+  help_str = parser.format_help()
+
+  assert "--num-cubes" not in help_str
+
+
+def test_cluster_create_num_cubes_is_shown_with_flag_on():
+  FeatureFlags.SUPER_SLICING_ENABLED = True
+  parser = argparse.ArgumentParser()
+
+  set_cluster_create_parser(parser)
+  help_str = parser.format_help()
+
+  assert "--num-cubes" in help_str
+
+
+def test_cluster_create_num_cubes_can_be_set():
+  FeatureFlags.SUPER_SLICING_ENABLED = True
+  parser = argparse.ArgumentParser()
+
+  set_cluster_create_parser(parser)
+  args = parser.parse_args(
+      [
+          "--cluster",
+          "test-cluster",
+          "--tpu-type",
+          "tpu7x-2",
+          "--num-cubes",
+          "5",
+      ],
+  )
+
+  assert args.num_cubes == 5
+
+
+def test_cluster_create_num_slices_defaults_to_1_if_no_superslicing_feature():
+  FeatureFlags.SUPER_SLICING_ENABLED = False
+  parser = argparse.ArgumentParser()
+
+  set_cluster_create_parser(parser)
+  args = parser.parse_args(
+      [
+          "--cluster",
+          "test-cluster",
+          "--tpu-type",
+          "tpu7x-2",
+      ],
+  )
+
+  assert args.num_slices == 1
+
+
+def test_cluster_create_num_slices_has_no_default_if_superslicing_feature():
+  FeatureFlags.SUPER_SLICING_ENABLED = True
+  parser = argparse.ArgumentParser()
+
+  set_cluster_create_parser(parser)
+  args = parser.parse_args(
+      [
+          "--cluster",
+          "test-cluster",
+          "--tpu-type",
+          "tpu7x-2",
+      ],
+  )
+
+  assert args.num_slices is None
xpk/parser/common.py CHANGED
@@ -369,3 +369,35 @@ def add_tpu_and_device_type_arguments(
 ) -> None:
   add_tpu_type_argument(custom_parser_or_group)
   add_device_type_argument(custom_parser_or_group)
+
+
+def extract_command_path(parser: argparse.ArgumentParser, args):
+  """
+  Reconstructs the command path (e.g. 'cluster create').
+  """
+
+  def _get_path_segments(current_parser):
+    subparser_action = next(
+        (
+            action
+            for action in current_parser._actions  # pylint: disable=protected-access
+            if isinstance(action, argparse._SubParsersAction)  # pylint: disable=protected-access
+        ),
+        None,
+    )
+
+    if subparser_action is None:
+      return []
+
+    chosen_command = getattr(args, subparser_action.dest, None)
+
+    if chosen_command is None:
+      return []
+
+    if chosen_command in subparser_action.choices:
+      next_parser = subparser_action.choices[chosen_command]
+      return [chosen_command] + _get_path_segments(next_parser)
+
+    return [chosen_command]
+
+  return ' '.join(_get_path_segments(parser))
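extract_command_path leans on two argparse behaviours: the chosen subcommand is stored on the namespace under the subparser action's dest, and nested subparsers can be walked through _SubParsersAction.choices. A self-contained toy parser (not xpk's real CLI) showing the attributes it reads:

# Toy parser demonstrating the argparse attributes extract_command_path walks;
# dest names are explicit here for clarity, xpk discovers them generically.
import argparse

parser = argparse.ArgumentParser(prog="toy")
top = parser.add_subparsers(dest="command")
cluster = top.add_parser("cluster")
cluster_sub = cluster.add_subparsers(dest="cluster_command")
cluster_sub.add_parser("create")

args = parser.parse_args(["cluster", "create"])
assert args.command == "cluster"
assert args.cluster_command == "create"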
xpk/parser/common_test.py ADDED
@@ -0,0 +1,49 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import argparse
+from .common import extract_command_path
+from .core import set_parser
+
+
+def test_extract_zero_level_nested_command():
+  parser = argparse.ArgumentParser()
+  set_parser(parser=parser)
+  args = parser.parse_args([])
+  assert extract_command_path(parser, args) == ""
+
+
+def test_extract_one_level_nested_command():
+  parser = argparse.ArgumentParser()
+  set_parser(parser=parser)
+  args = parser.parse_args(["version"])
+  assert extract_command_path(parser, args) == "version"
+
+
+def test_extract_two_level_nested_command():
+  parser = argparse.ArgumentParser()
+  set_parser(parser=parser)
+  args = parser.parse_args(["cluster", "list"])
+  assert extract_command_path(parser, args) == "cluster list"
+
+
+def test_extract_two_level_nested_command_with_flags():
+  parser = argparse.ArgumentParser()
+  set_parser(parser=parser)
+  args = parser.parse_args(
+      ["cluster", "list", "--project=abc", "--zone=us-central1-a"]
+  )
+  assert extract_command_path(parser, args) == "cluster list"
xpk/templates/kueue_config.yaml.j2 CHANGED
@@ -1,15 +1,16 @@
-{% for flavor in flavors %}
+{%- for flavor in flavors %}
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ResourceFlavor
 metadata:
   name: "{{ flavor.name }}"
 spec:
   nodeLabels: {{ flavor.nodeLabels | tojson }}
-{% if flavor.topologyLabel %}
+{%- if flavor.topologyLabel %}
     {{ flavor.topologyLabel }}
-{% endif %}
+{%- endif %}
 ---
-{% endfor %}
+{%- endfor %}
+{%- if 'dws-prov' in admission_checks %}
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: AdmissionCheck
 metadata:
@@ -21,6 +22,16 @@ spec:
     kind: ProvisioningRequestConfig
     name: dws-config
 ---
+{%- endif %}
+{%- if 'ss-kueue-operator' in admission_checks %}
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: AdmissionCheck
+metadata:
+  name: ss-kueue-operator
+spec:
+  controllerName: accelerator.gke.io/slice
+---
+{%- endif %}
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ProvisioningRequestConfig
 metadata:
@@ -44,7 +55,12 @@ spec:
   withinClusterQueue: LowerPriority
   namespaceSelector: {} # match all.
   resourceGroups: {{ resource_groups }}
-  {{ admission_checks | indent(2) }}
+{%- if admission_checks %}
+  admissionChecks:
+  {%- for check in admission_checks %}
+  - {{ check }}
+  {%- endfor %}
+{%- endif %}
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
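The ClusterQueue now builds its admissionChecks list from the admission_checks variable instead of splicing in a pre-indented string. A standalone jinja2 sketch of how such a loop renders; the fragment is simplified and the variable values are illustrative only, not the full xpk template.

# Standalone jinja2 sketch of the admission-check loop introduced above; the
# fragment is simplified and the values are illustrative only.
import jinja2

fragment = """\
spec:
{%- if admission_checks %}
  admissionChecks:
{%- for check in admission_checks %}
  - {{ check }}
{%- endfor %}
{%- endif %}
"""

rendered = jinja2.Template(fragment).render(
    admission_checks=["dws-prov", "ss-kueue-operator"]
)
print(rendered)
# spec:
#   admissionChecks:
#   - dws-prov
#   - ss-kueue-operator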
xpk/templates/kueue_super_slicing_topology.yaml.j2 ADDED
@@ -0,0 +1,9 @@
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: Topology
+metadata:
+  name: {{ super_slice_topology_name }}
+spec:
+  levels:
+  - nodeLabel: cloud.google.com/gce-topology-block
+  - nodeLabel: cloud.google.com/gke-tpu-partition-4x4x4-id
+  - nodeLabel: kubernetes.io/hostname
xpk/utils/kueue.py CHANGED
@@ -14,7 +14,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+from ..core.system_characteristics import AcceleratorType

-def is_queued_cluster(num_slices: int) -> bool:
+
+def is_queued_cluster(
+    num_slices: int, accelerator_type: AcceleratorType
+) -> bool:
   """Determines if admission checks should be enabled and cluster queued."""
-  return num_slices <= 1
+  return num_slices <= 1 and accelerator_type == AcceleratorType.GPU
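With this change a cluster is only treated as queued when it is both single-slice and GPU-based; single-slice TPU clusters no longer enable admission checks via this predicate. Example calls against the new signature, assuming the package is installed so these imports resolve:

# Illustrative calls against the updated predicate shown in the hunk above.
from xpk.core.system_characteristics import AcceleratorType
from xpk.utils.kueue import is_queued_cluster

assert is_queued_cluster(1, AcceleratorType.GPU) is True
assert is_queued_cluster(1, AcceleratorType.TPU) is False
assert is_queued_cluster(2, AcceleratorType.GPU) is False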
{xpk-0.16.1.dist-info → xpk-0.17.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 0.16.1
+Version: 0.17.1
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
@@ -25,6 +25,7 @@ Requires-Dist: packaging==24.2
 Requires-Dist: google-cloud-filestore==1.12.0
 Requires-Dist: google-cloud-storage
 Requires-Dist: Jinja2==3.1.6
+Requires-Dist: urllib3<2.6.0
 Provides-Extra: dev
 Requires-Dist: pyink==24.3.0; extra == "dev"
 Requires-Dist: pylint>=2.6.0; extra == "dev"
  Requires-Dist: pylint>=2.6.0; extra == "dev"