xpk 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +48 -5
- xpk/commands/cluster_gcluster.py +3 -0
- xpk/commands/cluster_gcluster_test.py +2 -0
- xpk/commands/cluster_test.py +203 -0
- xpk/commands/common.py +6 -0
- xpk/commands/kind.py +2 -0
- xpk/commands/workload.py +35 -15
- xpk/commands/workload_test.py +1 -0
- xpk/core/capacity.py +83 -46
- xpk/core/capacity_test.py +82 -28
- xpk/core/commands.py +39 -12
- xpk/core/kueue_manager.py +42 -11
- xpk/core/kueue_manager_test.py +83 -3
- xpk/core/nap.py +5 -4
- xpk/core/nodepool.py +57 -20
- xpk/core/nodepool_test.py +152 -23
- xpk/core/pathways.py +2 -1
- xpk/core/resources.py +3 -3
- xpk/core/scheduling.py +54 -10
- xpk/core/scheduling_test.py +118 -13
- xpk/core/system_characteristics.py +41 -24
- xpk/core/system_characteristics_test.py +37 -4
- xpk/core/telemetry.py +5 -0
- xpk/core/telemetry_test.py +19 -2
- xpk/core/updates.py +1 -1
- xpk/main.py +2 -1
- xpk/parser/cluster.py +34 -2
- xpk/parser/cluster_test.py +117 -0
- xpk/parser/common.py +32 -0
- xpk/parser/common_test.py +49 -0
- xpk/templates/kueue_config.yaml.j2 +21 -5
- xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
- xpk/utils/kueue.py +6 -2
- {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/METADATA +2 -1
- {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/RECORD +39 -37
- {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/WHEEL +0 -0
- {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/top_level.txt +0 -0
|
@@ -32,7 +32,6 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
32
32
|
gke_accelerator="test",
|
|
33
33
|
machine_type="test",
|
|
34
34
|
supported_topologies=["1x1"],
|
|
35
|
-
supports_sub_slicing=False,
|
|
36
35
|
docker_platform=DockerPlatform.AMD,
|
|
37
36
|
tpu_type_requires_workload_policy=False,
|
|
38
37
|
)
|
|
@@ -46,6 +45,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
46
45
|
accelerator_type=AcceleratorType.TPU,
|
|
47
46
|
device_type="test-1",
|
|
48
47
|
supports_sub_slicing=False,
|
|
48
|
+
supports_super_slicing=False,
|
|
49
49
|
docker_platform=DockerPlatform.AMD,
|
|
50
50
|
requires_workload_policy=False,
|
|
51
51
|
)
|
|
@@ -62,7 +62,6 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
62
62
|
gke_accelerator="test",
|
|
63
63
|
machine_type="test",
|
|
64
64
|
supported_topologies=["2x2"],
|
|
65
|
-
supports_sub_slicing=False,
|
|
66
65
|
docker_platform=DockerPlatform.AMD,
|
|
67
66
|
tpu_type_requires_workload_policy=True,
|
|
68
67
|
)
|
|
@@ -76,6 +75,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
76
75
|
accelerator_type=AcceleratorType.TPU,
|
|
77
76
|
device_type="test-8",
|
|
78
77
|
supports_sub_slicing=False,
|
|
78
|
+
supports_super_slicing=False,
|
|
79
79
|
docker_platform=DockerPlatform.AMD,
|
|
80
80
|
requires_workload_policy=False,
|
|
81
81
|
)
|
|
@@ -92,7 +92,6 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
|
|
|
92
92
|
gke_accelerator="test",
|
|
93
93
|
machine_type="test",
|
|
94
94
|
supported_topologies=["2x2x2"],
|
|
95
|
-
supports_sub_slicing=False,
|
|
96
95
|
docker_platform=DockerPlatform.AMD,
|
|
97
96
|
tpu_type_requires_workload_policy=True,
|
|
98
97
|
)
|
|
@@ -106,6 +105,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
|
|
|
106
105
|
accelerator_type=AcceleratorType.TPU,
|
|
107
106
|
device_type="test-16",
|
|
108
107
|
supports_sub_slicing=False,
|
|
108
|
+
supports_super_slicing=False,
|
|
109
109
|
docker_platform=DockerPlatform.AMD,
|
|
110
110
|
requires_workload_policy=True,
|
|
111
111
|
)
|
|
@@ -115,6 +115,38 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
|
|
|
115
115
|
}
|
|
116
116
|
|
|
117
117
|
|
|
118
|
+
def test_get_tpu_system_characteristics_map_sets_sub_slicing_support():
|
|
119
|
+
result = get_tpu_system_characteristics_map(
|
|
120
|
+
prefix="test",
|
|
121
|
+
tensorcores_per_chip=2,
|
|
122
|
+
gke_accelerator="test",
|
|
123
|
+
machine_type="test",
|
|
124
|
+
supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
|
|
125
|
+
docker_platform=DockerPlatform.AMD,
|
|
126
|
+
sub_slicing_topologies=set(["4x4x8", "4x4x16"]),
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
assert result["test-4x4x4"].supports_sub_slicing is False
|
|
130
|
+
assert result["test-4x4x8"].supports_sub_slicing is True
|
|
131
|
+
assert result["test-4x4x16"].supports_sub_slicing is True
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_get_tpu_system_characteristics_map_sets_super_slicing_support():
|
|
135
|
+
result = get_tpu_system_characteristics_map(
|
|
136
|
+
prefix="test",
|
|
137
|
+
tensorcores_per_chip=2,
|
|
138
|
+
gke_accelerator="test",
|
|
139
|
+
machine_type="test",
|
|
140
|
+
supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
|
|
141
|
+
docker_platform=DockerPlatform.AMD,
|
|
142
|
+
super_slicing_topologies=set(["4x4x8", "4x4x16"]),
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
assert result["test-4x4x4"].supports_super_slicing is False
|
|
146
|
+
assert result["test-4x4x8"].supports_super_slicing is True
|
|
147
|
+
assert result["test-4x4x16"].supports_super_slicing is True
|
|
148
|
+
|
|
149
|
+
|
|
118
150
|
def test_get_tpu_system_characteristics_map_prefers_default_topologies():
|
|
119
151
|
result = get_tpu_system_characteristics_map(
|
|
120
152
|
prefix="test",
|
|
@@ -122,7 +154,6 @@ def test_get_tpu_system_characteristics_map_prefers_default_topologies():
|
|
|
122
154
|
gke_accelerator="test",
|
|
123
155
|
machine_type="test",
|
|
124
156
|
supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
|
|
125
|
-
supports_sub_slicing=False,
|
|
126
157
|
docker_platform=DockerPlatform.AMD,
|
|
127
158
|
default_topologies=set(["4x8x16"]),
|
|
128
159
|
)
|
|
@@ -174,6 +205,7 @@ def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
|
|
|
174
205
|
accelerator_type=AcceleratorType.GPU,
|
|
175
206
|
device_type="l4-1",
|
|
176
207
|
supports_sub_slicing=False,
|
|
208
|
+
supports_super_slicing=False,
|
|
177
209
|
docker_platform=DockerPlatform.AMD,
|
|
178
210
|
gpu_config=GpuConfig(requires_topology=False),
|
|
179
211
|
)
|
|
@@ -192,5 +224,6 @@ def test_system_characteristics_post_init_throws_for_gpu_without_config():
|
|
|
192
224
|
accelerator_type=AcceleratorType.GPU,
|
|
193
225
|
device_type="l4-1",
|
|
194
226
|
supports_sub_slicing=False,
|
|
227
|
+
supports_super_slicing=False,
|
|
195
228
|
docker_platform=DockerPlatform.AMD,
|
|
196
229
|
)
|
xpk/core/telemetry.py
CHANGED
|
@@ -124,6 +124,7 @@ class MetricsEventMetadataKey(Enum):
|
|
|
124
124
|
EXIT_CODE = "XPK_EXIT_CODE"
|
|
125
125
|
RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
|
|
126
126
|
RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
|
|
127
|
+
LATENCY_SECONDS = "XPK_LATENCY_SECONDS"
|
|
127
128
|
|
|
128
129
|
|
|
129
130
|
@dataclass
|
|
@@ -190,10 +191,14 @@ def _generate_payload(events: list[_MetricsEvent]) -> str:
|
|
|
190
191
|
base_concord_event = _get_base_concord_event()
|
|
191
192
|
base_event_metadata = _get_base_event_metadata()
|
|
192
193
|
serialized_events = []
|
|
194
|
+
first_time = events[0].time if len(events) > 0 else 0
|
|
193
195
|
for event in events:
|
|
194
196
|
metadata = {
|
|
195
197
|
**base_event_metadata,
|
|
196
198
|
**event.metadata,
|
|
199
|
+
MetricsEventMetadataKey.LATENCY_SECONDS: str(
|
|
200
|
+
int(event.time - first_time)
|
|
201
|
+
),
|
|
197
202
|
}
|
|
198
203
|
serialized_events.append({
|
|
199
204
|
"event_time_ms": int(event.time * 1000),
|
xpk/core/telemetry_test.py
CHANGED
|
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import itertools
|
|
17
18
|
import pytest
|
|
18
19
|
import json
|
|
19
20
|
from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
|
|
@@ -26,7 +27,7 @@ from pytest_mock import MockerFixture
|
|
|
26
27
|
@pytest.fixture(autouse=True)
|
|
27
28
|
def setup_mocks(mocker: MockerFixture):
|
|
28
29
|
mocker.patch('xpk.core.telemetry._get_session_id', return_value='321231')
|
|
29
|
-
mocker.patch('time.time',
|
|
30
|
+
mocker.patch('time.time', side_effect=itertools.count())
|
|
30
31
|
mocker.patch('platform.python_version', return_value='99.99.99')
|
|
31
32
|
mocker.patch('os.path.basename', return_value='xpk.py')
|
|
32
33
|
mocker.patch('os.path.abspath', return_value='/home/xpk_user')
|
|
@@ -76,6 +77,7 @@ def test_metrics_collector_logs_start_event_correctly():
|
|
|
76
77
|
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
77
78
|
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
78
79
|
{'key': 'XPK_COMMAND', 'value': 'test'},
|
|
80
|
+
{'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
|
|
79
81
|
],
|
|
80
82
|
'event_name': 'start',
|
|
81
83
|
'event_type': 'commands',
|
|
@@ -106,6 +108,7 @@ def test_metrics_collector_logs_complete_event_correctly():
|
|
|
106
108
|
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
107
109
|
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
108
110
|
{'key': 'XPK_EXIT_CODE', 'value': '2'},
|
|
111
|
+
{'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
|
|
109
112
|
],
|
|
110
113
|
'event_name': 'complete',
|
|
111
114
|
'event_type': 'commands',
|
|
@@ -129,6 +132,7 @@ def test_metrics_collector_logs_custom_event_correctly():
|
|
|
129
132
|
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
130
133
|
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
131
134
|
{'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
|
|
135
|
+
{'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
|
|
132
136
|
],
|
|
133
137
|
'event_name': 'test',
|
|
134
138
|
'event_type': 'custom',
|
|
@@ -136,6 +140,19 @@ def test_metrics_collector_logs_custom_event_correctly():
|
|
|
136
140
|
}
|
|
137
141
|
|
|
138
142
|
|
|
143
|
+
def test_metrics_collector_computest_latency_correctly():
|
|
144
|
+
MetricsCollector.log_start(command='test')
|
|
145
|
+
MetricsCollector.log_complete(exit_code=0)
|
|
146
|
+
payload = json.loads(MetricsCollector.flush())
|
|
147
|
+
extension_json = json.loads(payload['log_event'][1]['source_extension_json'])
|
|
148
|
+
latency = (
|
|
149
|
+
el['value']
|
|
150
|
+
for el in extension_json['event_metadata']
|
|
151
|
+
if el['key'] == 'XPK_LATENCY_SECONDS'
|
|
152
|
+
)
|
|
153
|
+
assert next(latency, None) == '1'
|
|
154
|
+
|
|
155
|
+
|
|
139
156
|
def test_metrics_collector_logs_correct_envelope():
|
|
140
157
|
MetricsCollector.log_start(command='test')
|
|
141
158
|
MetricsCollector.log_custom(
|
|
@@ -145,7 +162,7 @@ def test_metrics_collector_logs_correct_envelope():
|
|
|
145
162
|
payload = json.loads(MetricsCollector.flush())
|
|
146
163
|
assert payload['client_info'] == {'client_type': 'XPK'}
|
|
147
164
|
assert payload['log_source_name'] == 'CONCORD'
|
|
148
|
-
assert payload['request_time_ms'] ==
|
|
165
|
+
assert payload['request_time_ms'] == 3000
|
|
149
166
|
assert len(payload['log_event']) == 3
|
|
150
167
|
|
|
151
168
|
|
xpk/core/updates.py
CHANGED
|
@@ -28,7 +28,7 @@ def get_latest_xpk_version() -> tuple[int, Version | None]:
|
|
|
28
28
|
return 0, Version(__version__)
|
|
29
29
|
|
|
30
30
|
return_code, result = run_command_for_value(
|
|
31
|
-
command="pip index versions xpk --json",
|
|
31
|
+
command="pip index versions xpk --json --no-input",
|
|
32
32
|
task="Retrieve latest XPK version",
|
|
33
33
|
quiet=True,
|
|
34
34
|
)
|
xpk/main.py
CHANGED
|
@@ -36,6 +36,7 @@ import argcomplete
|
|
|
36
36
|
import sys
|
|
37
37
|
|
|
38
38
|
from .parser.core import set_parser
|
|
39
|
+
from .parser.common import extract_command_path
|
|
39
40
|
from .core.updates import print_xpk_hello
|
|
40
41
|
from .core.config import set_config, FileSystemConfig
|
|
41
42
|
from .core.telemetry import MetricsCollector, send_clearcut_payload, should_send_telemetry
|
|
@@ -78,7 +79,7 @@ def main() -> None:
|
|
|
78
79
|
or ('force' in main_args and main_args.force)
|
|
79
80
|
),
|
|
80
81
|
)
|
|
81
|
-
MetricsCollector.log_start(main_args
|
|
82
|
+
MetricsCollector.log_start(command=extract_command_path(parser, main_args))
|
|
82
83
|
print_xpk_hello()
|
|
83
84
|
main_args.func(main_args)
|
|
84
85
|
xpk_print('XPK Done.', flush=True)
|
xpk/parser/cluster.py
CHANGED
|
@@ -132,6 +132,10 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
132
132
|
|
|
133
133
|
if FeatureFlags.SUB_SLICING_ENABLED:
|
|
134
134
|
add_cluster_create_sub_slicing_arguments(cluster_create_optional_arguments)
|
|
135
|
+
if FeatureFlags.SUPER_SLICING_ENABLED:
|
|
136
|
+
add_cluster_create_super_slicing_arguments(
|
|
137
|
+
cluster_create_optional_arguments
|
|
138
|
+
)
|
|
135
139
|
|
|
136
140
|
autoprovisioning_arguments = cluster_create_parser.add_argument_group(
|
|
137
141
|
'Autoprovisioning Arguments',
|
|
@@ -205,6 +209,10 @@ def set_cluster_create_pathways_parser(
|
|
|
205
209
|
add_cluster_create_sub_slicing_arguments(
|
|
206
210
|
cluster_create_pathways_optional_arguments
|
|
207
211
|
)
|
|
212
|
+
if FeatureFlags.SUPER_SLICING_ENABLED:
|
|
213
|
+
add_cluster_create_super_slicing_arguments(
|
|
214
|
+
cluster_create_pathways_optional_arguments
|
|
215
|
+
)
|
|
208
216
|
|
|
209
217
|
autoprovisioning_arguments = (
|
|
210
218
|
cluster_create_pathways_parser.add_argument_group(
|
|
@@ -330,7 +338,7 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
|
330
338
|
add_resource_limits(cluster_create_resource_limits)
|
|
331
339
|
|
|
332
340
|
cluster_create_ray_parser.set_defaults(
|
|
333
|
-
func=cluster_create_ray_cluster, sub_slicing=False
|
|
341
|
+
func=cluster_create_ray_cluster, sub_slicing=False, super_slicing=False
|
|
334
342
|
)
|
|
335
343
|
|
|
336
344
|
|
|
@@ -596,7 +604,10 @@ def add_shared_cluster_create_optional_arguments(
|
|
|
596
604
|
parser_or_group.add_argument(
|
|
597
605
|
'--num-slices',
|
|
598
606
|
type=int,
|
|
599
|
-
default=1,
|
|
607
|
+
# removing default in case of super slicing because
|
|
608
|
+
# --num-slices must be equal to --num-cubes if both are set
|
|
609
|
+
# it will default to 1 during validation
|
|
610
|
+
default=1 if not FeatureFlags.SUPER_SLICING_ENABLED else None,
|
|
600
611
|
help='The number of slices to run the job on, defaults to 1.',
|
|
601
612
|
required=False,
|
|
602
613
|
)
|
|
@@ -910,3 +921,24 @@ def add_cluster_create_sub_slicing_arguments(
|
|
|
910
921
|
action='store_true',
|
|
911
922
|
help='Whether to set up cluster to support sub-slicing',
|
|
912
923
|
)
|
|
924
|
+
|
|
925
|
+
|
|
926
|
+
def add_cluster_create_super_slicing_arguments(
|
|
927
|
+
parser_or_group: ParserOrArgumentGroup,
|
|
928
|
+
):
|
|
929
|
+
parser_or_group.add_argument(
|
|
930
|
+
'--super-slicing',
|
|
931
|
+
action='store_true',
|
|
932
|
+
help='Whether to set up cluster to support super-slicing',
|
|
933
|
+
)
|
|
934
|
+
parser_or_group.add_argument(
|
|
935
|
+
'--num-cubes',
|
|
936
|
+
type=int,
|
|
937
|
+
# default value is set during validation because it needs to be compared
|
|
938
|
+
# against --num-slices
|
|
939
|
+
help=(
|
|
940
|
+
'Total number of cubes to create within a cluster, defaults to 1. Can'
|
|
941
|
+
' only be used with --super-slicing.'
|
|
942
|
+
),
|
|
943
|
+
required=False,
|
|
944
|
+
)
|
xpk/parser/cluster_test.py
CHANGED
|
@@ -144,3 +144,120 @@ def test_cluster_create_enable_lustre_legacy_port_can_be_set():
|
|
|
144
144
|
])
|
|
145
145
|
|
|
146
146
|
assert args.enable_legacy_lustre_port is True
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def test_cluster_create_super_slicing_is_hidden_with_flag_off():
|
|
150
|
+
FeatureFlags.SUPER_SLICING_ENABLED = False
|
|
151
|
+
parser = argparse.ArgumentParser()
|
|
152
|
+
|
|
153
|
+
set_cluster_create_parser(parser)
|
|
154
|
+
help_str = parser.format_help()
|
|
155
|
+
|
|
156
|
+
assert "--super-slicing" not in help_str
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def test_cluster_create_super_slicing_is_shown_with_flag_on():
|
|
160
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
161
|
+
parser = argparse.ArgumentParser()
|
|
162
|
+
|
|
163
|
+
set_cluster_create_parser(parser)
|
|
164
|
+
help_str = parser.format_help()
|
|
165
|
+
|
|
166
|
+
assert "--super-slicing" in help_str
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def test_cluster_create_super_slicing_is_false_by_default():
|
|
170
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
171
|
+
parser = argparse.ArgumentParser()
|
|
172
|
+
|
|
173
|
+
set_cluster_create_parser(parser)
|
|
174
|
+
args = parser.parse_args(
|
|
175
|
+
["--cluster", "test-cluster", "--tpu-type", "tpu7x-2"]
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
assert args.super_slicing is False
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_cluster_create_super_slicing_can_be_set():
|
|
182
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
183
|
+
parser = argparse.ArgumentParser()
|
|
184
|
+
|
|
185
|
+
set_cluster_create_parser(parser)
|
|
186
|
+
args = parser.parse_args(
|
|
187
|
+
["--cluster", "test-cluster", "--tpu-type", "tpu7x-2", "--super-slicing"],
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
assert args.super_slicing is True
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def test_cluster_create_num_cubes_is_hidden_with_flag_off():
|
|
194
|
+
FeatureFlags.SUPER_SLICING_ENABLED = False
|
|
195
|
+
parser = argparse.ArgumentParser()
|
|
196
|
+
|
|
197
|
+
set_cluster_create_parser(parser)
|
|
198
|
+
help_str = parser.format_help()
|
|
199
|
+
|
|
200
|
+
assert "--num-cubes" not in help_str
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def test_cluster_create_num_cubes_is_shown_with_flag_on():
|
|
204
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
205
|
+
parser = argparse.ArgumentParser()
|
|
206
|
+
|
|
207
|
+
set_cluster_create_parser(parser)
|
|
208
|
+
help_str = parser.format_help()
|
|
209
|
+
|
|
210
|
+
assert "--num-cubes" in help_str
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def test_cluster_create_num_cubes_can_be_set():
|
|
214
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
215
|
+
parser = argparse.ArgumentParser()
|
|
216
|
+
|
|
217
|
+
set_cluster_create_parser(parser)
|
|
218
|
+
args = parser.parse_args(
|
|
219
|
+
[
|
|
220
|
+
"--cluster",
|
|
221
|
+
"test-cluster",
|
|
222
|
+
"--tpu-type",
|
|
223
|
+
"tpu7x-2",
|
|
224
|
+
"--num-cubes",
|
|
225
|
+
"5",
|
|
226
|
+
],
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
assert args.num_cubes == 5
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def test_cluster_create_num_slices_defaults_to_1_if_no_superslicing_feature():
|
|
233
|
+
FeatureFlags.SUPER_SLICING_ENABLED = False
|
|
234
|
+
parser = argparse.ArgumentParser()
|
|
235
|
+
|
|
236
|
+
set_cluster_create_parser(parser)
|
|
237
|
+
args = parser.parse_args(
|
|
238
|
+
[
|
|
239
|
+
"--cluster",
|
|
240
|
+
"test-cluster",
|
|
241
|
+
"--tpu-type",
|
|
242
|
+
"tpu7x-2",
|
|
243
|
+
],
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
assert args.num_slices == 1
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def test_cluster_create_num_slices_has_no_default_if_superslicing_feature():
|
|
250
|
+
FeatureFlags.SUPER_SLICING_ENABLED = True
|
|
251
|
+
parser = argparse.ArgumentParser()
|
|
252
|
+
|
|
253
|
+
set_cluster_create_parser(parser)
|
|
254
|
+
args = parser.parse_args(
|
|
255
|
+
[
|
|
256
|
+
"--cluster",
|
|
257
|
+
"test-cluster",
|
|
258
|
+
"--tpu-type",
|
|
259
|
+
"tpu7x-2",
|
|
260
|
+
],
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
assert args.num_slices is None
|
xpk/parser/common.py
CHANGED
|
@@ -369,3 +369,35 @@ def add_tpu_and_device_type_arguments(
|
|
|
369
369
|
) -> None:
|
|
370
370
|
add_tpu_type_argument(custom_parser_or_group)
|
|
371
371
|
add_device_type_argument(custom_parser_or_group)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def extract_command_path(parser: argparse.ArgumentParser, args):
|
|
375
|
+
"""
|
|
376
|
+
Reconstructs the command path (e.g. 'cluster create').
|
|
377
|
+
"""
|
|
378
|
+
|
|
379
|
+
def _get_path_segments(current_parser):
|
|
380
|
+
subparser_action = next(
|
|
381
|
+
(
|
|
382
|
+
action
|
|
383
|
+
for action in current_parser._actions # pylint: disable=protected-access
|
|
384
|
+
if isinstance(action, argparse._SubParsersAction) # pylint: disable=protected-access
|
|
385
|
+
),
|
|
386
|
+
None,
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
if subparser_action is None:
|
|
390
|
+
return []
|
|
391
|
+
|
|
392
|
+
chosen_command = getattr(args, subparser_action.dest, None)
|
|
393
|
+
|
|
394
|
+
if chosen_command is None:
|
|
395
|
+
return []
|
|
396
|
+
|
|
397
|
+
if chosen_command in subparser_action.choices:
|
|
398
|
+
next_parser = subparser_action.choices[chosen_command]
|
|
399
|
+
return [chosen_command] + _get_path_segments(next_parser)
|
|
400
|
+
|
|
401
|
+
return [chosen_command]
|
|
402
|
+
|
|
403
|
+
return ' '.join(_get_path_segments(parser))
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
from .common import extract_command_path
|
|
19
|
+
from .core import set_parser
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_extract_zero_level_nested_command():
|
|
23
|
+
parser = argparse.ArgumentParser()
|
|
24
|
+
set_parser(parser=parser)
|
|
25
|
+
args = parser.parse_args([])
|
|
26
|
+
assert extract_command_path(parser, args) == ""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_extract_one_level_nested_command():
|
|
30
|
+
parser = argparse.ArgumentParser()
|
|
31
|
+
set_parser(parser=parser)
|
|
32
|
+
args = parser.parse_args(["version"])
|
|
33
|
+
assert extract_command_path(parser, args) == "version"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_extract_two_level_nested_command():
|
|
37
|
+
parser = argparse.ArgumentParser()
|
|
38
|
+
set_parser(parser=parser)
|
|
39
|
+
args = parser.parse_args(["cluster", "list"])
|
|
40
|
+
assert extract_command_path(parser, args) == "cluster list"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_extract_two_level_nested_command_with_flags():
|
|
44
|
+
parser = argparse.ArgumentParser()
|
|
45
|
+
set_parser(parser=parser)
|
|
46
|
+
args = parser.parse_args(
|
|
47
|
+
["cluster", "list", "--project=abc", "--zone=us-central1-a"]
|
|
48
|
+
)
|
|
49
|
+
assert extract_command_path(parser, args) == "cluster list"
|
|
@@ -1,15 +1,16 @@
|
|
|
1
|
-
{
|
|
1
|
+
{%- for flavor in flavors %}
|
|
2
2
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
3
3
|
kind: ResourceFlavor
|
|
4
4
|
metadata:
|
|
5
5
|
name: "{{ flavor.name }}"
|
|
6
6
|
spec:
|
|
7
7
|
nodeLabels: {{ flavor.nodeLabels | tojson }}
|
|
8
|
-
{
|
|
8
|
+
{%- if flavor.topologyLabel %}
|
|
9
9
|
{{ flavor.topologyLabel }}
|
|
10
|
-
{
|
|
10
|
+
{%- endif %}
|
|
11
11
|
---
|
|
12
|
-
{
|
|
12
|
+
{%- endfor %}
|
|
13
|
+
{%- if 'dws-prov' in admission_checks %}
|
|
13
14
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
14
15
|
kind: AdmissionCheck
|
|
15
16
|
metadata:
|
|
@@ -21,6 +22,16 @@ spec:
|
|
|
21
22
|
kind: ProvisioningRequestConfig
|
|
22
23
|
name: dws-config
|
|
23
24
|
---
|
|
25
|
+
{%- endif %}
|
|
26
|
+
{%- if 'ss-kueue-operator' in admission_checks %}
|
|
27
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
28
|
+
kind: AdmissionCheck
|
|
29
|
+
metadata:
|
|
30
|
+
name: ss-kueue-operator
|
|
31
|
+
spec:
|
|
32
|
+
controllerName: accelerator.gke.io/slice
|
|
33
|
+
---
|
|
34
|
+
{%- endif %}
|
|
24
35
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
25
36
|
kind: ProvisioningRequestConfig
|
|
26
37
|
metadata:
|
|
@@ -44,7 +55,12 @@ spec:
|
|
|
44
55
|
withinClusterQueue: LowerPriority
|
|
45
56
|
namespaceSelector: {} # match all.
|
|
46
57
|
resourceGroups: {{ resource_groups }}
|
|
47
|
-
{
|
|
58
|
+
{%- if admission_checks %}
|
|
59
|
+
admissionChecks:
|
|
60
|
+
{%- for check in admission_checks %}
|
|
61
|
+
- {{ check }}
|
|
62
|
+
{%- endfor %}
|
|
63
|
+
{%- endif %}
|
|
48
64
|
---
|
|
49
65
|
apiVersion: kueue.x-k8s.io/v1beta1
|
|
50
66
|
kind: LocalQueue
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
2
|
+
kind: Topology
|
|
3
|
+
metadata:
|
|
4
|
+
name: {{ super_slice_topology_name }}
|
|
5
|
+
spec:
|
|
6
|
+
levels:
|
|
7
|
+
- nodeLabel: cloud.google.com/gce-topology-block
|
|
8
|
+
- nodeLabel: cloud.google.com/gke-tpu-partition-4x4x4-id
|
|
9
|
+
- nodeLabel: kubernetes.io/hostname
|
xpk/utils/kueue.py
CHANGED
|
@@ -14,7 +14,11 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from ..core.system_characteristics import AcceleratorType
|
|
17
18
|
|
|
18
|
-
|
|
19
|
+
|
|
20
|
+
def is_queued_cluster(
|
|
21
|
+
num_slices: int, accelerator_type: AcceleratorType
|
|
22
|
+
) -> bool:
|
|
19
23
|
"""Determines if admission checks should be enabled and cluster queued."""
|
|
20
|
-
return num_slices <= 1
|
|
24
|
+
return num_slices <= 1 and accelerator_type == AcceleratorType.GPU
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 0.16.0
|
|
3
|
+
Version: 0.17.0
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -25,6 +25,7 @@ Requires-Dist: packaging==24.2
|
|
|
25
25
|
Requires-Dist: google-cloud-filestore==1.12.0
|
|
26
26
|
Requires-Dist: google-cloud-storage
|
|
27
27
|
Requires-Dist: Jinja2==3.1.6
|
|
28
|
+
Requires-Dist: urllib3<2.6.0
|
|
28
29
|
Provides-Extra: dev
|
|
29
30
|
Requires-Dist: pyink==24.3.0; extra == "dev"
|
|
30
31
|
Requires-Dist: pylint>=2.6.0; extra == "dev"
|