xpk 0.14.4__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/commands/cluster.py +57 -21
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +11 -2
- xpk/commands/cluster_test.py +233 -12
- xpk/commands/config.py +3 -5
- xpk/commands/kind.py +1 -1
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +28 -12
- xpk/commands/workload_test.py +3 -3
- xpk/core/blueprint/blueprint_generator.py +70 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/capacity.py +46 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +37 -57
- xpk/core/cluster_test.py +95 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +9 -2
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +6 -9
- xpk/core/kueue_manager.py +192 -32
- xpk/core/kueue_manager_test.py +132 -4
- xpk/core/nodepool.py +21 -29
- xpk/core/nodepool_test.py +17 -15
- xpk/core/scheduling.py +16 -1
- xpk/core/scheduling_test.py +85 -6
- xpk/core/system_characteristics.py +77 -19
- xpk/core/system_characteristics_test.py +80 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/main.py +31 -13
- xpk/parser/cluster.py +48 -9
- xpk/parser/cluster_test.py +42 -3
- xpk/parser/workload.py +12 -0
- xpk/parser/workload_test.py +4 -4
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +7 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +0 -11
- xpk/utils/versions.py +31 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/METADATA +113 -92
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/RECORD +58 -48
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pytest
|
|
18
|
+
import json
|
|
19
|
+
from .config import xpk_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
|
|
20
|
+
from .telemetry import MetricsCollector, MetricsEventMetadataKey, should_send_telemetry
|
|
21
|
+
from ..utils.execution_context import set_dry_run
|
|
22
|
+
from ..utils.feature_flags import FeatureFlags
|
|
23
|
+
from pytest_mock import MockerFixture
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture(autouse=True)
|
|
27
|
+
def setup_mocks(mocker: MockerFixture):
|
|
28
|
+
mocker.patch('xpk.core.telemetry._get_session_id', return_value='321231')
|
|
29
|
+
mocker.patch('time.time', return_value=0)
|
|
30
|
+
mocker.patch('platform.python_version', return_value='99.99.99')
|
|
31
|
+
mocker.patch('os.path.basename', return_value='xpk.py')
|
|
32
|
+
mocker.patch('os.path.abspath', return_value='/home/xpk_user')
|
|
33
|
+
set_dry_run(False)
|
|
34
|
+
xpk_config.set(CLIENT_ID_KEY, 'client_id')
|
|
35
|
+
yield
|
|
36
|
+
xpk_config.set(CLIENT_ID_KEY, None)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.mark.parametrize(
|
|
40
|
+
argnames='feature_flag,config_value,expected',
|
|
41
|
+
argvalues=[
|
|
42
|
+
(True, 'true', True),
|
|
43
|
+
(False, 'true', False),
|
|
44
|
+
(True, None, True),
|
|
45
|
+
(True, 'false', False),
|
|
46
|
+
],
|
|
47
|
+
)
|
|
48
|
+
def test_should_send_telemetry_returns_correct_value(
|
|
49
|
+
feature_flag: bool, config_value: str, expected: bool
|
|
50
|
+
):
|
|
51
|
+
xpk_config.set(SEND_TELEMETRY_KEY, config_value)
|
|
52
|
+
FeatureFlags.TELEMETRY_ENABLED = feature_flag
|
|
53
|
+
assert should_send_telemetry() is expected
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_metrics_collector_generates_client_id_if_not_present():
|
|
57
|
+
xpk_config.set(CLIENT_ID_KEY, None)
|
|
58
|
+
MetricsCollector.log_start(command='test')
|
|
59
|
+
payload = json.loads(MetricsCollector.flush())
|
|
60
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
61
|
+
assert extension_json['client_install_id'] is not None
|
|
62
|
+
assert len(extension_json['client_install_id']) > 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_metrics_collector_logs_start_event_correctly():
|
|
66
|
+
MetricsCollector.log_start(command='test')
|
|
67
|
+
payload = json.loads(MetricsCollector.flush())
|
|
68
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
69
|
+
assert extension_json == {
|
|
70
|
+
'client_install_id': 'client_id',
|
|
71
|
+
'console_type': 'XPK',
|
|
72
|
+
'event_metadata': [
|
|
73
|
+
{'key': 'XPK_SESSION_ID', 'value': '321231'},
|
|
74
|
+
{'key': 'XPK_DRY_RUN', 'value': 'false'},
|
|
75
|
+
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
76
|
+
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
77
|
+
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
78
|
+
{'key': 'XPK_COMMAND', 'value': 'test'},
|
|
79
|
+
],
|
|
80
|
+
'event_name': 'start',
|
|
81
|
+
'event_type': 'commands',
|
|
82
|
+
'release_version': 'v0.15.0',
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_metrics_collector_generates_client_id_when_not_present():
|
|
87
|
+
xpk_config.set(CLIENT_ID_KEY, None)
|
|
88
|
+
MetricsCollector.log_start(command='test')
|
|
89
|
+
payload = json.loads(MetricsCollector.flush())
|
|
90
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
91
|
+
assert extension_json['client_install_id'] is not None
|
|
92
|
+
assert len(extension_json['client_install_id']) > 0
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_metrics_collector_logs_complete_event_correctly():
|
|
96
|
+
MetricsCollector.log_complete(exit_code=2)
|
|
97
|
+
payload = json.loads(MetricsCollector.flush())
|
|
98
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
99
|
+
assert extension_json == {
|
|
100
|
+
'client_install_id': 'client_id',
|
|
101
|
+
'console_type': 'XPK',
|
|
102
|
+
'event_metadata': [
|
|
103
|
+
{'key': 'XPK_SESSION_ID', 'value': '321231'},
|
|
104
|
+
{'key': 'XPK_DRY_RUN', 'value': 'false'},
|
|
105
|
+
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
106
|
+
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
107
|
+
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
108
|
+
{'key': 'XPK_EXIT_CODE', 'value': '2'},
|
|
109
|
+
],
|
|
110
|
+
'event_name': 'complete',
|
|
111
|
+
'event_type': 'commands',
|
|
112
|
+
'release_version': 'v0.15.0',
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_metrics_collector_logs_custom_event_correctly():
|
|
117
|
+
MetricsCollector.log_custom(
|
|
118
|
+
name='test', metadata={MetricsEventMetadataKey.PROVISIONING_MODE: 'flex'}
|
|
119
|
+
)
|
|
120
|
+
payload = json.loads(MetricsCollector.flush())
|
|
121
|
+
extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
|
|
122
|
+
assert extension_json == {
|
|
123
|
+
'client_install_id': 'client_id',
|
|
124
|
+
'console_type': 'XPK',
|
|
125
|
+
'event_metadata': [
|
|
126
|
+
{'key': 'XPK_SESSION_ID', 'value': '321231'},
|
|
127
|
+
{'key': 'XPK_DRY_RUN', 'value': 'false'},
|
|
128
|
+
{'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
|
|
129
|
+
{'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
|
|
130
|
+
{'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
|
|
131
|
+
{'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
|
|
132
|
+
],
|
|
133
|
+
'event_name': 'test',
|
|
134
|
+
'event_type': 'custom',
|
|
135
|
+
'release_version': 'v0.15.0',
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def test_metrics_collector_logs_correct_envelope():
|
|
140
|
+
MetricsCollector.log_start(command='test')
|
|
141
|
+
MetricsCollector.log_custom(
|
|
142
|
+
name='test', metadata={MetricsEventMetadataKey.PROVISIONING_MODE: 'flex'}
|
|
143
|
+
)
|
|
144
|
+
MetricsCollector.log_complete(exit_code=2)
|
|
145
|
+
payload = json.loads(MetricsCollector.flush())
|
|
146
|
+
assert payload['client_info'] == {'client_type': 'XPK'}
|
|
147
|
+
assert payload['log_source_name'] == 'CONCORD'
|
|
148
|
+
assert payload['request_time_ms'] == 0
|
|
149
|
+
assert len(payload['log_event']) == 3
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_metrics_collector_does_not_flush_event_twice():
|
|
153
|
+
MetricsCollector.log_start(command='test')
|
|
154
|
+
MetricsCollector.flush()
|
|
155
|
+
MetricsCollector.log_start(command='version')
|
|
156
|
+
payload = json.loads(MetricsCollector.flush())
|
|
157
|
+
assert len(payload['log_event']) == 1
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@pytest.mark.parametrize(
|
|
161
|
+
argnames='dry_run,expected', argvalues=[(False, 'false'), (True, 'true')]
|
|
162
|
+
)
|
|
163
|
+
def test_metrics_collector_logs_correct_dry_run_value(
|
|
164
|
+
dry_run: bool, expected: str
|
|
165
|
+
):
|
|
166
|
+
set_dry_run(dry_run)
|
|
167
|
+
MetricsCollector.log_start(command='test')
|
|
168
|
+
payload = MetricsCollector.flush()
|
|
169
|
+
assert _get_metadata_value(payload, 'XPK_DRY_RUN') == expected
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@pytest.mark.parametrize(
|
|
173
|
+
argnames='basename,expected',
|
|
174
|
+
argvalues=[
|
|
175
|
+
('xpk', 'true'),
|
|
176
|
+
('xpk.py', 'false'),
|
|
177
|
+
],
|
|
178
|
+
)
|
|
179
|
+
def test_metrics_collectors_logs_correct_running_as_pip_value(
|
|
180
|
+
basename: str, expected: str, mocker: MockerFixture
|
|
181
|
+
):
|
|
182
|
+
mocker.patch('os.path.basename', return_value=basename)
|
|
183
|
+
MetricsCollector.log_start(command='test')
|
|
184
|
+
payload = MetricsCollector.flush()
|
|
185
|
+
assert _get_metadata_value(payload, 'XPK_RUNNING_AS_PIP') == expected
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
@pytest.mark.parametrize(
|
|
189
|
+
argnames='abspath,expected',
|
|
190
|
+
argvalues=[
|
|
191
|
+
('/site-packages/', 'false'),
|
|
192
|
+
('/dist-packages/', 'false'),
|
|
193
|
+
('/home/xpk_user', 'true'),
|
|
194
|
+
],
|
|
195
|
+
)
|
|
196
|
+
def test_metrics_collectors_logs_correct_running_from_source_value(
|
|
197
|
+
abspath: str, expected: str, mocker: MockerFixture
|
|
198
|
+
):
|
|
199
|
+
mocker.patch('os.path.abspath', return_value=abspath)
|
|
200
|
+
MetricsCollector.log_start(command='test')
|
|
201
|
+
payload = MetricsCollector.flush()
|
|
202
|
+
assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _get_metadata_value(payload_str: str, key: str) -> str | None:
|
|
206
|
+
payload = json.loads(payload_str)
|
|
207
|
+
metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
|
|
208
|
+
'event_metadata'
|
|
209
|
+
]
|
|
210
|
+
matching = (item['value'] for item in metadata if item['key'] == key)
|
|
211
|
+
return next(matching, None)
|
xpk/main.py
CHANGED
|
@@ -32,11 +32,13 @@ Next Steps:
|
|
|
32
32
|
"""
|
|
33
33
|
|
|
34
34
|
import argparse
|
|
35
|
+
import argcomplete
|
|
35
36
|
import sys
|
|
36
37
|
|
|
37
38
|
from .parser.core import set_parser
|
|
38
39
|
from .core.updates import print_xpk_hello
|
|
39
|
-
from .
|
|
40
|
+
from .core.telemetry import MetricsCollector, send_clearcut_payload, should_send_telemetry
|
|
41
|
+
from .utils.console import xpk_print, exit_code_to_int
|
|
40
42
|
from .utils.execution_context import set_context
|
|
41
43
|
################### Compatibility Check ###################
|
|
42
44
|
# Check that the user runs the below version or greater.
|
|
@@ -59,19 +61,35 @@ if (
|
|
|
59
61
|
|
|
60
62
|
|
|
61
63
|
def main() -> None:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
64
|
+
try:
|
|
65
|
+
# Create top level parser for xpk command.
|
|
66
|
+
parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
|
|
67
|
+
set_parser(parser=parser)
|
|
68
|
+
argcomplete.autocomplete(parser)
|
|
65
69
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
70
|
+
main_args = parser.parse_args()
|
|
71
|
+
main_args.enable_ray_cluster = False
|
|
72
|
+
set_context(
|
|
73
|
+
dry_run_value='dry_run' in main_args and main_args.dry_run,
|
|
74
|
+
quiet_value=(
|
|
75
|
+
('quiet' in main_args and main_args.quiet)
|
|
76
|
+
or ('force' in main_args and main_args.force)
|
|
77
|
+
),
|
|
78
|
+
)
|
|
79
|
+
MetricsCollector.log_start(main_args.xpk_subcommands)
|
|
80
|
+
print_xpk_hello()
|
|
81
|
+
main_args.func(main_args)
|
|
82
|
+
xpk_print('XPK Done.', flush=True)
|
|
83
|
+
MetricsCollector.log_complete(0)
|
|
84
|
+
except SystemExit as e:
|
|
85
|
+
MetricsCollector.log_complete(exit_code_to_int(e.code))
|
|
86
|
+
raise
|
|
87
|
+
except:
|
|
88
|
+
MetricsCollector.log_complete(-1)
|
|
89
|
+
raise
|
|
90
|
+
finally:
|
|
91
|
+
if should_send_telemetry():
|
|
92
|
+
send_clearcut_payload(MetricsCollector.flush())
|
|
75
93
|
|
|
76
94
|
|
|
77
95
|
if __name__ == '__main__':
|
xpk/parser/cluster.py
CHANGED
|
@@ -26,7 +26,8 @@ from ..commands.cluster import (
|
|
|
26
26
|
cluster_describe,
|
|
27
27
|
cluster_list,
|
|
28
28
|
)
|
|
29
|
-
from ..
|
|
29
|
+
from ..core.config import xpk_config
|
|
30
|
+
from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType
|
|
30
31
|
from ..core.config import CFG_BUCKET_KEY
|
|
31
32
|
from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
|
|
32
33
|
from .common import add_shared_arguments, ParserOrArgumentGroup
|
|
@@ -103,6 +104,10 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
103
104
|
type=str,
|
|
104
105
|
default=None,
|
|
105
106
|
help='The tpu type to use, v5litepod-16, etc.',
|
|
107
|
+
metavar='TPU_TYPE',
|
|
108
|
+
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
109
|
+
[AcceleratorType.TPU]
|
|
110
|
+
),
|
|
106
111
|
)
|
|
107
112
|
cluster_device_group.add_argument(
|
|
108
113
|
'--device-type',
|
|
@@ -112,6 +117,8 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
112
117
|
'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
|
|
113
118
|
' h100-80gb-8, n2-standard-32-4 etc.'
|
|
114
119
|
),
|
|
120
|
+
metavar='DEVICE_TYPE',
|
|
121
|
+
choices=get_system_characteristics_keys_by_accelerator_type(),
|
|
115
122
|
)
|
|
116
123
|
|
|
117
124
|
### Optional arguments specific to "cluster create"
|
|
@@ -124,7 +131,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
124
131
|
cluster_create_optional_arguments.add_argument(
|
|
125
132
|
'--cluster-state-gcs-bucket',
|
|
126
133
|
type=str,
|
|
127
|
-
default=
|
|
134
|
+
default=xpk_config.get(CFG_BUCKET_KEY),
|
|
128
135
|
help='The name of the bucket to store cluster state.',
|
|
129
136
|
required=False,
|
|
130
137
|
)
|
|
@@ -144,11 +151,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
|
|
|
144
151
|
),
|
|
145
152
|
)
|
|
146
153
|
if FeatureFlags.SUB_SLICING_ENABLED:
|
|
147
|
-
cluster_create_optional_arguments
|
|
148
|
-
'--sub-slicing',
|
|
149
|
-
action='store_true',
|
|
150
|
-
help='Whether to set up cluster to support sub-slicing',
|
|
151
|
-
)
|
|
154
|
+
add_cluster_create_sub_slicing_arguments(cluster_create_optional_arguments)
|
|
152
155
|
|
|
153
156
|
autoprovisioning_arguments = cluster_create_parser.add_argument_group(
|
|
154
157
|
'Autoprovisioning Arguments',
|
|
@@ -209,6 +212,10 @@ def set_cluster_create_pathways_parser(
|
|
|
209
212
|
type=str,
|
|
210
213
|
default=None,
|
|
211
214
|
help='The tpu type to use, v5litepod-16, etc.',
|
|
215
|
+
metavar='TPU_TYPE',
|
|
216
|
+
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
217
|
+
[AcceleratorType.TPU]
|
|
218
|
+
),
|
|
212
219
|
)
|
|
213
220
|
|
|
214
221
|
### Optional arguments specific to "cluster create-pathways"
|
|
@@ -221,6 +228,10 @@ def set_cluster_create_pathways_parser(
|
|
|
221
228
|
add_shared_cluster_create_optional_arguments(
|
|
222
229
|
cluster_create_pathways_optional_arguments
|
|
223
230
|
)
|
|
231
|
+
if FeatureFlags.SUB_SLICING_ENABLED:
|
|
232
|
+
add_cluster_create_sub_slicing_arguments(
|
|
233
|
+
cluster_create_pathways_optional_arguments
|
|
234
|
+
)
|
|
224
235
|
|
|
225
236
|
autoprovisioning_arguments = (
|
|
226
237
|
cluster_create_pathways_parser.add_argument_group(
|
|
@@ -287,6 +298,10 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
|
287
298
|
default=None,
|
|
288
299
|
help='The tpu type to use, v5litepod-16, etc.',
|
|
289
300
|
required=True,
|
|
301
|
+
metavar='TPU_TYPE',
|
|
302
|
+
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
303
|
+
[AcceleratorType.TPU]
|
|
304
|
+
),
|
|
290
305
|
)
|
|
291
306
|
# TODO(bzmarke): Add --device-type to support GPU/CPU
|
|
292
307
|
cluster_create_ray_required_arguments.add_argument(
|
|
@@ -350,7 +365,9 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
|
|
|
350
365
|
)
|
|
351
366
|
add_resource_limits(cluster_create_resource_limits)
|
|
352
367
|
|
|
353
|
-
cluster_create_ray_parser.set_defaults(
|
|
368
|
+
cluster_create_ray_parser.set_defaults(
|
|
369
|
+
func=cluster_create_ray_cluster, sub_slicing=False
|
|
370
|
+
)
|
|
354
371
|
|
|
355
372
|
|
|
356
373
|
def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
|
|
@@ -375,7 +392,7 @@ def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
|
|
|
375
392
|
cluster_delete_optional_arguments.add_argument(
|
|
376
393
|
'--cluster-state-gcs-bucket',
|
|
377
394
|
type=str,
|
|
378
|
-
default=
|
|
395
|
+
default=xpk_config.get(CFG_BUCKET_KEY),
|
|
379
396
|
help='The name of the bucket to store cluster state.',
|
|
380
397
|
required=False,
|
|
381
398
|
)
|
|
@@ -409,6 +426,10 @@ def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
|
|
|
409
426
|
type=str,
|
|
410
427
|
default=None,
|
|
411
428
|
help='The tpu type to cache images on, v5litepod-16, etc.',
|
|
429
|
+
metavar='TPU_TYPE',
|
|
430
|
+
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
431
|
+
[AcceleratorType.TPU]
|
|
432
|
+
),
|
|
412
433
|
)
|
|
413
434
|
cluster_cacheimage_group.add_argument(
|
|
414
435
|
'--device-type',
|
|
@@ -418,6 +439,8 @@ def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
|
|
|
418
439
|
'The device type to cache images on (can be tpu or gpu),'
|
|
419
440
|
' v5litepod-16, h100-80gb-8, etc.'
|
|
420
441
|
),
|
|
442
|
+
metavar='DEVICE_TYPE',
|
|
443
|
+
choices=get_system_characteristics_keys_by_accelerator_type(),
|
|
421
444
|
)
|
|
422
445
|
|
|
423
446
|
### Required arguments
|
|
@@ -508,6 +531,10 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
|
|
|
508
531
|
type=str,
|
|
509
532
|
default=None,
|
|
510
533
|
help='The tpu type used on cluster, v5litepod-16, etc.',
|
|
534
|
+
metavar='TPU_TYPE',
|
|
535
|
+
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
536
|
+
[AcceleratorType.TPU]
|
|
537
|
+
),
|
|
511
538
|
)
|
|
512
539
|
cluster_adapt_device_group.add_argument(
|
|
513
540
|
'--device-type',
|
|
@@ -517,6 +544,8 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
|
|
|
517
544
|
'The device type used on cluster (can be tpu or gpu or cpu), eg.'
|
|
518
545
|
' h100-80gb-8, n2-standard-32-4 etc.'
|
|
519
546
|
),
|
|
547
|
+
metavar='DEVICE_TYPE',
|
|
548
|
+
choices=get_system_characteristics_keys_by_accelerator_type(),
|
|
520
549
|
)
|
|
521
550
|
|
|
522
551
|
cluster_adapt_optional_arguments = cluster_adapt_parser.add_argument_group(
|
|
@@ -937,3 +966,13 @@ def add_resource_limits(parser_or_group: ParserOrArgumentGroup):
|
|
|
937
966
|
default=None,
|
|
938
967
|
help='The CPU limit for the Kueue controller manager.',
|
|
939
968
|
)
|
|
969
|
+
|
|
970
|
+
|
|
971
|
+
def add_cluster_create_sub_slicing_arguments(
|
|
972
|
+
parser_or_group: ParserOrArgumentGroup,
|
|
973
|
+
):
|
|
974
|
+
parser_or_group.add_argument(
|
|
975
|
+
'--sub-slicing',
|
|
976
|
+
action='store_true',
|
|
977
|
+
help='Whether to set up cluster to support sub-slicing',
|
|
978
|
+
)
|
xpk/parser/cluster_test.py
CHANGED
|
@@ -15,7 +15,7 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
import argparse
|
|
18
|
-
from xpk.parser.cluster import set_cluster_create_parser
|
|
18
|
+
from xpk.parser.cluster import set_cluster_create_parser, set_cluster_create_pathways_parser, set_cluster_create_ray_parser
|
|
19
19
|
import pytest
|
|
20
20
|
from ..utils.feature_flags import FeatureFlags
|
|
21
21
|
|
|
@@ -49,7 +49,7 @@ def test_cluster_create_sub_slicing_is_false_by_default():
|
|
|
49
49
|
|
|
50
50
|
set_cluster_create_parser(parser)
|
|
51
51
|
args = parser.parse_args(
|
|
52
|
-
["--cluster", "test-cluster", "--tpu-type", "
|
|
52
|
+
["--cluster", "test-cluster", "--tpu-type", "tpu7x-2"]
|
|
53
53
|
)
|
|
54
54
|
|
|
55
55
|
assert args.sub_slicing is False
|
|
@@ -60,7 +60,46 @@ def test_cluster_create_sub_slicing_can_be_set():
|
|
|
60
60
|
|
|
61
61
|
set_cluster_create_parser(parser)
|
|
62
62
|
args = parser.parse_args(
|
|
63
|
-
["--cluster", "test-cluster", "--tpu-type", "
|
|
63
|
+
["--cluster", "test-cluster", "--tpu-type", "tpu7x-2", "--sub-slicing"]
|
|
64
64
|
)
|
|
65
65
|
|
|
66
66
|
assert args.sub_slicing is True
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_cluster_create_pathways_sub_slicing_is_hidden_with_flag_off():
|
|
70
|
+
FeatureFlags.SUB_SLICING_ENABLED = False
|
|
71
|
+
parser = argparse.ArgumentParser()
|
|
72
|
+
|
|
73
|
+
set_cluster_create_pathways_parser(parser)
|
|
74
|
+
help_str = parser.format_help()
|
|
75
|
+
|
|
76
|
+
assert "--sub-slicing" not in help_str
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_cluster_create_pathways_sub_slicing_can_be_set():
|
|
80
|
+
parser = argparse.ArgumentParser()
|
|
81
|
+
|
|
82
|
+
set_cluster_create_pathways_parser(parser)
|
|
83
|
+
args = parser.parse_args(
|
|
84
|
+
["--cluster", "test-cluster", "--tpu-type", "tpu7x-2", "--sub-slicing"]
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
assert args.sub_slicing is True
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def test_cluster_create_ray_sub_slicing_is_hidden_but_set_to_false():
|
|
91
|
+
parser = argparse.ArgumentParser()
|
|
92
|
+
|
|
93
|
+
set_cluster_create_ray_parser(parser)
|
|
94
|
+
args = parser.parse_args([
|
|
95
|
+
"--cluster",
|
|
96
|
+
"test-cluster",
|
|
97
|
+
"--tpu-type",
|
|
98
|
+
"tpu7x-2",
|
|
99
|
+
"--ray-version",
|
|
100
|
+
"19.32.0",
|
|
101
|
+
])
|
|
102
|
+
help_str = parser.format_help()
|
|
103
|
+
|
|
104
|
+
assert args.sub_slicing is False
|
|
105
|
+
assert "--sub-slicing" not in help_str
|
xpk/parser/workload.py
CHANGED
|
@@ -25,6 +25,7 @@ from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR
|
|
|
25
25
|
from .common import add_shared_arguments
|
|
26
26
|
from .validators import directory_path_type, name_type
|
|
27
27
|
from ..utils.feature_flags import FeatureFlags
|
|
28
|
+
from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType, SUB_SLICING_TOPOLOGIES
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
def set_workload_parsers(workload_parser: ArgumentParser):
|
|
@@ -123,6 +124,10 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
|
|
|
123
124
|
type=str,
|
|
124
125
|
default=None,
|
|
125
126
|
help='The tpu type to use, v5litepod-16, etc.',
|
|
127
|
+
metavar='TPU_TYPE',
|
|
128
|
+
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
129
|
+
[AcceleratorType.TPU]
|
|
130
|
+
),
|
|
126
131
|
)
|
|
127
132
|
workload_device_group.add_argument(
|
|
128
133
|
'--device-type',
|
|
@@ -132,6 +137,8 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
|
|
|
132
137
|
'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
|
|
133
138
|
' h100-80gb-8, n2-standard-32-4 etc.'
|
|
134
139
|
),
|
|
140
|
+
metavar='DEVICE_TYPE',
|
|
141
|
+
choices=get_system_characteristics_keys_by_accelerator_type(),
|
|
135
142
|
)
|
|
136
143
|
|
|
137
144
|
workload_create_parser_optional_arguments.add_argument(
|
|
@@ -285,6 +292,10 @@ def set_workload_create_pathways_parser(
|
|
|
285
292
|
type=str,
|
|
286
293
|
default=None,
|
|
287
294
|
help='The tpu type to use, v5litepod-16, etc.',
|
|
295
|
+
metavar='TPU_TYPE',
|
|
296
|
+
choices=get_system_characteristics_keys_by_accelerator_type(
|
|
297
|
+
[AcceleratorType.TPU]
|
|
298
|
+
),
|
|
288
299
|
)
|
|
289
300
|
|
|
290
301
|
### "workload create-pathways" Optional arguments, specific to Pathways
|
|
@@ -665,6 +676,7 @@ def add_shared_workload_create_optional_arguments(args_parsers):
|
|
|
665
676
|
type=str,
|
|
666
677
|
help='Sub-slicing topology to use.',
|
|
667
678
|
required=False,
|
|
679
|
+
choices=SUB_SLICING_TOPOLOGIES,
|
|
668
680
|
)
|
|
669
681
|
|
|
670
682
|
|
xpk/parser/workload_test.py
CHANGED
|
@@ -56,7 +56,7 @@ def test_workload_create_sub_slicing_topology_is_none_by_default():
|
|
|
56
56
|
"--workload",
|
|
57
57
|
"test",
|
|
58
58
|
"--tpu-type",
|
|
59
|
-
"
|
|
59
|
+
"tpu7x-2",
|
|
60
60
|
])
|
|
61
61
|
|
|
62
62
|
assert args.sub_slicing_topology is None
|
|
@@ -74,9 +74,9 @@ def test_workload_create_sub_slicing_topology_can_be_set():
|
|
|
74
74
|
"--workload",
|
|
75
75
|
"test",
|
|
76
76
|
"--tpu-type",
|
|
77
|
-
"
|
|
77
|
+
"tpu7x-8",
|
|
78
78
|
"--sub-slicing-topology",
|
|
79
|
-
"
|
|
79
|
+
"2x4",
|
|
80
80
|
])
|
|
81
81
|
|
|
82
|
-
assert args.sub_slicing_topology is "
|
|
82
|
+
assert args.sub_slicing_topology is "2x4"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import sys
|
|
18
|
+
import os
|
|
19
|
+
import requests
|
|
20
|
+
import json
|
|
21
|
+
|
|
22
|
+
file_path = sys.argv[1]
|
|
23
|
+
if os.path.exists(file_path):
|
|
24
|
+
with open(file_path, mode="r", encoding="utf-8") as file:
|
|
25
|
+
kwargs = json.load(file)
|
|
26
|
+
response = requests.request(**kwargs)
|
|
27
|
+
print(f"Telemetry upload finished with {response.status_code} status code")
|
|
28
|
+
|
|
29
|
+
os.remove(file_path)
|
|
@@ -4,11 +4,6 @@ metadata:
|
|
|
4
4
|
name: {{ sub_slice_topology_name }}
|
|
5
5
|
spec:
|
|
6
6
|
levels:
|
|
7
|
-
|
|
8
|
-
- nodeLabel: "
|
|
9
|
-
|
|
10
|
-
- nodeLabel: "cloud.google.com/gke-tpu-slice-4x8-id"
|
|
11
|
-
- nodeLabel: "cloud.google.com/gke-tpu-slice-4x4-id"
|
|
12
|
-
- nodeLabel: "cloud.google.com/gke-tpu-slice-2x4-id"
|
|
13
|
-
- nodeLabel: "cloud.google.com/gke-tpu-slice-2x2-id"
|
|
14
|
-
- nodeLabel: "kubernetes.io/hostname"
|
|
7
|
+
{% for level in levels %}
|
|
8
|
+
- nodeLabel: "{{level}}"
|
|
9
|
+
{% endfor %}
|