xpk-0.14.4-py3-none-any.whl → xpk-0.16.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +89 -32
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +16 -3
- xpk/commands/cluster_test.py +353 -7
- xpk/commands/config.py +3 -5
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +3 -1
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +143 -142
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +73 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +48 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +55 -104
- xpk/core/cluster_test.py +170 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +88 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +17 -19
- xpk/core/kueue_manager.py +205 -51
- xpk/core/kueue_manager_test.py +158 -4
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +37 -43
- xpk/core/nodepool_test.py +42 -19
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +144 -133
- xpk/core/scheduling_test.py +298 -6
- xpk/core/system_characteristics.py +256 -19
- xpk/core/system_characteristics_test.py +128 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +33 -13
- xpk/parser/cluster.py +40 -67
- xpk/parser/cluster_test.py +83 -3
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -29
- xpk/parser/workload_test.py +3 -49
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +10 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +2 -13
- xpk/utils/versions.py +31 -0
- xpk-0.16.0.dist-info/METADATA +127 -0
- xpk-0.16.0.dist-info/RECORD +168 -0
- xpk-0.14.4.dist-info/METADATA +0 -1645
- xpk-0.14.4.dist-info/RECORD +0 -139
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster_test.py
CHANGED
|
@@ -14,16 +14,20 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import json
|
|
17
18
|
from argparse import Namespace
|
|
18
19
|
from dataclasses import dataclass
|
|
19
20
|
from typing import Any
|
|
20
21
|
from unittest.mock import MagicMock, patch
|
|
21
22
|
import pytest
|
|
22
23
|
|
|
23
|
-
from xpk.
|
|
24
|
+
from xpk.core.telemetry import MetricsCollector
|
|
25
|
+
from xpk.commands.cluster import _install_kueue, _validate_cluster_create_args, run_gke_cluster_create_command, cluster_create, _log_cluster_create_telemetry
|
|
26
|
+
from xpk.core.capacity import CapacityType
|
|
24
27
|
from xpk.core.system_characteristics import SystemCharacteristics, UserFacingNameToSystemCharacteristics
|
|
25
28
|
from xpk.core.testing.commands_tester import CommandsTester
|
|
26
29
|
from xpk.utils.feature_flags import FeatureFlags
|
|
30
|
+
from xpk.utils.versions import ReleaseChannel
|
|
27
31
|
|
|
28
32
|
|
|
29
33
|
@dataclass
|
|
@@ -31,9 +35,34 @@ class _Mocks:
|
|
|
31
35
|
common_print_mock: MagicMock
|
|
32
36
|
commands_print_mock: MagicMock
|
|
33
37
|
commands_get_reservation_deployment_type: MagicMock
|
|
38
|
+
commands_get_pathways_machine_types: MagicMock
|
|
34
39
|
commands_tester: CommandsTester
|
|
35
40
|
|
|
36
41
|
|
|
42
|
+
@dataclass
|
|
43
|
+
class _ClusterCreateMocks:
|
|
44
|
+
"""Holds all the mocked dependencies for the cluster_create function."""
|
|
45
|
+
|
|
46
|
+
get_all_clusters_programmatic: MagicMock
|
|
47
|
+
get_gke_server_config: MagicMock
|
|
48
|
+
get_gke_control_plane_version: MagicMock
|
|
49
|
+
get_system_characteristics: MagicMock
|
|
50
|
+
authorize_private_cluster_access_if_necessary: MagicMock
|
|
51
|
+
update_coredns_if_necessary: MagicMock
|
|
52
|
+
get_cluster_credentials: MagicMock
|
|
53
|
+
setup_k8s_env: MagicMock
|
|
54
|
+
get_gke_node_pool_version: MagicMock
|
|
55
|
+
run_gke_node_pool_create_command: MagicMock
|
|
56
|
+
create_cluster_configmaps: MagicMock
|
|
57
|
+
set_jobset_on_cluster: MagicMock
|
|
58
|
+
get_cluster_location: MagicMock
|
|
59
|
+
install_kjob: MagicMock
|
|
60
|
+
xpk_exit: MagicMock
|
|
61
|
+
update_jobset_resources_if_necessary: MagicMock
|
|
62
|
+
_install_kueue: MagicMock
|
|
63
|
+
set_pathways_job_on_cluster: MagicMock
|
|
64
|
+
|
|
65
|
+
|
|
37
66
|
@pytest.fixture
|
|
38
67
|
def mocks(mocker) -> _Mocks:
|
|
39
68
|
common_print_mock = mocker.patch(
|
|
@@ -47,15 +76,23 @@ def mocks(mocker) -> _Mocks:
|
|
|
47
76
|
'xpk.commands.cluster.get_reservation_deployment_type',
|
|
48
77
|
return_value='DENSE',
|
|
49
78
|
)
|
|
79
|
+
commands_get_pathways_machine_types = mocker.patch(
|
|
80
|
+
'xpk.commands.cluster.get_pathways_machine_types',
|
|
81
|
+
return_value=(0, []),
|
|
82
|
+
)
|
|
50
83
|
return _Mocks(
|
|
51
84
|
common_print_mock=common_print_mock,
|
|
52
85
|
commands_get_reservation_deployment_type=commands_get_reservation_deployment_type,
|
|
53
86
|
commands_print_mock=commands_print_mock,
|
|
87
|
+
commands_get_pathways_machine_types=commands_get_pathways_machine_types,
|
|
54
88
|
commands_tester=CommandsTester(
|
|
55
89
|
mocker,
|
|
56
90
|
run_command_with_updates_path=(
|
|
57
91
|
'xpk.commands.cluster.run_command_with_updates'
|
|
58
92
|
),
|
|
93
|
+
run_command_for_value_path=(
|
|
94
|
+
'xpk.commands.cluster.run_command_for_value'
|
|
95
|
+
),
|
|
59
96
|
),
|
|
60
97
|
)
|
|
61
98
|
|
|
@@ -65,6 +102,10 @@ def construct_args(**kwargs: Any) -> Namespace:
|
|
|
65
102
|
project='project',
|
|
66
103
|
zone='us-central1-a',
|
|
67
104
|
reservation='',
|
|
105
|
+
on_demand=False,
|
|
106
|
+
tpu_type=None,
|
|
107
|
+
device_type=None,
|
|
108
|
+
spot=False,
|
|
68
109
|
default_pool_cpu_machine_type='test-machine-type',
|
|
69
110
|
cluster='test-cluster',
|
|
70
111
|
default_pool_cpu_num_nodes='100',
|
|
@@ -72,6 +113,7 @@ def construct_args(**kwargs: Any) -> Namespace:
|
|
|
72
113
|
gke_version='',
|
|
73
114
|
private=False,
|
|
74
115
|
authorized_networks=None,
|
|
116
|
+
pathways_gce_machine_type='n2-standard-64',
|
|
75
117
|
enable_pathways=False,
|
|
76
118
|
enable_ray_cluster=False,
|
|
77
119
|
enable_workload_identity=False,
|
|
@@ -87,11 +129,97 @@ def construct_args(**kwargs: Any) -> Namespace:
|
|
|
87
129
|
memory_limit='100Gi',
|
|
88
130
|
cpu_limit=100,
|
|
89
131
|
cluster_cpu_machine_type='',
|
|
132
|
+
create_vertex_tensorboard=False,
|
|
133
|
+
enable_autoprovisioning=False,
|
|
134
|
+
sub_slicing_topology='2x2x2',
|
|
135
|
+
use_vertex_tensorboard=False,
|
|
136
|
+
env_file='',
|
|
137
|
+
env=None,
|
|
138
|
+
use_pathways=False,
|
|
139
|
+
debug_dump_gcs=False,
|
|
140
|
+
storage='',
|
|
141
|
+
restart_on_exit_codes=None,
|
|
142
|
+
ttl_seconds_after_finished=0,
|
|
143
|
+
max_restarts=1,
|
|
144
|
+
priority=0,
|
|
145
|
+
termination_grace_period_seconds=0,
|
|
146
|
+
docker_image_pull_secret='',
|
|
147
|
+
managed_mldiagnostics=False,
|
|
148
|
+
output_manifest_file='',
|
|
90
149
|
)
|
|
91
150
|
args_dict.update(kwargs)
|
|
92
151
|
return Namespace(**args_dict)
|
|
93
152
|
|
|
94
153
|
|
|
154
|
+
@pytest.fixture
|
|
155
|
+
def cluster_create_mocks(mocker) -> _ClusterCreateMocks:
|
|
156
|
+
"""Mocks all dependencies for the cluster_create function."""
|
|
157
|
+
# This fixture patches all the functions called by cluster_create, allowing
|
|
158
|
+
# tests to focus on specific logic paths without executing external commands
|
|
159
|
+
# or complex sub-functions. Each mock can be configured within the test
|
|
160
|
+
# itself if a specific return value or behavior is needed.
|
|
161
|
+
return _ClusterCreateMocks(
|
|
162
|
+
get_all_clusters_programmatic=mocker.patch(
|
|
163
|
+
'xpk.commands.cluster.get_all_clusters_programmatic',
|
|
164
|
+
return_value=([], 0),
|
|
165
|
+
),
|
|
166
|
+
get_gke_server_config=mocker.patch(
|
|
167
|
+
'xpk.commands.cluster.get_gke_server_config',
|
|
168
|
+
return_value=(0, MagicMock()),
|
|
169
|
+
),
|
|
170
|
+
get_gke_control_plane_version=mocker.patch(
|
|
171
|
+
'xpk.commands.cluster.get_gke_control_plane_version'
|
|
172
|
+
),
|
|
173
|
+
get_system_characteristics=mocker.patch(
|
|
174
|
+
'xpk.commands.cluster.get_system_characteristics',
|
|
175
|
+
return_value=(TPU_TEST_SYSTEM, 0),
|
|
176
|
+
),
|
|
177
|
+
authorize_private_cluster_access_if_necessary=mocker.patch(
|
|
178
|
+
'xpk.commands.cluster.authorize_private_cluster_access_if_necessary',
|
|
179
|
+
return_value=0,
|
|
180
|
+
),
|
|
181
|
+
update_coredns_if_necessary=mocker.patch(
|
|
182
|
+
'xpk.commands.cluster.update_coredns_if_necessary', return_value=0
|
|
183
|
+
),
|
|
184
|
+
get_cluster_credentials=mocker.patch(
|
|
185
|
+
'xpk.commands.cluster.get_cluster_credentials', return_value=0
|
|
186
|
+
),
|
|
187
|
+
setup_k8s_env=mocker.patch('xpk.commands.cluster.setup_k8s_env'),
|
|
188
|
+
get_gke_node_pool_version=mocker.patch(
|
|
189
|
+
'xpk.commands.cluster.get_gke_node_pool_version',
|
|
190
|
+
return_value=(0, '1.2.3'),
|
|
191
|
+
),
|
|
192
|
+
run_gke_node_pool_create_command=mocker.patch(
|
|
193
|
+
'xpk.commands.cluster.run_gke_node_pool_create_command',
|
|
194
|
+
return_value=0,
|
|
195
|
+
),
|
|
196
|
+
create_cluster_configmaps=mocker.patch(
|
|
197
|
+
'xpk.commands.cluster.create_cluster_configmaps', return_value=0
|
|
198
|
+
),
|
|
199
|
+
set_jobset_on_cluster=mocker.patch(
|
|
200
|
+
'xpk.commands.cluster.set_jobset_on_cluster', return_value=0
|
|
201
|
+
),
|
|
202
|
+
get_cluster_location=mocker.patch(
|
|
203
|
+
'xpk.commands.cluster.get_cluster_location',
|
|
204
|
+
return_value='us-central1',
|
|
205
|
+
),
|
|
206
|
+
install_kjob=mocker.patch(
|
|
207
|
+
'xpk.commands.cluster.install_kjob', return_value=0
|
|
208
|
+
),
|
|
209
|
+
xpk_exit=mocker.patch('xpk.commands.cluster.xpk_exit'),
|
|
210
|
+
update_jobset_resources_if_necessary=mocker.patch(
|
|
211
|
+
'xpk.commands.cluster.update_jobset_resources_if_necessary',
|
|
212
|
+
return_value=0,
|
|
213
|
+
),
|
|
214
|
+
_install_kueue=mocker.patch(
|
|
215
|
+
'xpk.commands.cluster._install_kueue', return_value=0
|
|
216
|
+
),
|
|
217
|
+
set_pathways_job_on_cluster=mocker.patch(
|
|
218
|
+
'xpk.commands.cluster.set_pathways_job_on_cluster', return_value=0
|
|
219
|
+
),
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
|
|
95
223
|
GPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
|
|
96
224
|
'l4-1'
|
|
97
225
|
]
|
|
@@ -106,7 +234,7 @@ TPU_TEST_SYSTEM: SystemCharacteristics = UserFacingNameToSystemCharacteristics[
|
|
|
106
234
|
def test_validate_cluster_create_args_for_correct_args_pass(
|
|
107
235
|
mocks: _Mocks,
|
|
108
236
|
):
|
|
109
|
-
args =
|
|
237
|
+
args = construct_args()
|
|
110
238
|
|
|
111
239
|
_validate_cluster_create_args(args, GPU_TEST_SYSTEM)
|
|
112
240
|
|
|
@@ -185,6 +313,64 @@ def test_validate_cluster_create_args_for_invalid_reservation(
|
|
|
185
313
|
)
|
|
186
314
|
|
|
187
315
|
|
|
316
|
+
def test_validate_cluster_create_args_for_enable_pathways_set_to_false(
|
|
317
|
+
mocks: _Mocks,
|
|
318
|
+
):
|
|
319
|
+
args = construct_args(enable_pathways=False)
|
|
320
|
+
mocks.commands_get_pathways_machine_types.return_value = (1, [])
|
|
321
|
+
|
|
322
|
+
_validate_cluster_create_args(args, TPU_TEST_SYSTEM)
|
|
323
|
+
|
|
324
|
+
assert mocks.commands_print_mock.call_count == 0
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def test_validate_cluster_create_args_for_errored_pathways_machine_types_retrieval(
|
|
328
|
+
mocks: _Mocks,
|
|
329
|
+
):
|
|
330
|
+
args = construct_args(enable_pathways=True)
|
|
331
|
+
mocks.commands_get_pathways_machine_types.return_value = (1, [])
|
|
332
|
+
|
|
333
|
+
with pytest.raises(SystemExit):
|
|
334
|
+
_validate_cluster_create_args(args, TPU_TEST_SYSTEM)
|
|
335
|
+
|
|
336
|
+
assert mocks.commands_print_mock.call_count == 1
|
|
337
|
+
assert 'Unable to retrieve' in mocks.commands_print_mock.call_args[0][0]
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def test_validate_cluster_create_args_for_invalid_pathways_machine_type(
|
|
341
|
+
mocks: _Mocks,
|
|
342
|
+
):
|
|
343
|
+
args = construct_args(
|
|
344
|
+
enable_pathways=True, pathways_gce_machine_type='n2-standard-32'
|
|
345
|
+
)
|
|
346
|
+
mocks.commands_get_pathways_machine_types.return_value = (
|
|
347
|
+
0,
|
|
348
|
+
['n2-standard-64'],
|
|
349
|
+
)
|
|
350
|
+
|
|
351
|
+
with pytest.raises(SystemExit):
|
|
352
|
+
_validate_cluster_create_args(args, TPU_TEST_SYSTEM)
|
|
353
|
+
|
|
354
|
+
assert mocks.commands_print_mock.call_count == 2
|
|
355
|
+
assert 'Available machine types' in mocks.commands_print_mock.call_args[0][0]
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def test_validate_cluster_create_args_for_valid_pathways_machine_type(
|
|
359
|
+
mocks: _Mocks,
|
|
360
|
+
):
|
|
361
|
+
args = construct_args(
|
|
362
|
+
enable_pathways=True, pathways_gce_machine_type='n2-standard-32'
|
|
363
|
+
)
|
|
364
|
+
mocks.commands_get_pathways_machine_types.return_value = (
|
|
365
|
+
0,
|
|
366
|
+
['n2-standard-32'],
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
_validate_cluster_create_args(args, TPU_TEST_SYSTEM)
|
|
370
|
+
|
|
371
|
+
assert mocks.commands_print_mock.call_count == 0
|
|
372
|
+
|
|
373
|
+
|
|
188
374
|
@patch('xpk.commands.cluster.KueueManager.install_or_upgrade')
|
|
189
375
|
def test_install_kueue_returns_kueue_installation_code(
|
|
190
376
|
mock_kueue_manager_install: MagicMock,
|
|
@@ -209,6 +395,7 @@ def test_run_gke_cluster_create_command_specifies_custom_cluster_arguments_last(
|
|
|
209
395
|
),
|
|
210
396
|
gke_control_plane_version='1.2.3',
|
|
211
397
|
system=TPU_TEST_SYSTEM,
|
|
398
|
+
release_channel=ReleaseChannel.STABLE,
|
|
212
399
|
)
|
|
213
400
|
|
|
214
401
|
assert result == 0
|
|
@@ -226,12 +413,16 @@ def test_run_gke_cluster_create_command_without_gke_version_does_not_have_no_aut
|
|
|
226
413
|
args=construct_args(gke_version=''),
|
|
227
414
|
gke_control_plane_version='1.2.3',
|
|
228
415
|
system=TPU_TEST_SYSTEM,
|
|
416
|
+
release_channel=ReleaseChannel.RAPID,
|
|
229
417
|
)
|
|
230
418
|
|
|
231
419
|
assert result == 0
|
|
232
420
|
mocks.commands_tester.assert_command_not_run(
|
|
233
421
|
'clusters create', ' --no-enable-autoupgrade'
|
|
234
422
|
)
|
|
423
|
+
mocks.commands_tester.assert_command_run(
|
|
424
|
+
'clusters create', ' --release-channel=rapid'
|
|
425
|
+
)
|
|
235
426
|
|
|
236
427
|
|
|
237
428
|
def test_run_gke_cluster_create_command_with_gke_version_has_no_autoupgrade_flag(
|
|
@@ -241,24 +432,179 @@ def test_run_gke_cluster_create_command_with_gke_version_has_no_autoupgrade_flag
|
|
|
241
432
|
args=construct_args(gke_version='1.2.3'),
|
|
242
433
|
gke_control_plane_version='1.2.3',
|
|
243
434
|
system=TPU_TEST_SYSTEM,
|
|
435
|
+
release_channel=ReleaseChannel.REGULAR,
|
|
244
436
|
)
|
|
245
437
|
|
|
246
438
|
assert result == 0
|
|
247
439
|
mocks.commands_tester.assert_command_run(
|
|
248
|
-
'clusters create', ' --no-enable-autoupgrade'
|
|
440
|
+
'clusters create', '--release-channel=regular', ' --no-enable-autoupgrade'
|
|
249
441
|
)
|
|
250
442
|
|
|
251
443
|
|
|
252
|
-
def
|
|
444
|
+
def test_run_gke_cluster_create_command_with_lustre_runs_correct_command(
|
|
253
445
|
mocks: _Mocks,
|
|
254
446
|
):
|
|
255
447
|
result = run_gke_cluster_create_command(
|
|
256
|
-
args=construct_args(
|
|
448
|
+
args=construct_args(
|
|
449
|
+
enable_lustre_csi_driver=True, enable_legacy_lustre_port=False
|
|
450
|
+
),
|
|
257
451
|
gke_control_plane_version='1.2.3',
|
|
258
|
-
system=
|
|
452
|
+
system=TPU_TEST_SYSTEM,
|
|
453
|
+
release_channel=ReleaseChannel.REGULAR,
|
|
454
|
+
)
|
|
455
|
+
|
|
456
|
+
assert result == 0
|
|
457
|
+
commands = mocks.commands_tester.get_matching_commands('clusters create')
|
|
458
|
+
assert len(commands) == 1
|
|
459
|
+
command = commands[0]
|
|
460
|
+
assert (
|
|
461
|
+
'--addons=LustreCsiDriver' in command
|
|
462
|
+
and '--enable-legacy-lustre-port' not in command
|
|
463
|
+
)
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def test_run_gke_cluster_create_command_with_lustre_legacy_port_adds_correct_flag(
|
|
467
|
+
mocks: _Mocks,
|
|
468
|
+
):
|
|
469
|
+
result = run_gke_cluster_create_command(
|
|
470
|
+
args=construct_args(
|
|
471
|
+
enable_lustre_csi_driver=True, enable_legacy_lustre_port=True
|
|
472
|
+
),
|
|
473
|
+
gke_control_plane_version='1.2.3',
|
|
474
|
+
system=TPU_TEST_SYSTEM,
|
|
475
|
+
release_channel=ReleaseChannel.REGULAR,
|
|
259
476
|
)
|
|
260
477
|
|
|
261
478
|
assert result == 0
|
|
262
479
|
mocks.commands_tester.assert_command_run(
|
|
263
|
-
'clusters create',
|
|
480
|
+
'clusters create',
|
|
481
|
+
'--enable-legacy-lustre-port',
|
|
482
|
+
'--addons=LustreCsiDriver',
|
|
483
|
+
)
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def test_log_cluster_create_telemetry_does_not_log_when_feature_flag_is_disabled():
|
|
487
|
+
FeatureFlags.TELEMETRY_ENABLED = False
|
|
488
|
+
_log_cluster_create_telemetry(construct_args())
|
|
489
|
+
events = json.loads(MetricsCollector.flush())['log_event']
|
|
490
|
+
assert len(events) == 0
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def test_log_cluster_create_telemetry_logs_correct_event_when_tpu_type_is_provided(
|
|
494
|
+
mocker: MagicMock,
|
|
495
|
+
):
|
|
496
|
+
FeatureFlags.TELEMETRY_ENABLED = True
|
|
497
|
+
mocker.patch(
|
|
498
|
+
'xpk.commands.cluster.get_capacity_type',
|
|
499
|
+
return_value=(CapacityType.SPOT, 0),
|
|
264
500
|
)
|
|
501
|
+
_log_cluster_create_telemetry(construct_args(device_type='test-device-type'))
|
|
502
|
+
event = json.loads(MetricsCollector.flush())['log_event'][0]
|
|
503
|
+
payload = json.loads(event['source_extension_json'])
|
|
504
|
+
event_metadata = payload['event_metadata']
|
|
505
|
+
assert payload['event_name'] == 'cluster_create'
|
|
506
|
+
assert (
|
|
507
|
+
_get_event_metadata_value_by_key(
|
|
508
|
+
event_metadata,
|
|
509
|
+
'XPK_ZONE',
|
|
510
|
+
)
|
|
511
|
+
== 'us-central1-a'
|
|
512
|
+
)
|
|
513
|
+
assert (
|
|
514
|
+
_get_event_metadata_value_by_key(
|
|
515
|
+
event_metadata,
|
|
516
|
+
'XPK_SYSTEM_CHARACTERISTICS',
|
|
517
|
+
)
|
|
518
|
+
== 'test-device-type'
|
|
519
|
+
)
|
|
520
|
+
assert (
|
|
521
|
+
_get_event_metadata_value_by_key(
|
|
522
|
+
event_metadata,
|
|
523
|
+
'XPK_PROVISIONING_MODE',
|
|
524
|
+
)
|
|
525
|
+
== 'spot'
|
|
526
|
+
)
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def test_log_cluster_create_telemetry_logs_correct_event_when_device_type_is_provided(
|
|
530
|
+
mocker: MagicMock,
|
|
531
|
+
):
|
|
532
|
+
FeatureFlags.TELEMETRY_ENABLED = True
|
|
533
|
+
mocker.patch(
|
|
534
|
+
'xpk.commands.cluster.get_capacity_type',
|
|
535
|
+
return_value=(CapacityType.SPOT, 0),
|
|
536
|
+
)
|
|
537
|
+
_log_cluster_create_telemetry(construct_args(tpu_type='test-tpu-type'))
|
|
538
|
+
event = json.loads(MetricsCollector.flush())['log_event'][0]
|
|
539
|
+
payload = json.loads(event['source_extension_json'])
|
|
540
|
+
event_metadata = payload['event_metadata']
|
|
541
|
+
assert payload['event_name'] == 'cluster_create'
|
|
542
|
+
assert (
|
|
543
|
+
_get_event_metadata_value_by_key(
|
|
544
|
+
event_metadata,
|
|
545
|
+
'XPK_ZONE',
|
|
546
|
+
)
|
|
547
|
+
== 'us-central1-a'
|
|
548
|
+
)
|
|
549
|
+
assert (
|
|
550
|
+
_get_event_metadata_value_by_key(
|
|
551
|
+
event_metadata,
|
|
552
|
+
'XPK_SYSTEM_CHARACTERISTICS',
|
|
553
|
+
)
|
|
554
|
+
== 'test-tpu-type'
|
|
555
|
+
)
|
|
556
|
+
assert (
|
|
557
|
+
_get_event_metadata_value_by_key(
|
|
558
|
+
event_metadata,
|
|
559
|
+
'XPK_PROVISIONING_MODE',
|
|
560
|
+
)
|
|
561
|
+
== 'spot'
|
|
562
|
+
)
|
|
563
|
+
|
|
564
|
+
|
|
565
|
+
def _get_event_metadata_value_by_key(
|
|
566
|
+
event_metadata: list[dict[str, str]], key: str
|
|
567
|
+
) -> str | None:
|
|
568
|
+
return next(
|
|
569
|
+
(meta['value'] for meta in event_metadata if meta['key'] == key),
|
|
570
|
+
None,
|
|
571
|
+
)
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
@pytest.mark.parametrize(
|
|
575
|
+
'gke_version_arg, expected_channel, expected_version',
|
|
576
|
+
[
|
|
577
|
+
(None, ReleaseChannel.RAPID, '1.2.4'), # No version, should use RAPID
|
|
578
|
+
(
|
|
579
|
+
'1.2.3',
|
|
580
|
+
ReleaseChannel.REGULAR,
|
|
581
|
+
'1.2.3',
|
|
582
|
+
), # Version provided, should use REGULAR
|
|
583
|
+
],
|
|
584
|
+
)
|
|
585
|
+
def test_cluster_create_calls_run_command_with_correct_channel_and_version(
|
|
586
|
+
gke_version_arg,
|
|
587
|
+
expected_channel,
|
|
588
|
+
expected_version,
|
|
589
|
+
mocks: _Mocks,
|
|
590
|
+
cluster_create_mocks: _ClusterCreateMocks,
|
|
591
|
+
):
|
|
592
|
+
"""
|
|
593
|
+
Verifies that cluster_create calls run_gke_cluster_create_command with the correct
|
|
594
|
+
release channel and GKE version based on whether a version is provided.
|
|
595
|
+
"""
|
|
596
|
+
cluster_create_mocks.get_gke_control_plane_version.return_value = (
|
|
597
|
+
0,
|
|
598
|
+
expected_version,
|
|
599
|
+
)
|
|
600
|
+
|
|
601
|
+
args = construct_args(gke_version=gke_version_arg)
|
|
602
|
+
cluster_create(args)
|
|
603
|
+
|
|
604
|
+
expected_command_parts = [
|
|
605
|
+
'clusters create',
|
|
606
|
+
f'--cluster-version={expected_version}',
|
|
607
|
+
f'--release-channel={expected_channel.value.lower()}',
|
|
608
|
+
]
|
|
609
|
+
|
|
610
|
+
mocks.commands_tester.assert_command_run(*expected_command_parts)
|
xpk/commands/config.py
CHANGED
|
@@ -14,16 +14,14 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from ..core.config import XpkConfig
|
|
17
|
+
from ..core.config import get_config as get_xpk_config
|
|
18
18
|
from ..utils.console import xpk_print
|
|
19
19
|
|
|
20
|
-
xpk_cfg = XpkConfig()
|
|
21
|
-
|
|
22
20
|
|
|
23
21
|
def set_config(args):
|
|
24
|
-
|
|
22
|
+
get_xpk_config().set(args.set_config_args[0], args.set_config_args[1])
|
|
25
23
|
|
|
26
24
|
|
|
27
25
|
def get_config(args):
|
|
28
|
-
value =
|
|
26
|
+
value = get_xpk_config().get(args.get_config_key[0])
|
|
29
27
|
xpk_print(value)
|
xpk/commands/inspector.py
CHANGED
|
@@ -18,7 +18,7 @@ from ..core.cluster import get_cluster_credentials
|
|
|
18
18
|
from ..core.commands import run_command_for_value
|
|
19
19
|
from ..core.gcloud_context import add_zone_and_project, get_cluster_location
|
|
20
20
|
from ..core.kueue_manager import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
|
|
21
|
-
from ..core.resources import
|
|
21
|
+
from ..core.resources import ConfigMapType, get_config_map_name
|
|
22
22
|
from ..utils.console import xpk_exit, xpk_print
|
|
23
23
|
from ..utils.file import append_tmp_file, write_tmp_file
|
|
24
24
|
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
@@ -162,14 +162,16 @@ def inspector(args) -> None:
|
|
|
162
162
|
(
|
|
163
163
|
(
|
|
164
164
|
'kubectl get configmap'
|
|
165
|
-
f' {args.cluster}
|
|
165
|
+
f' {get_config_map_name(args.cluster, ConfigMapType.METADATA)} -o'
|
|
166
|
+
' yaml'
|
|
166
167
|
),
|
|
167
168
|
'GKE: Cluster Metadata ConfigMap Details',
|
|
168
169
|
),
|
|
169
170
|
(
|
|
170
171
|
(
|
|
171
172
|
'kubectl get configmap'
|
|
172
|
-
f' {args.cluster}
|
|
173
|
+
f' {get_config_map_name(args.cluster, ConfigMapType.RESOURCES)} -o'
|
|
174
|
+
' yaml'
|
|
173
175
|
),
|
|
174
176
|
'GKE: Cluster Resources ConfigMap Details',
|
|
175
177
|
),
|
xpk/commands/kind.py
CHANGED
|
@@ -30,6 +30,7 @@ from ..core.storage import install_storage_crd
|
|
|
30
30
|
from ..core.system_characteristics import (
|
|
31
31
|
SystemCharacteristics,
|
|
32
32
|
AcceleratorType,
|
|
33
|
+
DockerPlatform,
|
|
33
34
|
)
|
|
34
35
|
from ..utils.console import (xpk_exit, xpk_print)
|
|
35
36
|
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
@@ -97,9 +98,10 @@ def cluster_create(args) -> None:
|
|
|
97
98
|
AcceleratorType.CPU,
|
|
98
99
|
'kind',
|
|
99
100
|
supports_sub_slicing=False,
|
|
101
|
+
docker_platform=DockerPlatform.ARM,
|
|
100
102
|
)
|
|
101
103
|
|
|
102
|
-
kueue_manager = KueueManager()
|
|
104
|
+
kueue_manager = KueueManager(project='', zone='')
|
|
103
105
|
kueue_manager.install_or_upgrade(
|
|
104
106
|
KueueConfig(
|
|
105
107
|
system,
|