xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +89 -32
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +16 -3
- xpk/commands/cluster_test.py +353 -7
- xpk/commands/config.py +3 -5
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +3 -1
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +143 -142
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +73 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +48 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +55 -104
- xpk/core/cluster_test.py +170 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +88 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +17 -19
- xpk/core/kueue_manager.py +205 -51
- xpk/core/kueue_manager_test.py +158 -4
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +37 -43
- xpk/core/nodepool_test.py +42 -19
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +144 -133
- xpk/core/scheduling_test.py +298 -6
- xpk/core/system_characteristics.py +256 -19
- xpk/core/system_characteristics_test.py +128 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +33 -13
- xpk/parser/cluster.py +40 -67
- xpk/parser/cluster_test.py +83 -3
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -29
- xpk/parser/workload_test.py +3 -49
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +10 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +2 -13
- xpk/utils/versions.py +31 -0
- xpk-0.16.0.dist-info/METADATA +127 -0
- xpk-0.16.0.dist-info/RECORD +168 -0
- xpk-0.14.4.dist-info/METADATA +0 -1645
- xpk-0.14.4.dist-info/RECORD +0 -139
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/core/capacity.py
CHANGED
|
@@ -29,6 +29,8 @@ H100_DEVICE_TYPE = 'h100-80gb-8'
|
|
|
29
29
|
H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8'
|
|
30
30
|
H200_DEVICE_TYPE = 'h200-141gb-8'
|
|
31
31
|
B200_DEVICE_TYPE = 'b200-8'
|
|
32
|
+
GB200_DEVICE_TYPE = 'gb200-4'
|
|
33
|
+
GB200_DEVICE_TYPE_NOLSSD = 'gb200-4-no-ssd'
|
|
32
34
|
RESERVATION_CONFIG_KEY = 'reservation_id'
|
|
33
35
|
|
|
34
36
|
|
|
@@ -115,9 +117,12 @@ def get_reservation_maintenance_interval(
|
|
|
115
117
|
Returns:
|
|
116
118
|
0 if successful and 1 otherwise.
|
|
117
119
|
"""
|
|
120
|
+
reservation_project, reservation_name = get_reservation_project_and_name(
|
|
121
|
+
reservation, project
|
|
122
|
+
)
|
|
118
123
|
command = (
|
|
119
|
-
f'gcloud beta compute reservations describe {
|
|
120
|
-
f' --project={
|
|
124
|
+
f'gcloud beta compute reservations describe {reservation_name}'
|
|
125
|
+
f' --project={reservation_project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
|
|
121
126
|
)
|
|
122
127
|
return_code, output = run_command_for_value(
|
|
123
128
|
command, 'Get reservation maintenance interval'
|
|
@@ -139,9 +144,12 @@ def get_reservation_placement_policy(
|
|
|
139
144
|
Returns:
|
|
140
145
|
0 if successful and 1 otherwise.
|
|
141
146
|
"""
|
|
147
|
+
reservation_project, reservation_name = get_reservation_project_and_name(
|
|
148
|
+
reservation, project
|
|
149
|
+
)
|
|
142
150
|
command = (
|
|
143
|
-
f'gcloud beta compute reservations describe {
|
|
144
|
-
f' --project={
|
|
151
|
+
f'gcloud beta compute reservations describe {reservation_name}'
|
|
152
|
+
f' --project={reservation_project} --zone={zone} --format="value(resourcePolicies.policy)"'
|
|
145
153
|
)
|
|
146
154
|
return_code, output = run_command_for_value(
|
|
147
155
|
command, 'Get reservation placement policy'
|
|
@@ -156,9 +164,12 @@ def get_reservation_deployment_type(
|
|
|
156
164
|
reservation: str, zone: str, project: str
|
|
157
165
|
) -> str:
|
|
158
166
|
"""Get reservation deployment type."""
|
|
167
|
+
reservation_project, reservation_name = get_reservation_project_and_name(
|
|
168
|
+
reservation, project
|
|
169
|
+
)
|
|
159
170
|
command = (
|
|
160
|
-
f'gcloud beta compute reservations describe {
|
|
161
|
-
f' --project={
|
|
171
|
+
f'gcloud beta compute reservations describe {reservation_name}'
|
|
172
|
+
f' --project={reservation_project} --zone={zone} --format="value(deploymentType)"'
|
|
162
173
|
)
|
|
163
174
|
return_code, output = run_command_for_value(
|
|
164
175
|
command, 'Get reservation deployment type', dry_run_return_val='DENSE'
|
|
@@ -178,9 +189,12 @@ def verify_reservation_exists(args) -> int:
|
|
|
178
189
|
Returns:
|
|
179
190
|
0 if successful and 1 otherwise.
|
|
180
191
|
"""
|
|
192
|
+
reservation_project, reservation_name = get_reservation_project_and_name(
|
|
193
|
+
args.reservation, args.project
|
|
194
|
+
)
|
|
181
195
|
command = (
|
|
182
|
-
f'gcloud beta compute reservations describe {
|
|
183
|
-
f' --project={
|
|
196
|
+
f'gcloud beta compute reservations describe {reservation_name}'
|
|
197
|
+
f' --project={reservation_project} --zone={args.zone}'
|
|
184
198
|
)
|
|
185
199
|
return_code = run_command_with_updates(command, 'Describe reservation')
|
|
186
200
|
if return_code != 0:
|
|
@@ -264,3 +278,29 @@ def get_capacity_node_selectors_from_capacity_type(
|
|
|
264
278
|
)
|
|
265
279
|
return_code = 1
|
|
266
280
|
return node_selector, return_code
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def get_reservation_project_and_name(
|
|
284
|
+
reservation_name_or_path: str, cluster_project: str
|
|
285
|
+
) -> tuple[str, str]:
|
|
286
|
+
"""Get the reservation project and name.
|
|
287
|
+
|
|
288
|
+
Args:
|
|
289
|
+
reservation_name_or_path: either reservation name or reservation path in format
|
|
290
|
+
projects/RESERVATION_PROJECT_ID/reservations/RESERVATION_NAME
|
|
291
|
+
cluster_project: the cluster project
|
|
292
|
+
|
|
293
|
+
Returns:
|
|
294
|
+
Tuple with reservation project and reservation name.
|
|
295
|
+
"""
|
|
296
|
+
if '/' not in reservation_name_or_path:
|
|
297
|
+
return cluster_project, reservation_name_or_path
|
|
298
|
+
reservation_parts = reservation_name_or_path.split('/')
|
|
299
|
+
if (
|
|
300
|
+
len(reservation_parts) != 4
|
|
301
|
+
or reservation_parts[0] != 'projects'
|
|
302
|
+
or reservation_parts[2] != 'reservations'
|
|
303
|
+
):
|
|
304
|
+
xpk_print('Unable to parse reservation: ', reservation_name_or_path)
|
|
305
|
+
xpk_exit(1)
|
|
306
|
+
return reservation_parts[1], reservation_parts[3]
|
xpk/core/capacity_test.py
CHANGED
|
@@ -16,7 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import pytest
|
|
18
18
|
from unittest.mock import MagicMock, patch
|
|
19
|
-
from .capacity import get_reservation_deployment_type
|
|
19
|
+
from .capacity import get_reservation_deployment_type, get_reservation_project_and_name
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
@patch('xpk.core.capacity.xpk_print')
|
|
@@ -48,3 +48,34 @@ def test_get_reservation_deployment_type_returns_deployment_type_when_command_su
|
|
|
48
48
|
reservation='reservation', zone='zone', project='project'
|
|
49
49
|
)
|
|
50
50
|
assert result == 'DENSE'
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_get_reservation_project_and_name_parses_local_reservation():
|
|
54
|
+
project, name = get_reservation_project_and_name(
|
|
55
|
+
'test-reservation', 'cluster-project'
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
assert project == 'cluster-project'
|
|
59
|
+
assert name == 'test-reservation'
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_get_reservation_project_and_name_parses_shared_reservation():
|
|
63
|
+
project, name = get_reservation_project_and_name(
|
|
64
|
+
'projects/reservation-project/reservations/test-reservation',
|
|
65
|
+
'cluster-project',
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
assert project == 'reservation-project'
|
|
69
|
+
assert name == 'test-reservation'
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@patch('xpk.core.capacity.xpk_print')
|
|
73
|
+
def test_get_reservation_project_and_name_fails_for_invalid_reservation(
|
|
74
|
+
xpk_print: MagicMock, mocker
|
|
75
|
+
):
|
|
76
|
+
with pytest.raises(SystemExit):
|
|
77
|
+
get_reservation_project_and_name(
|
|
78
|
+
'invalid/reservation',
|
|
79
|
+
'cluster-project',
|
|
80
|
+
)
|
|
81
|
+
assert 'Unable to parse reservation' in xpk_print.mock_calls[0].args[0]
|
xpk/core/cluster.py
CHANGED
|
@@ -22,7 +22,7 @@ from kubernetes import config
|
|
|
22
22
|
from kubernetes.client.exceptions import ApiException
|
|
23
23
|
|
|
24
24
|
from ..utils.console import xpk_exit, xpk_print
|
|
25
|
-
from .capacity import
|
|
25
|
+
from .capacity import H200_DEVICE_TYPE
|
|
26
26
|
from .commands import (
|
|
27
27
|
run_command_for_value,
|
|
28
28
|
run_command_with_updates,
|
|
@@ -34,16 +34,11 @@ from .gcloud_context import (
|
|
|
34
34
|
zone_to_region,
|
|
35
35
|
)
|
|
36
36
|
from .resources import get_cluster_system_characteristics
|
|
37
|
-
from .system_characteristics import SystemCharacteristics
|
|
37
|
+
from .system_characteristics import INSTALLER_NCCL_TCPXO, SystemCharacteristics
|
|
38
38
|
|
|
39
39
|
JOBSET_VERSION = 'v0.8.0'
|
|
40
40
|
PATHWAYS_JOB_VERSION = 'v0.1.4'
|
|
41
|
-
INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
|
|
42
|
-
INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
|
|
43
|
-
INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
|
|
44
|
-
CONFIG_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-config.yaml'
|
|
45
41
|
NRI_DEVICE_INJECTOR = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nri_device_injector/nri-device-injector.yaml'
|
|
46
|
-
MGLRU_DISABLE = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/main/examples/gke-a3-ultragpu/mglru-disable.yaml'
|
|
47
42
|
|
|
48
43
|
DEFAULT_NAMESPACE = 'default'
|
|
49
44
|
XPK_SA = 'xpk-sa'
|
|
@@ -118,12 +113,12 @@ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
|
|
|
118
113
|
Returns:
|
|
119
114
|
0 if successful and 1 otherwise.
|
|
120
115
|
"""
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
116
|
+
nccl_installer = (
|
|
117
|
+
system.gpu_config.nccl_installer
|
|
118
|
+
if system.gpu_config and system.gpu_config.nccl_installer
|
|
119
|
+
else INSTALLER_NCCL_TCPXO
|
|
120
|
+
)
|
|
121
|
+
command = f'kubectl apply -f {nccl_installer}'
|
|
127
122
|
|
|
128
123
|
return_code = run_command_with_updates(
|
|
129
124
|
command, 'Install NCCL Plugin On Cluster'
|
|
@@ -135,35 +130,6 @@ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
|
|
|
135
130
|
)
|
|
136
131
|
return 1
|
|
137
132
|
|
|
138
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
139
|
-
command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
|
|
140
|
-
|
|
141
|
-
return_code = run_command_with_updates(
|
|
142
|
-
command, 'Install NCCL Config On Cluster'
|
|
143
|
-
)
|
|
144
|
-
|
|
145
|
-
if return_code != 0:
|
|
146
|
-
xpk_print(
|
|
147
|
-
f'Install NCCL Config On Cluster request returned ERROR {return_code}'
|
|
148
|
-
)
|
|
149
|
-
return 1
|
|
150
|
-
|
|
151
|
-
return 0
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def disable_mglru_on_cluster() -> int:
|
|
155
|
-
"""Disable MGLRU on the cluster.
|
|
156
|
-
|
|
157
|
-
Returns:
|
|
158
|
-
0 if successful and 1 otherwise.
|
|
159
|
-
"""
|
|
160
|
-
command = f'kubectl apply -f {MGLRU_DISABLE}'
|
|
161
|
-
return_code = run_command_with_updates(command, 'Disable MGLRU On Cluster')
|
|
162
|
-
|
|
163
|
-
if return_code != 0:
|
|
164
|
-
xpk_print('Disablig MGLRU On Cluster request returned ERROR')
|
|
165
|
-
return 1
|
|
166
|
-
|
|
167
133
|
return 0
|
|
168
134
|
|
|
169
135
|
|
|
@@ -309,10 +275,11 @@ def update_cluster_with_lustre_driver_if_necessary(args) -> int:
|
|
|
309
275
|
Returns:
|
|
310
276
|
0 if successful and error code otherwise.
|
|
311
277
|
"""
|
|
312
|
-
if is_driver_enabled_on_cluster(
|
|
313
|
-
args
|
|
314
|
-
|
|
315
|
-
|
|
278
|
+
if is_driver_enabled_on_cluster(args, driver='lustreCsiDriver') and (
|
|
279
|
+
not args.enable_legacy_lustre_port
|
|
280
|
+
or is_driver_enabled_on_cluster(
|
|
281
|
+
args, driver='lustreCsiDriver', config_key='enableLegacyLustrePort'
|
|
282
|
+
)
|
|
316
283
|
):
|
|
317
284
|
return 0
|
|
318
285
|
cluster_update_return_code = update_gke_cluster_with_lustre_driver_enabled(
|
|
@@ -621,9 +588,13 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
|
|
|
621
588
|
"""
|
|
622
589
|
command = (
|
|
623
590
|
'gcloud container clusters update'
|
|
624
|
-
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}
|
|
591
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
625
592
|
' --quiet'
|
|
626
593
|
)
|
|
594
|
+
if args.enable_legacy_lustre_port:
|
|
595
|
+
command += ' --enable-legacy-lustre-port'
|
|
596
|
+
else:
|
|
597
|
+
command += ' --update-addons=LustreCsiDriver=ENABLED'
|
|
627
598
|
xpk_print(
|
|
628
599
|
'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
|
|
629
600
|
)
|
|
@@ -729,78 +700,58 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
|
|
|
729
700
|
return 0
|
|
730
701
|
|
|
731
702
|
|
|
732
|
-
def
|
|
733
|
-
"""
|
|
703
|
+
def get_cluster_credentials(args) -> int:
|
|
704
|
+
"""Run cluster configuration command to set the kubectl config.
|
|
734
705
|
|
|
735
706
|
Args:
|
|
736
707
|
args: user provided arguments for running the command.
|
|
737
708
|
|
|
738
709
|
Returns:
|
|
739
|
-
0 if
|
|
710
|
+
0 if successful and 1 otherwise.
|
|
740
711
|
"""
|
|
741
|
-
|
|
742
|
-
xpk_print('Testing credentials with kubectl...')
|
|
743
|
-
kubectl_command = 'kubectl get pods'
|
|
744
|
-
kubectl_return_code, kubectl_output = run_command_for_value(
|
|
745
|
-
kubectl_command, 'kubectl get pods'
|
|
746
|
-
)
|
|
747
|
-
if kubectl_return_code == 0:
|
|
748
|
-
xpk_print('Credentials test succeeded.')
|
|
749
|
-
return 0
|
|
750
|
-
|
|
751
|
-
dns_endpoint_error = (
|
|
752
|
-
'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic'
|
|
753
|
-
' is disabled'
|
|
754
|
-
)
|
|
755
|
-
if dns_endpoint_error not in kubectl_output:
|
|
756
|
-
xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}')
|
|
757
|
-
xpk_exit(kubectl_return_code)
|
|
758
|
-
xpk_print(
|
|
759
|
-
'Detected DNS endpoint-related error. Retrying without --dns-endpoint'
|
|
760
|
-
' flag...'
|
|
761
|
-
)
|
|
762
|
-
|
|
763
712
|
location = get_cluster_location(args.project, args.cluster, args.zone)
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
)
|
|
771
|
-
return_code = run_command_with_updates(
|
|
772
|
-
without_dns_command, 'get-credentials to cluster', verbose=False
|
|
713
|
+
|
|
714
|
+
return_code = _get_credentials(
|
|
715
|
+
project=args.project,
|
|
716
|
+
cluster=args.cluster,
|
|
717
|
+
location=location,
|
|
718
|
+
dns_endpoint=True,
|
|
773
719
|
)
|
|
774
720
|
if return_code != 0:
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
721
|
+
return return_code
|
|
722
|
+
|
|
723
|
+
if not _are_credentials_valid():
|
|
724
|
+
xpk_print('Detected error. Retrying without --dns-endpoint flag...')
|
|
725
|
+
return_code = _get_credentials(
|
|
726
|
+
project=args.project,
|
|
727
|
+
cluster=args.cluster,
|
|
728
|
+
location=location,
|
|
729
|
+
dns_endpoint=False,
|
|
730
|
+
)
|
|
731
|
+
if return_code != 0:
|
|
732
|
+
return return_code
|
|
779
733
|
|
|
780
|
-
|
|
781
|
-
|
|
734
|
+
xpk_print('Finished get-credentials and kubectl setup.')
|
|
735
|
+
return 0
|
|
782
736
|
|
|
783
|
-
Args:
|
|
784
|
-
args: user provided arguments for running the command.
|
|
785
737
|
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
738
|
+
def _get_credentials(
|
|
739
|
+
project: str, cluster: str, location: str, dns_endpoint: bool
|
|
740
|
+
) -> int:
|
|
741
|
+
dns_endpoint_arg = '--dns-endpoint' if dns_endpoint else ''
|
|
790
742
|
command = (
|
|
791
743
|
'gcloud container clusters get-credentials'
|
|
792
|
-
f' {
|
|
793
|
-
f' --project={
|
|
744
|
+
f' {cluster} --location={location} {dns_endpoint_arg}'
|
|
745
|
+
f' --project={project} && kubectl config view && kubectl config'
|
|
794
746
|
' set-context --current --namespace=default'
|
|
795
747
|
)
|
|
796
|
-
task = f'get-credentials-dns-endpoint to cluster {
|
|
797
|
-
|
|
748
|
+
task = f'get-credentials-dns-endpoint to cluster {cluster}'
|
|
749
|
+
return run_command_with_updates(command, task, verbose=False)
|
|
798
750
|
|
|
799
|
-
if return_code != 0:
|
|
800
|
-
xpk_print(f'{task} returned ERROR {return_code}')
|
|
801
|
-
xpk_exit(return_code)
|
|
802
|
-
|
|
803
|
-
return_code = test_and_retry_credentials_with_dns_logic(args)
|
|
804
|
-
xpk_print('Finished get-credentials and kubectl setup.')
|
|
805
751
|
|
|
806
|
-
|
|
752
|
+
def _are_credentials_valid() -> bool:
|
|
753
|
+
kubectl_command = 'kubectl get pods'
|
|
754
|
+
kubectl_return_code = run_command_with_updates(
|
|
755
|
+
kubectl_command, 'Test kubectl credentials'
|
|
756
|
+
)
|
|
757
|
+
return kubectl_return_code == 0
|
xpk/core/cluster_test.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pytest
|
|
18
|
+
from .testing.commands_tester import CommandsTester
|
|
19
|
+
from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary
|
|
20
|
+
from pytest_mock import MockerFixture
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.fixture(autouse=True)
|
|
24
|
+
def commands_tester(mocker: MockerFixture) -> CommandsTester:
|
|
25
|
+
return CommandsTester(
|
|
26
|
+
mocker=mocker,
|
|
27
|
+
run_command_for_value_path="xpk.core.cluster.run_command_for_value",
|
|
28
|
+
run_command_with_updates_path="xpk.core.cluster.run_command_with_updates",
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.fixture(autouse=True)
|
|
33
|
+
def mock_location(mocker: MockerFixture):
|
|
34
|
+
mocker.patch(
|
|
35
|
+
"xpk.core.cluster.get_cluster_location", return_value="us-central1"
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.fixture(autouse=True)
|
|
40
|
+
def command_args(mocker: MockerFixture):
|
|
41
|
+
return mocker.Mock(cluster="cluster", project="project", zone="zone")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_get_cluster_credentials_returns_1_when_retrieval_command_fails(
|
|
45
|
+
commands_tester: CommandsTester, command_args
|
|
46
|
+
):
|
|
47
|
+
commands_tester.set_result_for_command(
|
|
48
|
+
(1, ""), "gcloud container clusters get-credentials"
|
|
49
|
+
)
|
|
50
|
+
assert get_cluster_credentials(command_args) == 1
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_get_cluster_credentials_returns_0_when_retrieval_succeeds(
|
|
54
|
+
commands_tester: CommandsTester, command_args
|
|
55
|
+
):
|
|
56
|
+
commands_tester.set_result_for_command(
|
|
57
|
+
(0, ""), "gcloud container clusters get-credentials"
|
|
58
|
+
)
|
|
59
|
+
assert get_cluster_credentials(command_args) == 0
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_get_cluster_credentials_does_not_retry_with_dns_when_retrieval_succeeds(
|
|
63
|
+
commands_tester: CommandsTester, command_args
|
|
64
|
+
):
|
|
65
|
+
commands_tester.set_result_for_command(
|
|
66
|
+
(0, ""), "gcloud container clusters get-credentials --dns-endpoint"
|
|
67
|
+
)
|
|
68
|
+
commands_tester.set_result_for_command((0, ""), "kubectl get pods")
|
|
69
|
+
get_cluster_credentials(command_args)
|
|
70
|
+
non_dns_endpoint_commands = [
|
|
71
|
+
c
|
|
72
|
+
for c in commands_tester.get_matching_commands(
|
|
73
|
+
"gcloud container clusters get-credentials"
|
|
74
|
+
)
|
|
75
|
+
if "dns-endpoint" not in c
|
|
76
|
+
]
|
|
77
|
+
assert len(non_dns_endpoint_commands) == 0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_get_cluster_credentials_retries_without_dns_when_dns_retrieval_fails(
|
|
81
|
+
commands_tester: CommandsTester, command_args
|
|
82
|
+
):
|
|
83
|
+
commands_tester.set_result_for_command(
|
|
84
|
+
(0, ""), "gcloud container clusters get-credentials --dns-endpoint"
|
|
85
|
+
)
|
|
86
|
+
commands_tester.set_result_for_command((1, ""), "kubectl get pods")
|
|
87
|
+
get_cluster_credentials(command_args)
|
|
88
|
+
non_dns_endpoint_commands = [
|
|
89
|
+
c
|
|
90
|
+
for c in commands_tester.get_matching_commands(
|
|
91
|
+
"gcloud container clusters get-credentials"
|
|
92
|
+
)
|
|
93
|
+
if "dns-endpoint" not in c
|
|
94
|
+
]
|
|
95
|
+
assert len(non_dns_endpoint_commands) == 1
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def test_update_cluster_with_lustre_driver_if_necessary_with_default_port_runs_correct_checks(
|
|
99
|
+
commands_tester: CommandsTester, command_args
|
|
100
|
+
):
|
|
101
|
+
commands_tester.set_result_for_command(
|
|
102
|
+
(0, "True"),
|
|
103
|
+
"gcloud container clusters describe",
|
|
104
|
+
)
|
|
105
|
+
command_args.enable_legacy_lustre_port = None
|
|
106
|
+
update_cluster_with_lustre_driver_if_necessary(command_args)
|
|
107
|
+
|
|
108
|
+
executed_commands = commands_tester.get_matching_commands()
|
|
109
|
+
assert executed_commands == [
|
|
110
|
+
"gcloud container clusters describe cluster --project=project"
|
|
111
|
+
" --location=us-central1"
|
|
112
|
+
' --format="value(addonsConfig.lustreCsiDriverConfig.enabled)"'
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_update_cluster_with_lustre_driver_if_necessary_with_legacy_port_runs_correct_checks(
|
|
117
|
+
commands_tester: CommandsTester, command_args
|
|
118
|
+
):
|
|
119
|
+
commands_tester.set_result_for_command(
|
|
120
|
+
(0, "True"),
|
|
121
|
+
"gcloud container clusters describe",
|
|
122
|
+
)
|
|
123
|
+
command_args.enable_legacy_lustre_port = True
|
|
124
|
+
update_cluster_with_lustre_driver_if_necessary(command_args)
|
|
125
|
+
|
|
126
|
+
executed_commands = commands_tester.get_matching_commands()
|
|
127
|
+
assert executed_commands == [
|
|
128
|
+
(
|
|
129
|
+
"gcloud container clusters describe cluster --project=project"
|
|
130
|
+
" --location=us-central1"
|
|
131
|
+
' --format="value(addonsConfig.lustreCsiDriverConfig.enabled)"'
|
|
132
|
+
),
|
|
133
|
+
(
|
|
134
|
+
"gcloud container clusters describe cluster --project=project"
|
|
135
|
+
" --location=us-central1"
|
|
136
|
+
' --format="value(addonsConfig.lustreCsiDriverConfig.enableLegacyLustrePort)"'
|
|
137
|
+
),
|
|
138
|
+
]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
|
|
142
|
+
commands_tester: CommandsTester, command_args
|
|
143
|
+
):
|
|
144
|
+
commands_tester.set_result_for_command(
|
|
145
|
+
(0, ""), "gcloud container clusters update"
|
|
146
|
+
)
|
|
147
|
+
command_args.enable_legacy_lustre_port = None
|
|
148
|
+
update_gke_cluster_with_lustre_driver_enabled(command_args)
|
|
149
|
+
|
|
150
|
+
executed_commands = commands_tester.get_matching_commands()
|
|
151
|
+
assert executed_commands == [
|
|
152
|
+
"gcloud container clusters update cluster --project=project"
|
|
153
|
+
" --location=us-central1 --quiet --update-addons=LustreCsiDriver=ENABLED"
|
|
154
|
+
]
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
|
|
158
|
+
commands_tester: CommandsTester, command_args
|
|
159
|
+
):
|
|
160
|
+
commands_tester.set_result_for_command(
|
|
161
|
+
(0, ""), "gcloud container clusters update"
|
|
162
|
+
)
|
|
163
|
+
command_args.enable_legacy_lustre_port = True
|
|
164
|
+
update_gke_cluster_with_lustre_driver_enabled(command_args)
|
|
165
|
+
|
|
166
|
+
executed_commands = commands_tester.get_matching_commands()
|
|
167
|
+
assert executed_commands == [
|
|
168
|
+
"gcloud container clusters update cluster --project=project"
|
|
169
|
+
" --location=us-central1 --quiet --enable-legacy-lustre-port"
|
|
170
|
+
]
|
xpk/core/commands.py
CHANGED
|
@@ -195,16 +195,13 @@ def run_command_with_updates(command, task, verbose=True) -> int:
|
|
|
195
195
|
return_code = child.poll()
|
|
196
196
|
if return_code is None:
|
|
197
197
|
xpk_print(f'Waiting for `{task}`, for {i} seconds...', end='\r')
|
|
198
|
-
time.sleep(
|
|
199
|
-
i +=
|
|
198
|
+
time.sleep(10)
|
|
199
|
+
i += 10
|
|
200
200
|
else:
|
|
201
201
|
xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
|
|
202
202
|
return return_code
|
|
203
203
|
else:
|
|
204
|
-
xpk_print(
|
|
205
|
-
f'Task: `{task}` is implemented by `{command}`, hiding output unless'
|
|
206
|
-
' there is an error.'
|
|
207
|
-
)
|
|
204
|
+
xpk_print(f'Task: `{task}` is implemented by `{command}`')
|
|
208
205
|
try:
|
|
209
206
|
subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
|
|
210
207
|
except subprocess.CalledProcessError as e:
|
|
@@ -277,10 +274,7 @@ def run_command_for_value(
|
|
|
277
274
|
return return_code, f'{out_str}\n{err_str}'
|
|
278
275
|
else:
|
|
279
276
|
if not quiet:
|
|
280
|
-
xpk_print(
|
|
281
|
-
f'Task: `{task}` is implemented by `{command}`, hiding output unless'
|
|
282
|
-
' there is an error.'
|
|
283
|
-
)
|
|
277
|
+
xpk_print(f'Task: `{task}` is implemented by `{command}`')
|
|
284
278
|
try:
|
|
285
279
|
output = subprocess.check_output(
|
|
286
280
|
command,
|