xpk 0.14.4__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/commands/cluster.py +57 -21
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +11 -2
- xpk/commands/cluster_test.py +233 -12
- xpk/commands/config.py +3 -5
- xpk/commands/kind.py +1 -1
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +28 -12
- xpk/commands/workload_test.py +3 -3
- xpk/core/blueprint/blueprint_generator.py +70 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/capacity.py +46 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +37 -57
- xpk/core/cluster_test.py +95 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +9 -2
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +6 -9
- xpk/core/kueue_manager.py +192 -32
- xpk/core/kueue_manager_test.py +132 -4
- xpk/core/nodepool.py +21 -29
- xpk/core/nodepool_test.py +17 -15
- xpk/core/scheduling.py +16 -1
- xpk/core/scheduling_test.py +85 -6
- xpk/core/system_characteristics.py +77 -19
- xpk/core/system_characteristics_test.py +80 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/main.py +31 -13
- xpk/parser/cluster.py +48 -9
- xpk/parser/cluster_test.py +42 -3
- xpk/parser/workload.py +12 -0
- xpk/parser/workload_test.py +4 -4
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +7 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +0 -11
- xpk/utils/versions.py +31 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/METADATA +113 -92
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/RECORD +58 -48
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/top_level.txt +0 -0
|
@@ -18,6 +18,7 @@ from xpk.commands.cluster_gcluster import get_unique_name
|
|
|
18
18
|
from xpk.core.docker_manager import DockerManager
|
|
19
19
|
from xpk.core.gcluster_manager import GclusterManager
|
|
20
20
|
from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
|
|
21
|
+
from xpk.utils.versions import ReleaseChannel
|
|
21
22
|
import pytest
|
|
22
23
|
import os
|
|
23
24
|
import shutil
|
|
@@ -28,6 +29,8 @@ region = os.getenv("REGION")
|
|
|
28
29
|
zone = os.getenv("ZONE")
|
|
29
30
|
auth_cidr = os.getenv("AUTH_CIDR")
|
|
30
31
|
cluster_name = os.getenv("A3_MEGA_TEST_CLUSTER_NAME")
|
|
32
|
+
release_channel = os.getenv("RELEASE_CHANNEL")
|
|
33
|
+
cluster_version = os.getenv("CLUSTER_VERSION")
|
|
31
34
|
|
|
32
35
|
uploads_dir = "uploads"
|
|
33
36
|
|
|
@@ -87,6 +90,8 @@ def test_create_a3_mega_deployment_files(setup_tests):
|
|
|
87
90
|
assert auth_cidr is not None
|
|
88
91
|
assert ctk_gcloud_cfg is not None
|
|
89
92
|
assert cluster_name is not None
|
|
93
|
+
assert release_channel is not None
|
|
94
|
+
assert cluster_version is not None
|
|
90
95
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
91
96
|
|
|
92
97
|
blueprint_name = f"{cluster_name}-a3-mega-xpk"
|
|
@@ -107,6 +112,8 @@ def test_create_a3_mega_deployment_files(setup_tests):
|
|
|
107
112
|
auth_cidr=auth_cidr,
|
|
108
113
|
zone=zone,
|
|
109
114
|
system_node_pool_min_node_count=3,
|
|
115
|
+
release_channel=ReleaseChannel(release_channel),
|
|
116
|
+
cluster_version=cluster_version,
|
|
110
117
|
)
|
|
111
118
|
blueprint_test_path = os.path.join(bp_path, prefix, f"{blueprint_name}.yaml")
|
|
112
119
|
blueprint_deps_test_path = os.path.join(bp_path, prefix, blueprint_name)
|
|
@@ -164,6 +171,8 @@ def create_test_a3_mega_deployment(docker_path: str, bp_path: str):
|
|
|
164
171
|
assert auth_cidr is not None
|
|
165
172
|
assert ctk_gcloud_cfg is not None
|
|
166
173
|
assert cluster_name is not None
|
|
174
|
+
assert release_channel is not None
|
|
175
|
+
assert cluster_version is not None
|
|
167
176
|
|
|
168
177
|
blueprint_name = f"{cluster_name}-a3-mega-xpk"
|
|
169
178
|
prefix = "prefix"
|
|
@@ -183,6 +192,8 @@ def create_test_a3_mega_deployment(docker_path: str, bp_path: str):
|
|
|
183
192
|
auth_cidr=auth_cidr,
|
|
184
193
|
zone=zone,
|
|
185
194
|
system_node_pool_min_node_count=3,
|
|
195
|
+
release_channel=ReleaseChannel(release_channel),
|
|
196
|
+
cluster_version=cluster_version,
|
|
186
197
|
)
|
|
187
198
|
|
|
188
199
|
gcluster_manager = GclusterManager(
|
|
@@ -24,6 +24,7 @@ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
|
|
|
24
24
|
from xpk.core.capacity import CapacityType
|
|
25
25
|
from xpk.core.docker_manager import DockerManager
|
|
26
26
|
from xpk.core.gcluster_manager import GclusterManager
|
|
27
|
+
from xpk.utils.versions import ReleaseChannel
|
|
27
28
|
|
|
28
29
|
ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH")
|
|
29
30
|
project_id = os.getenv("PROJECT_ID")
|
|
@@ -31,6 +32,8 @@ region = os.getenv("REGION")
|
|
|
31
32
|
zone = os.getenv("ZONE")
|
|
32
33
|
auth_cidr = os.getenv("AUTH_CIDR")
|
|
33
34
|
cluster_name = os.getenv("A3_ULTRA_TEST_CLUSTER_NAME")
|
|
35
|
+
release_channel = os.getenv("RELEASE_CHANNEL")
|
|
36
|
+
cluster_version = os.getenv("CLUSTER_VERSION")
|
|
34
37
|
|
|
35
38
|
|
|
36
39
|
@pytest.fixture(name="setup_tests")
|
|
@@ -60,6 +63,8 @@ def test_create_a3_ultra_deployment_files(setup_tests):
|
|
|
60
63
|
assert auth_cidr is not None
|
|
61
64
|
assert ctk_gcloud_cfg is not None
|
|
62
65
|
assert cluster_name is not None
|
|
66
|
+
assert release_channel is not None
|
|
67
|
+
assert cluster_version is not None
|
|
63
68
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
64
69
|
blueprint_name = f"{cluster_name}-a3-ultra-xpk"
|
|
65
70
|
|
|
@@ -80,6 +85,8 @@ def test_create_a3_ultra_deployment_files(setup_tests):
|
|
|
80
85
|
num_nodes=1,
|
|
81
86
|
system_node_pool_machine_type="e2-standard-16",
|
|
82
87
|
prefix=prefix,
|
|
88
|
+
release_channel=ReleaseChannel(release_channel),
|
|
89
|
+
cluster_version=cluster_version,
|
|
83
90
|
)
|
|
84
91
|
blueprint_test_path = os.path.join(bp_path, prefix, f"{blueprint_name}.yaml")
|
|
85
92
|
blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
|
|
@@ -125,6 +132,8 @@ def test_create_a3_ultra_deployment(setup_tests):
|
|
|
125
132
|
assert auth_cidr is not None
|
|
126
133
|
assert ctk_gcloud_cfg is not None
|
|
127
134
|
assert cluster_name is not None
|
|
135
|
+
assert release_channel is not None
|
|
136
|
+
assert cluster_version is not None
|
|
128
137
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
129
138
|
blueprint_name = f"{cluster_name}-a3-ultra-xpk"
|
|
130
139
|
|
|
@@ -144,6 +153,8 @@ def test_create_a3_ultra_deployment(setup_tests):
|
|
|
144
153
|
capacity_type=CapacityType.SPOT,
|
|
145
154
|
num_nodes=1,
|
|
146
155
|
system_node_pool_machine_type="e2-standard-16",
|
|
156
|
+
release_channel=ReleaseChannel(release_channel),
|
|
157
|
+
cluster_version=cluster_version,
|
|
147
158
|
)
|
|
148
159
|
blueprint_test_path = os.path.join(bp_path, f"{blueprint_name}.yaml")
|
|
149
160
|
blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
|
integration/gcluster_a4_test.py
CHANGED
|
@@ -24,6 +24,7 @@ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
|
|
|
24
24
|
from xpk.core.capacity import CapacityType
|
|
25
25
|
from xpk.core.docker_manager import DockerManager
|
|
26
26
|
from xpk.core.gcluster_manager import GclusterManager
|
|
27
|
+
from xpk.utils.versions import ReleaseChannel
|
|
27
28
|
|
|
28
29
|
ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH")
|
|
29
30
|
project_id = os.getenv("PROJECT_ID")
|
|
@@ -31,6 +32,8 @@ region = os.getenv("REGION")
|
|
|
31
32
|
zone = os.getenv("ZONE")
|
|
32
33
|
auth_cidr = os.getenv("AUTH_CIDR")
|
|
33
34
|
cluster_name = os.getenv("A4_TEST_CLUSTER_NAME")
|
|
35
|
+
release_channel = os.getenv("RELEASE_CHANNEL")
|
|
36
|
+
cluster_version = os.getenv("CLUSTER_VERSION")
|
|
34
37
|
|
|
35
38
|
|
|
36
39
|
@pytest.fixture(name="setup_tests")
|
|
@@ -60,6 +63,8 @@ def test_create_a4_deployment_files(setup_tests):
|
|
|
60
63
|
assert auth_cidr is not None
|
|
61
64
|
assert ctk_gcloud_cfg is not None
|
|
62
65
|
assert cluster_name is not None
|
|
66
|
+
assert release_channel is not None
|
|
67
|
+
assert cluster_version is not None
|
|
63
68
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
64
69
|
blueprint_name = f"{cluster_name}-a4-xpk"
|
|
65
70
|
|
|
@@ -80,6 +85,8 @@ def test_create_a4_deployment_files(setup_tests):
|
|
|
80
85
|
num_nodes=1,
|
|
81
86
|
system_node_pool_machine_type="e2-standard-16",
|
|
82
87
|
prefix=prefix,
|
|
88
|
+
release_channel=ReleaseChannel(release_channel),
|
|
89
|
+
cluster_version=cluster_version,
|
|
83
90
|
)
|
|
84
91
|
blueprint_test_path = os.path.join(bp_path, prefix, f"{blueprint_name}.yaml")
|
|
85
92
|
blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
|
|
@@ -125,6 +132,8 @@ def test_create_a4_deployment(setup_tests):
|
|
|
125
132
|
assert auth_cidr is not None
|
|
126
133
|
assert ctk_gcloud_cfg is not None
|
|
127
134
|
assert cluster_name is not None
|
|
135
|
+
assert release_channel is not None
|
|
136
|
+
assert cluster_version is not None
|
|
128
137
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
129
138
|
blueprint_name = f"{cluster_name}-a4-xpk"
|
|
130
139
|
|
|
@@ -144,6 +153,8 @@ def test_create_a4_deployment(setup_tests):
|
|
|
144
153
|
capacity_type=CapacityType.SPOT,
|
|
145
154
|
num_nodes=1,
|
|
146
155
|
system_node_pool_machine_type="e2-standard-16",
|
|
156
|
+
release_channel=ReleaseChannel(release_channel),
|
|
157
|
+
cluster_version=cluster_version,
|
|
147
158
|
)
|
|
148
159
|
blueprint_test_path = os.path.join(bp_path, f"{blueprint_name}.yaml")
|
|
149
160
|
blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
|
xpk/commands/cluster.py
CHANGED
|
@@ -17,6 +17,7 @@ limitations under the License.
|
|
|
17
17
|
from tabulate import tabulate
|
|
18
18
|
|
|
19
19
|
from ..utils.feature_flags import FeatureFlags
|
|
20
|
+
from ..utils.versions import ReleaseChannel
|
|
20
21
|
from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE, get_reservation_deployment_type
|
|
21
22
|
from ..core.cluster import (
|
|
22
23
|
get_all_clusters_programmatic,
|
|
@@ -38,6 +39,8 @@ from ..core.cluster import (
|
|
|
38
39
|
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
|
|
39
40
|
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
40
41
|
from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG
|
|
42
|
+
from ..core.telemetry import MetricsCollector, MetricsEventMetadataKey
|
|
43
|
+
from ..core.capacity import get_capacity_type
|
|
41
44
|
from ..core.gcloud_context import (
|
|
42
45
|
add_zone_and_project,
|
|
43
46
|
get_gke_control_plane_version,
|
|
@@ -71,9 +74,9 @@ from ..core.system_characteristics import (
|
|
|
71
74
|
)
|
|
72
75
|
from ..core.vertex import create_vertex_tensorboard
|
|
73
76
|
from ..core.workload import get_workload_list
|
|
74
|
-
from ..utils.console import
|
|
77
|
+
from ..utils.console import ask_for_user_consent, xpk_exit, xpk_print
|
|
75
78
|
from ..utils.file import write_tmp_file
|
|
76
|
-
from ..utils.execution_context import is_dry_run
|
|
79
|
+
from ..utils.execution_context import is_dry_run, is_quiet
|
|
77
80
|
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
78
81
|
from . import cluster_gcluster
|
|
79
82
|
from .common import set_cluster_command, validate_sub_slicing_system
|
|
@@ -263,15 +266,15 @@ def cluster_create(args) -> None:
|
|
|
263
266
|
xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
|
|
264
267
|
add_zone_and_project(args)
|
|
265
268
|
|
|
266
|
-
|
|
267
|
-
xpk_print(
|
|
268
|
-
'Creating the cluster using Cluster Toolkit. Machine Type:'
|
|
269
|
-
f' {system.gce_machine_type} ...'
|
|
270
|
-
)
|
|
271
|
-
cluster_gcluster.cluster_create(args)
|
|
272
|
-
xpk_exit(0)
|
|
269
|
+
_log_cluster_create_telemetry(args)
|
|
273
270
|
|
|
274
|
-
|
|
271
|
+
release_channel = (
|
|
272
|
+
ReleaseChannel.REGULAR if args.gke_version else ReleaseChannel.RAPID
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
return_code, gke_server_config = get_gke_server_config(
|
|
276
|
+
args, release_channel=release_channel
|
|
277
|
+
)
|
|
275
278
|
if return_code != 0 or gke_server_config is None:
|
|
276
279
|
xpk_exit(return_code)
|
|
277
280
|
|
|
@@ -281,8 +284,20 @@ def cluster_create(args) -> None:
|
|
|
281
284
|
if return_code != 0 or gke_control_plane_version is None:
|
|
282
285
|
xpk_exit(return_code)
|
|
283
286
|
|
|
287
|
+
if system.device_type in cluster_gcluster.supported_device_types:
|
|
288
|
+
xpk_print(
|
|
289
|
+
'Creating the cluster using Cluster Toolkit. Machine Type:'
|
|
290
|
+
f' {system.gce_machine_type} ...'
|
|
291
|
+
)
|
|
292
|
+
cluster_gcluster.cluster_create(
|
|
293
|
+
args,
|
|
294
|
+
gke_control_plane_version=gke_control_plane_version,
|
|
295
|
+
release_channel=release_channel,
|
|
296
|
+
)
|
|
297
|
+
xpk_exit(0)
|
|
298
|
+
|
|
284
299
|
create_cluster_command_code = create_cluster_if_necessary(
|
|
285
|
-
args, gke_control_plane_version, system
|
|
300
|
+
args, gke_control_plane_version, system, release_channel=release_channel
|
|
286
301
|
)
|
|
287
302
|
if create_cluster_command_code != 0:
|
|
288
303
|
xpk_exit(create_cluster_command_code)
|
|
@@ -1022,7 +1037,10 @@ def update_coredns_if_necessary() -> int:
|
|
|
1022
1037
|
|
|
1023
1038
|
|
|
1024
1039
|
def create_cluster_if_necessary(
|
|
1025
|
-
args,
|
|
1040
|
+
args,
|
|
1041
|
+
gke_control_plane_version: str,
|
|
1042
|
+
system: SystemCharacteristics,
|
|
1043
|
+
release_channel: ReleaseChannel,
|
|
1026
1044
|
) -> int:
|
|
1027
1045
|
"""Creates cluster if not present in the project.
|
|
1028
1046
|
|
|
@@ -1043,7 +1061,7 @@ def create_cluster_if_necessary(
|
|
|
1043
1061
|
return 0
|
|
1044
1062
|
else:
|
|
1045
1063
|
return run_gke_cluster_create_command(
|
|
1046
|
-
args, gke_control_plane_version, system
|
|
1064
|
+
args, gke_control_plane_version, system, release_channel=release_channel
|
|
1047
1065
|
)
|
|
1048
1066
|
|
|
1049
1067
|
|
|
@@ -1056,7 +1074,7 @@ def run_gke_cluster_delete_command(args) -> int:
|
|
|
1056
1074
|
Returns:
|
|
1057
1075
|
0 if successful and 1 otherwise.
|
|
1058
1076
|
"""
|
|
1059
|
-
if not
|
|
1077
|
+
if not is_quiet():
|
|
1060
1078
|
xpk_print('Get the name of the workloads in the cluster.')
|
|
1061
1079
|
args.filter_by_status = 'EVERYTHING'
|
|
1062
1080
|
return_code, return_value = get_workload_list(args)
|
|
@@ -1067,10 +1085,9 @@ def run_gke_cluster_delete_command(args) -> int:
|
|
|
1067
1085
|
# Ignore Column Names line.
|
|
1068
1086
|
if len(return_value) > 1:
|
|
1069
1087
|
workloads = [x.split(' ')[0] for x in return_value.splitlines()][1:]
|
|
1070
|
-
if workloads and not
|
|
1088
|
+
if workloads and not ask_for_user_consent(
|
|
1071
1089
|
f'Planning to delete {len(workloads)} workloads in the cluster'
|
|
1072
|
-
f' {args.cluster} including {workloads}. \nDo you wish to delete
|
|
1073
|
-
' (yes) / n (no):\n'
|
|
1090
|
+
f' {args.cluster} including {workloads}. \nDo you wish to delete?'
|
|
1074
1091
|
):
|
|
1075
1092
|
xpk_print('Skipping delete command.')
|
|
1076
1093
|
return 0
|
|
@@ -1115,7 +1132,10 @@ def run_gke_clusters_list_command(args) -> int:
|
|
|
1115
1132
|
|
|
1116
1133
|
|
|
1117
1134
|
def run_gke_cluster_create_command(
|
|
1118
|
-
args,
|
|
1135
|
+
args,
|
|
1136
|
+
gke_control_plane_version: str,
|
|
1137
|
+
system: SystemCharacteristics,
|
|
1138
|
+
release_channel: ReleaseChannel,
|
|
1119
1139
|
) -> int:
|
|
1120
1140
|
"""Run the Create GKE Cluster request.
|
|
1121
1141
|
|
|
@@ -1155,9 +1175,10 @@ def run_gke_cluster_create_command(
|
|
|
1155
1175
|
' --enable-dns-access'
|
|
1156
1176
|
' --autoscaling-profile=optimize-utilization'
|
|
1157
1177
|
' --labels=gke_product_type=xpk'
|
|
1178
|
+
f' --release-channel={release_channel.value.lower()}'
|
|
1158
1179
|
)
|
|
1159
1180
|
|
|
1160
|
-
if args.gke_version
|
|
1181
|
+
if args.gke_version:
|
|
1161
1182
|
command += ' --no-enable-autoupgrade'
|
|
1162
1183
|
|
|
1163
1184
|
enable_ip_alias = False
|
|
@@ -1285,7 +1306,7 @@ def _install_kueue(
|
|
|
1285
1306
|
else:
|
|
1286
1307
|
# Determine total chips based on user specified topology.
|
|
1287
1308
|
total_chips = get_total_chips_requested_from_args(args, system)
|
|
1288
|
-
kueue_manager = KueueManager()
|
|
1309
|
+
kueue_manager = KueueManager(args.project, args.zone)
|
|
1289
1310
|
return kueue_manager.install_or_upgrade(
|
|
1290
1311
|
KueueConfig(
|
|
1291
1312
|
system,
|
|
@@ -1299,7 +1320,7 @@ def _install_kueue(
|
|
|
1299
1320
|
configure_sub_slicing=(
|
|
1300
1321
|
FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
|
|
1301
1322
|
),
|
|
1302
|
-
)
|
|
1323
|
+
)
|
|
1303
1324
|
)
|
|
1304
1325
|
|
|
1305
1326
|
|
|
@@ -1320,3 +1341,18 @@ def prepare_gpus(system: SystemCharacteristics):
|
|
|
1320
1341
|
err_code = disable_mglru_on_cluster()
|
|
1321
1342
|
if err_code > 0:
|
|
1322
1343
|
xpk_exit(err_code)
|
|
1344
|
+
|
|
1345
|
+
|
|
1346
|
+
def _log_cluster_create_telemetry(args) -> None:
|
|
1347
|
+
if FeatureFlags.TELEMETRY_ENABLED:
|
|
1348
|
+
capacity_type, _ = get_capacity_type(args)
|
|
1349
|
+
MetricsCollector.log_custom(
|
|
1350
|
+
name='cluster_create',
|
|
1351
|
+
metadata={
|
|
1352
|
+
MetricsEventMetadataKey.ZONE: args.zone,
|
|
1353
|
+
MetricsEventMetadataKey.SYSTEM_CHARACTERISTICS: (
|
|
1354
|
+
args.tpu_type if args.tpu_type else args.device_type
|
|
1355
|
+
),
|
|
1356
|
+
MetricsEventMetadataKey.PROVISIONING_MODE: capacity_type.value,
|
|
1357
|
+
},
|
|
1358
|
+
)
|
xpk/commands/cluster_gcluster.py
CHANGED
|
@@ -17,6 +17,7 @@ limitations under the License.
|
|
|
17
17
|
import os
|
|
18
18
|
|
|
19
19
|
from ..utils.feature_flags import FeatureFlags
|
|
20
|
+
from ..utils.versions import ReleaseChannel
|
|
20
21
|
from ..utils.execution_context import is_dry_run
|
|
21
22
|
from ..core.kueue_manager import KueueConfig, KueueManager
|
|
22
23
|
from ..core.nap import enable_autoprovisioning_on_cluster
|
|
@@ -51,11 +52,15 @@ gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
|
|
|
51
52
|
gcloud_cfg_path = os.path.expanduser('~/.config/gcloud')
|
|
52
53
|
|
|
53
54
|
|
|
54
|
-
def cluster_create(args) -> None:
|
|
55
|
+
def cluster_create(
|
|
56
|
+
args, gke_control_plane_version: str, release_channel: ReleaseChannel
|
|
57
|
+
) -> None:
|
|
55
58
|
"""Function around cluster creation using Cluster toolkit.
|
|
56
59
|
|
|
57
60
|
Args:
|
|
58
61
|
args: user provided arguments for running the command.
|
|
62
|
+
gke_control_plane_version: the GKE version used for the new cluster.
|
|
63
|
+
release_channel: the release channel used for the new cluster.
|
|
59
64
|
|
|
60
65
|
Returns:
|
|
61
66
|
0 if successful and 1 otherwise.
|
|
@@ -79,7 +84,13 @@ def cluster_create(args) -> None:
|
|
|
79
84
|
)
|
|
80
85
|
gcm = prepare_gcluster_manager(remote_state_client)
|
|
81
86
|
|
|
82
|
-
bp = generate_blueprint(
|
|
87
|
+
bp = generate_blueprint(
|
|
88
|
+
blueprint_name=unique_name,
|
|
89
|
+
args=args,
|
|
90
|
+
prefix=prefix,
|
|
91
|
+
gke_control_plane_version=gke_control_plane_version,
|
|
92
|
+
release_channel=release_channel,
|
|
93
|
+
)
|
|
83
94
|
|
|
84
95
|
# staging: sending the blueprint file(s) to gcluster's working directory
|
|
85
96
|
if is_dry_run():
|
|
@@ -141,7 +152,7 @@ def __install_kueue(args) -> int:
|
|
|
141
152
|
else:
|
|
142
153
|
# Determine total chips based on user specified topology.
|
|
143
154
|
total_chips = get_total_chips_requested_from_args(args, system)
|
|
144
|
-
kueue_manager = KueueManager()
|
|
155
|
+
kueue_manager = KueueManager(args.project, args.zone)
|
|
145
156
|
|
|
146
157
|
tolerations = [{
|
|
147
158
|
'key': 'components.gke.io/gke-managed-components',
|
|
@@ -149,7 +160,6 @@ def __install_kueue(args) -> int:
|
|
|
149
160
|
'value': 'true',
|
|
150
161
|
'effect': 'NoSchedule',
|
|
151
162
|
}]
|
|
152
|
-
|
|
153
163
|
kueue_manager.install_or_upgrade(
|
|
154
164
|
KueueConfig(
|
|
155
165
|
system,
|
|
@@ -287,7 +297,11 @@ def validate_state_gcs_bucket(args):
|
|
|
287
297
|
|
|
288
298
|
|
|
289
299
|
def generate_blueprint(
|
|
290
|
-
blueprint_name,
|
|
300
|
+
blueprint_name,
|
|
301
|
+
args,
|
|
302
|
+
gke_control_plane_version: str,
|
|
303
|
+
release_channel: ReleaseChannel,
|
|
304
|
+
prefix=None,
|
|
291
305
|
) -> BlueprintGeneratorOutput:
|
|
292
306
|
capacity_type, return_code = get_capacity_type(args)
|
|
293
307
|
if return_code != 0:
|
|
@@ -342,6 +356,8 @@ def generate_blueprint(
|
|
|
342
356
|
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
|
|
343
357
|
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
|
|
344
358
|
gcs_bucket=args.cluster_state_gcs_bucket,
|
|
359
|
+
cluster_version=gke_control_plane_version,
|
|
360
|
+
release_channel=release_channel,
|
|
345
361
|
)
|
|
346
362
|
if args.device_type == a3ultra_device_type:
|
|
347
363
|
num_nodes = args.num_nodes if not args.num_nodes is None else 2
|
|
@@ -360,6 +376,8 @@ def generate_blueprint(
|
|
|
360
376
|
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
|
|
361
377
|
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
|
|
362
378
|
gcs_bucket=args.cluster_state_gcs_bucket,
|
|
379
|
+
cluster_version=gke_control_plane_version,
|
|
380
|
+
release_channel=release_channel,
|
|
363
381
|
)
|
|
364
382
|
if args.device_type == a4_device_type:
|
|
365
383
|
num_nodes = args.num_nodes if not args.num_nodes is None else 2
|
|
@@ -376,6 +394,8 @@ def generate_blueprint(
|
|
|
376
394
|
capacity_type=capacity_type,
|
|
377
395
|
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
|
|
378
396
|
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
|
|
397
|
+
cluster_version=gke_control_plane_version,
|
|
398
|
+
release_channel=release_channel,
|
|
379
399
|
)
|
|
380
400
|
xpk_print('Device type is not supported.')
|
|
381
401
|
xpk_exit(1)
|
|
@@ -21,6 +21,7 @@ import pytest
|
|
|
21
21
|
from xpk.commands.cluster_gcluster import cluster_create
|
|
22
22
|
from xpk.core.kueue_manager import KueueConfig
|
|
23
23
|
from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
|
|
24
|
+
from xpk.utils.versions import ReleaseChannel
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
@pytest.fixture
|
|
@@ -103,7 +104,11 @@ def test_install_kueue_standard(
|
|
|
103
104
|
)
|
|
104
105
|
mock_get_total_chips.return_value = 16
|
|
105
106
|
|
|
106
|
-
cluster_create(
|
|
107
|
+
cluster_create(
|
|
108
|
+
mock_args,
|
|
109
|
+
release_channel=ReleaseChannel.RAPID,
|
|
110
|
+
gke_control_plane_version="1.2.3",
|
|
111
|
+
)
|
|
107
112
|
|
|
108
113
|
mock_cluster_create_deps["xpk_exit"].assert_called_with(0)
|
|
109
114
|
mock_kueue_manager = mock_cluster_create_deps["KueueManager"]
|
|
@@ -153,7 +158,11 @@ def test_install_kueue_with_autoprovisioning(
|
|
|
153
158
|
mock_autoprovisioning_config.maximum_chips = 128
|
|
154
159
|
mock_enable_autoprovisioning.return_value = (mock_autoprovisioning_config, 0)
|
|
155
160
|
|
|
156
|
-
cluster_create(
|
|
161
|
+
cluster_create(
|
|
162
|
+
mock_args,
|
|
163
|
+
release_channel=ReleaseChannel.RAPID,
|
|
164
|
+
gke_control_plane_version="1.2.3",
|
|
165
|
+
)
|
|
157
166
|
|
|
158
167
|
mock_cluster_create_deps["xpk_exit"].assert_called_with(0)
|
|
159
168
|
mock_enable_autoprovisioning.assert_called_once_with(mock_args, mock_system)
|