xpk 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -12
- xpk/commands/cluster.py +33 -16
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +23 -20
- xpk/commands/run.py +17 -11
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +64 -19
- xpk/commands/workload.py +154 -319
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +322 -32
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +75 -5
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +11 -3
- xpk/core/gcsfuse.py +8 -5
- xpk/core/kjob.py +57 -18
- xpk/core/nap.py +4 -0
- xpk/core/network.py +11 -21
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +27 -82
- xpk/core/workload_decorators/rdma_decorator.py +3 -3
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -2
- xpk/parser/cluster.py +15 -6
- xpk/parser/storage.py +14 -3
- xpk/parser/workload.py +59 -31
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/METADATA +60 -4
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/RECORD +40 -40
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/top_level.txt +0 -0
xpk/core/docker_resources.py
CHANGED

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
+from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
 from .cluster import setup_k8s_env
 from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, Storage, get_storages_to_mount
 from .system_characteristics import AcceleratorType, SystemCharacteristics
@@ -64,22 +64,6 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
   str:
     YAML with the env config for the main container, as a YAML string.
   """
-  pw_env_yaml = """
-                - name: XCLOUD_ENVIRONMENT
-                  value: GCP
-                - name: JAX_PLATFORMS
-                  value: proxy
-                - name: JAX_BACKEND_TARGET
-                  value: {proxy_address}
-                - name: JOBSET_NAME
-                  valueFrom:
-                    fieldRef:
-                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']"""
-  if args.use_pathways:
-    return pw_env_yaml.format(
-        args=args, proxy_address=args.pathways_proxy_address
-    )
-
   gpu_env_yaml = """
                 - name: REPLICATED_JOB_NAME
                   valueFrom:
@@ -182,11 +166,14 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
               name: dshm-2
           """
 
-  if args.ramdisk_directory != '':
-    volumes += f"""
+  if hasattr(args, 'ramdisk_directory') and args.ramdisk_directory != '':
+    driver = 'phase1-checkpoint.csi.storage.gke.io'
+    if hasattr(args, 'mtc_enabled') and args.mtc_enabled:
+      driver = 'multitier-checkpoint.csi.storage.gke.io'
+    volumes += f"""
                 - name: cache
                   csi:
-                    driver:
+                    driver: {driver}"""
 
   if (
       system.accelerator_type == AcceleratorType['TPU']
@@ -229,7 +216,7 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
               name: dshm-2
           """
 
-  if args.ramdisk_directory != '':
+  if hasattr(args, 'ramdisk_directory') and args.ramdisk_directory != '':
     volume_mount_yaml += f"""
                 - mountPath: /{args.ramdisk_directory}
                   name: cache"""
@@ -262,6 +249,7 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
   elif (
       system.device_type == H100_MEGA_DEVICE_TYPE
       or system.device_type == H200_DEVICE_TYPE
+      or system.device_type == B200_DEVICE_TYPE
   ):
     volume_mount_yaml = ''
 
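Note on the get_volumes change: the ramdisk handling now reads ramdisk_directory and mtc_enabled through hasattr guards, so callers whose argparse namespace never defines those attributes keep working. A minimal sketch of the guard-and-select pattern (the Namespace values here are made up, not xpk defaults):

from argparse import Namespace

def pick_checkpoint_driver(args: Namespace):
  # No ramdisk requested (or the attribute is absent on this code path):
  # no cache volume is emitted at all.
  if not (hasattr(args, 'ramdisk_directory') and args.ramdisk_directory != ''):
    return None
  # Default single-tier checkpoint driver; multi-tier checkpointing (MTC)
  # swaps in its own CSI driver.
  driver = 'phase1-checkpoint.csi.storage.gke.io'
  if hasattr(args, 'mtc_enabled') and args.mtc_enabled:
    driver = 'multitier-checkpoint.csi.storage.gke.io'
  return driver

print(pick_checkpoint_driver(Namespace(ramdisk_directory='ckpt', mtc_enabled=True)))
# -> multitier-checkpoint.csi.storage.gke.io
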
xpk/core/filestore.py
CHANGED

@@ -200,7 +200,9 @@ class FilestoreClient:
     ] = f"projects/{self.project}/global/networks/{network}"
     return data
 
-  def create_pv(self, name: str, vol: str, access_mode: str) -> dict:
+  def create_pv(
+      self, name: str, vol: str, access_mode: str, mount_options: str
+  ) -> dict:
     """Create a yaml representing filestore PersistentVolume."""
     data = templates.load(FS_PV_PATH)
     data["metadata"]["name"] = get_pv_name(name)
@@ -215,6 +217,7 @@ class FilestoreClient:
         0
     ].ip_addresses[0]
     data["spec"]["csi"]["volumeAttributes"]["volume"] = vol
+    data["spec"]["mountOptions"] = mount_options.split(",")
     return data
 
   def create_pvc(self, name: str, access_mode: str) -> dict:
@@ -230,10 +233,15 @@
     return data
 
   def manifest(
-      self, name: str, vol: str, access_mode: str, network: str,
+      self,
+      name: str,
+      vol: str,
+      access_mode: str,
+      network: str,
+      mount_options: str,
   ) -> list[dict]:
     self.load_instance()
-    pv = self.create_pv(name, vol, access_mode)
+    pv = self.create_pv(name, vol, access_mode, mount_options)
     pvc = self.create_pvc(name, access_mode)
     sc = self.create_sc(name, network)
     return [pv, pvc, sc]
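Note on the mount_options plumbing: the option string travels as a single comma-separated value and is only split into list form at the point where the PersistentVolume spec needs it. A toy illustration of the resulting spec fragment (the NFS options shown are examples, not xpk defaults):

mount_options = 'nconnect=16,hard,timeo=600'
pv_fragment = {'spec': {'mountOptions': mount_options.split(',')}}
print(pv_fragment)
# -> {'spec': {'mountOptions': ['nconnect=16', 'hard', 'timeo=600']}}
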
xpk/core/gcsfuse.py
CHANGED

@@ -20,11 +20,12 @@ FUSE_PV_PATH = "/../templates/fuse-pv.yaml"
 FUSE_PVC_PATH = "/../templates/fuse-pvc.yaml"
 
 
-def create_pv(name: str, size: int, bucket: str) -> dict:
+def create_pv(name: str, size: int, bucket: str, mount_options: str) -> dict:
   data = templates.load(FUSE_PV_PATH)
   data["metadata"]["name"] = f"{name}-pv"
   data["spec"]["capacity"]["storage"] = f"{size}Gi"
   data["spec"]["csi"]["volumeHandle"] = bucket
+  data["spec"]["mountOptions"] = mount_options.split(",")
   return data
 
 
@@ -36,15 +37,17 @@ def create_pvc(name: str, size: int) -> dict:
   return data
 
 
-def manifest(
+def manifest(
+    name: str, bucket: str, size: int, mount_options: str
+) -> list[dict]:
   """Creates GCS FUSE manifest file.
 
   Args:
-    path (str): path to the file where the manifest will be created
     name (str): base name of the volumes
     bucket (str): name of the storage bucket
-    size (str): size of the storage
+    size (str): size of the storage (in GB)
+    mount_options (str): comma-separated list of mountOptions for PersistentVolume
   """
-  pv = create_pv(name, size, bucket)
+  pv = create_pv(name, size, bucket, mount_options)
   pvc = create_pvc(name, size)
   return [pv, pvc]
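Usage sketch for the new manifest() signature, assuming an installed xpk package; the bucket name and options below are placeholders (implicit-dirs is a real gcsfuse mount flag, the uid option is illustrative):

from xpk.core.gcsfuse import manifest

pv, pvc = manifest('training-data', 'my-bucket', 100, 'implicit-dirs,uid=1000')
assert pv['spec']['mountOptions'] == ['implicit-dirs', 'uid=1000']
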
xpk/core/kjob.py
CHANGED

@@ -14,27 +14,45 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-from ..core.blueprint.blueprint_generator import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
-from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from argparse import Namespace
-import
-from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
-from ..utils.console import xpk_print, xpk_exit
+from enum import Enum
 
-
+import yaml
 from kubernetes import client as k8s_client
 from kubernetes.client import ApiClient
 from kubernetes.client.rest import ApiException
-from .cluster import setup_k8s_env, XPK_SA, DEFAULT_NAMESPACE
-from .storage import get_auto_mount_storages, get_auto_mount_gcsfuse_storages
-from .commands import run_command_for_value, run_kubectl_apply, run_command_with_updates
-from .config import XpkConfig, KJOB_SHELL_IMAGE, KJOB_SHELL_INTERACTIVE_COMMAND, KJOB_SHELL_WORKING_DIRECTORY, KJOB_BATCH_IMAGE, KJOB_BATCH_WORKING_DIRECTORY
-from .resources import get_cluster_system_characteristics, SystemCharacteristics, AcceleratorType
-from enum import Enum
-
-from ..core.workload_decorators import tcpxo_decorator
 
-from ..core.
+from ..core.blueprint.blueprint_generator import (
+    get_subnetworks_for_a3mega,
+    get_subnetworks_for_a3ultra,
+    get_subnetworks_for_a4,
+)
+from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
+from ..core.storage import GCS_FUSE_ANNOTATIONS, PARALLELSTORE_ANNOTATIONS
+from ..core.workload_decorators import rdma_decorator, tcpxo_decorator
+from ..utils import templates
+from ..utils.console import xpk_exit, xpk_print
+from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
+from .commands import (
+    run_command_for_value,
+    run_command_with_updates,
+    run_kubectl_apply,
+)
+from .config import (
+    KJOB_BATCH_IMAGE,
+    KJOB_BATCH_WORKING_DIRECTORY,
+    KJOB_SHELL_IMAGE,
+    KJOB_SHELL_INTERACTIVE_COMMAND,
+    KJOB_SHELL_WORKING_DIRECTORY,
+    XpkConfig,
+)
+from .resources import (
+    AcceleratorType,
+    SystemCharacteristics,
+    get_cluster_system_characteristics,
+)
+from .storage import get_auto_mount_gcsfuse_storages, get_auto_mount_storages, get_auto_mount_parallelstore_storages
+from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
 
 KJOB_API_GROUP_NAME = "kjobctl.x-k8s.io"
 KJOB_API_GROUP_VERSION = "v1alpha1"
@@ -146,6 +164,18 @@ Kueue_TAS_annotation = "kueue.x-k8s.io/podset-preferred-topology=cloud.google.co
 default_interface_annotation = "networking.gke.io/default-interface=eth0"
 
 
+def get_a4_pod_template_annotations() -> tuple[str, str]:
+  sub_networks = get_subnetworks_for_a4()
+  interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
+      sub_networks
+  )
+
+  return (
+      default_interface_annotation,
+      f"{interfaces_key}=$'{interfaces_value}'",
+  )
+
+
 def get_a3ultra_pod_template_annotations(args: Namespace) -> tuple[str, str]:
   sub_networks = get_subnetworks_for_a3ultra(args.cluster)
   interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
@@ -436,9 +466,18 @@ def create_volume_bundle_instance(
     xpk_exit(1)
 
 
-def
+def get_storage_annotations(args: Namespace) -> list[str]:
+  annotations = []
   k8s_api_client = setup_k8s_env(args)
+
   gcsfuse_storages = get_auto_mount_gcsfuse_storages(k8s_api_client)
   if len(gcsfuse_storages) > 0:
-
-
+    for key, value in GCS_FUSE_ANNOTATIONS.items():
+      annotations.append(f"{key}={value}")
+
+  parallelstore_storages = get_auto_mount_parallelstore_storages(k8s_api_client)
+  if len(parallelstore_storages) > 0:
+    for key, value in PARALLELSTORE_ANNOTATIONS.items():
+      annotations.append(f"{key}={value}")
+
+  return annotations
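Note on get_storage_annotations: each auto-mounted storage kind is flattened into CLI-style key=value strings. The shape of that transformation, shown on a made-up annotation map (the real GCS_FUSE_ANNOTATIONS and PARALLELSTORE_ANNOTATIONS constants live in xpk/core/storage.py):

example_annotations = {'gke-gcsfuse/volumes': 'true'}  # illustrative contents
flags = [f'{key}={value}' for key, value in example_annotations.items()]
print(flags)
# -> ['gke-gcsfuse/volumes=true']
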
xpk/core/nap.py
CHANGED

@@ -255,6 +255,10 @@ def is_autoprovisioning_enabled(
     bool is true if autoprovisioning is enabled, false otherwise.
     int of 0 if successful and 1 otherwise.
   """
+  # Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
+  if args.use_pathways:
+    return False, 0
+
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
   cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
 
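Note on the Pathways short-circuit: the function now returns before the configmap lookup, and callers still receive the usual (enabled, return_code) pair. A toy version of that contract (not the real function, which goes on to consult the cluster resources configmap):

from argparse import Namespace

def autoprovisioning_enabled_sketch(args: Namespace) -> tuple[bool, int]:
  if getattr(args, 'use_pathways', False):
    return False, 0  # Pathways: autoprovisioning unsupported, but not an error
  return True, 0  # stand-in for the configmap-based detection

print(autoprovisioning_enabled_sketch(Namespace(use_pathways=True)))
# -> (False, 0)
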
xpk/core/network.py
CHANGED

@@ -16,10 +16,8 @@ limitations under the License.
 
 from ..utils.console import xpk_print
 from ..utils.file import write_tmp_file
-from .capacity import H100_DEVICE_TYPE
 from .commands import run_command_for_value, run_command_with_updates
 from .gcloud_context import zone_to_region
-from .system_characteristics import SystemCharacteristics
 
 # cluster_network_yaml: the config when creating the network for a3 cluster
 CLUSTER_NETWORK_YAML = """
@@ -175,16 +173,6 @@ def create_cluster_subnet(args, index) -> int:
   return 0
 
 
-def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
-  return [f'{cluster_name}-gpunet-{i}-subnet' for i in range(8)]
-
-
-def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
-  return [f'{cluster_name}-sub-1'] + [
-      f'{cluster_name}-rdma-sub-{i}' for i in range(8)
-  ]
-
-
 def create_cluster_firewall_rule(args, index) -> int:
   """Create one GKE Cluster firewall rule.
 
@@ -247,20 +235,18 @@ def create_cluster_network_config(args) -> int:
   return 0
 
 
-def
-  """Set up GKE Cluster networks, subnets and firewall rules for A3
-  Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node
-  and there are 8 NICs for GPU-GPU bw and 1 NIC for host in an A3+ node.
+def set_up_cluster_network_for_a3(args) -> int:
+  """Set up GKE Cluster networks, subnets and firewall rules for A3.
+  Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node.
 
   Args:
     args: user provided arguments for running the command.
-    system: system characteristics.
 
   Returns:
     0 if successful and 1 otherwise.
   """
-  num_networks =
-  for i in range(1, num_networks):
+  num_networks = 4
+  for i in range(1, num_networks + 1):
     return_code = create_cluster_network(args, i)
     if return_code != 0:
       return 1
@@ -315,7 +301,10 @@ def get_all_networks_programmatic(args) -> tuple[list[str], int]:
   Returns:
     List of networks and 0 if successful and 1 otherwise.
   """
-  command =
+  command = (
+      'gcloud compute networks list --format="csv[no-heading](name)" '
+      f' --project={args.project}'
+  )
   return_code, raw_network_output = run_command_for_value(
       command, 'Get All Networks', args
   )
@@ -365,7 +354,8 @@ def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]:
     List of firewall rules and 0 if successful and 1 otherwise.
   """
   command = (
-      'gcloud compute firewall-rules list --format="csv[no-heading](name)"'
+      'gcloud compute firewall-rules list --format="csv[no-heading](name)" '
+      f' --project={args.project}'
   )
   return_code, raw_subnets_output = run_command_for_value(
       command, 'Get All Firewall Rules', args
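Note on the project pinning: both list commands now pass --project={args.project} explicitly instead of relying on whatever project the active gcloud config points at. What the assembled string looks like for a placeholder project (the doubled space between flags is an artifact of the string concatenation and is harmless to gcloud):

project = 'my-project'  # placeholder
command = (
    'gcloud compute networks list --format="csv[no-heading](name)" '
    f' --project={project}'
)
print(command)
# gcloud compute networks list --format="csv[no-heading](name)"  --project=my-project
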
xpk/core/nodepool.py
CHANGED

@@ -37,6 +37,8 @@ CLOUD_PLATFORM_AUTH_SCOPE_URL = (
     '"https://www.googleapis.com/auth/cloud-platform"'
 )
 
+OLDER_PATHWAYS_CPU_NP_TO_DELETE = ['cpu-rm-np', 'cpu-proxy-np', 'cpu-user-np']
+
 
 def run_gke_node_pool_create_command(
     args, system, gke_node_pool_version
@@ -122,7 +124,10 @@ def run_gke_node_pool_create_command(
       args, system, existing_node_pool_names, desired_node_pool_names
   )
   for node_pool_name in existing_node_pool_names:
-    if node_pool_name.find(f'{args.cluster}-np-') != 0:
+    if (
+        node_pool_name.find(f'{args.cluster}-np-') != 0
+        and node_pool_name not in OLDER_PATHWAYS_CPU_NP_TO_DELETE
+    ):
       continue
 
     if node_pool_name in node_pools_to_delete:
@@ -283,28 +288,15 @@ def run_gke_node_pool_create_command(
     command += (
         ' --accelerator'
        f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
-        ' --no-enable-autoupgrade '
-        f' --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL} --additional-node-network'
-        f' network={args.cluster}-net-1,subnetwork={subnet_prefix}-sub-1'
-        ' --additional-node-network'
-        f' network={args.cluster}-net-2,subnetwork={subnet_prefix}-sub-2'
-        ' --additional-node-network'
-        f' network={args.cluster}-net-3,subnetwork={subnet_prefix}-sub-3'
-        ' --additional-node-network'
-        f' network={args.cluster}-net-4,subnetwork={subnet_prefix}-sub-4'
+        f' --no-enable-autoupgrade --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL}'
     )
     if device_type == H100_MEGA_DEVICE_TYPE:
-      command += (
-          ' --additional-node-network'
-          f' network={args.cluster}-net-5,subnetwork={subnet_prefix}-sub-5'
-          ' --additional-node-network'
-          f' network={args.cluster}-net-6,subnetwork={subnet_prefix}-sub-6'
-          ' --additional-node-network'
-          f' network={args.cluster}-net-7,subnetwork={subnet_prefix}-sub-7'
-          ' --additional-node-network'
-          f' network={args.cluster}-net-8,subnetwork={subnet_prefix}-sub-8'
-          ' --max-pods-per-node=32'
-      )
+      for i in range(1, 9):
+        command += (
+            ' --additional-node-network'
+            f' network={args.cluster}-net-{i},subnetwork={subnet_prefix}-sub-{i}'
+        )
+      command += ' --max-pods-per-node=32'
   elif system.accelerator_type == AcceleratorType['CPU']:
     command += f' --num-nodes={system.vms_per_slice}'
     command += (
@@ -318,7 +310,7 @@ def run_gke_node_pool_create_command(
     create_commands.append(command)
     create_task_names.append(task)
 
-  desired_pw_cpu_node_pools = ['cpu-rm-np', 'cpu-proxy-np', 'cpu-user-np']
+  desired_pw_cpu_node_pools = ['cpu-np']
   if args.enable_pathways:
     # Pathways needs CPU nodepools in addition to TPU nodepools
     for node_pool_name in desired_pw_cpu_node_pools:
@@ -368,11 +360,9 @@ def get_node_pools_to_delete(
   check_resource, is_requested_resource_in_cluster = check_cluster_resources(
       args, system
   )
-
-  # Deletion logic would leave behind any Pathways CPU nodepools.
-  if existing_node_pool_name.find(f'{args.cluster}-np-') != 0:
-    continue
+  xpk_print('Existing node pool names ', existing_node_pool_names)
 
+  for existing_node_pool_name in existing_node_pool_names:
     # Nodepools will be deleted in two scenarios:
     # Scenario 1: Cluster exists with 3 nodepools of 'x' device_type/gke_accelerator and now we are updating
    # the cluster to 2 nodepools of 'x' device_type/gke_accelerator. In this case, we will delete
@@ -380,6 +370,18 @@ def get_node_pools_to_delete(
     # Scenario 2: Cluster exists with 2 nodepools of 'x' device_type/gke_accelerator and now we are updating
     # the cluster to 2 nodepools of 'y' device_type/gke_accelerator. In this case, we will delete
     # '{args.cluster}-np-0' and '{args.cluster}-np-1' from the cluster.
+    # Scenario 3: Deletes older Pathways CPU nodepools named cpu-rm-np, cpu-proxy-np and cpu-user-np
+
+    if existing_node_pool_name in OLDER_PATHWAYS_CPU_NP_TO_DELETE:
+      node_pools_to_delete.append(existing_node_pool_name)
+      xpk_print(
+          'Upgrading Pathways version on the cluster. Deleting older pathways'
+          ' nodepool ',
+          existing_node_pool_name,
+      )
+
+    if existing_node_pool_name.find(f'{args.cluster}-np-') != 0:
+      continue
     if existing_node_pool_name not in desired_node_pool_names or (
         check_resource and not is_requested_resource_in_cluster
     ):