xpk 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +267 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +316 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/METADATA +165 -14
- xpk-0.7.0.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue.py
CHANGED
|
@@ -15,18 +15,24 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
from argparse import Namespace
|
|
18
|
-
|
|
18
|
+
|
|
19
19
|
import packaging
|
|
20
|
+
from packaging.version import Version
|
|
21
|
+
|
|
22
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
20
23
|
from ..utils.file import write_tmp_file
|
|
21
|
-
from
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
24
|
+
from .commands import (
|
|
25
|
+
run_command_for_value,
|
|
26
|
+
run_command_with_updates,
|
|
27
|
+
run_command_with_updates_retry,
|
|
28
|
+
)
|
|
29
|
+
from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
|
|
30
|
+
from .resources import AutoprovisioningConfig
|
|
31
|
+
from .scheduling import (
|
|
25
32
|
create_accelerator_label,
|
|
26
33
|
create_machine_label,
|
|
27
34
|
get_total_chips_requested_from_args,
|
|
28
35
|
)
|
|
29
|
-
from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
|
|
30
36
|
from .system_characteristics import (
|
|
31
37
|
AcceleratorTypeToAcceleratorCharacteristics,
|
|
32
38
|
SystemCharacteristics,
|
xpk/core/monitoring.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..utils.console import xpk_print
|
|
18
|
+
from .commands import run_command_for_value
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_gke_dashboard(args, dashboard_filter) -> tuple[bool, str | None]:
  """Get the identifier of GKE dashboard deployed in the project.

  Args:
    args: user provided arguments for running the command.
    dashboard_filter: filter expression passed verbatim to the `--filter`
      flag of 'gcloud monitoring dashboards list'.

  Returns:
    bool:
      True if 'gcloud monitoring dashboards list' returned an error or
      multiple dashboards with same filter exist in the project,
      False otherwise.
    str:
      identifier of dashboard if deployed in project,
      None otherwise.
  """
  command = (
      'gcloud monitoring dashboards list'
      f' --project={args.project} --filter="{dashboard_filter}"'
      ' --format="value(name)" --verbosity=error'
  )

  return_code, return_value = run_command_for_value(
      command, 'GKE Dashboard List', args
  )

  if return_code != 0:
    xpk_print(
        f'GKE Dashboard List request returned ERROR {return_code}. If there is'
        ' a permissions error, please check'
        ' https://github.com/google/xpk/blob/main/README.md#roles-needed-based-on-permission-errors'
        ' for possible solutions.'
    )
    return True, None

  # Normalise the output first: one dashboard name per line, ignoring blank
  # lines, so that whitespace-only output is reported as "no dashboard"
  # instead of silently falling through to the error result.
  dashboards = [
      line.strip() for line in (return_value or '').splitlines() if line.strip()
  ]

  if not dashboards:
    xpk_print(
        f'No dashboard with {dashboard_filter} found in the'
        f' project:{args.project}.'
    )
    return False, None

  if len(dashboards) > 1:
    xpk_print(
        f'Multiple dashboards with same {dashboard_filter} exist in the'
        f' project:{args.project}. Delete all but one dashboard deployed using'
        ' https://github.com/google/cloud-tpu-monitoring-debugging.'
    )
    return True, None

  # The identifier is the last path component of the returned name.
  return False, dashboards[0].split('/')[-1]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_gke_outlier_dashboard(args) -> str | None:
  """Look up the GKE outlier (TPU monitoring) dashboard in the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The dashboard identifier as a string when exactly one matching dashboard
    is found; None when the lookup failed, was ambiguous, or found nothing.
  """
  lookup_failed, identifier = get_gke_dashboard(
      args, "displayName:'GKE - TPU Monitoring Dashboard'"
  )

  # The listing command errored or several dashboards matched the filter;
  # get_gke_dashboard has already reported the reason.
  if lookup_failed:
    return None

  if identifier:
    return str(identifier)

  # The listing succeeded but nothing matched: point at the deployment guide.
  xpk_print(
      'Follow https://github.com/google/cloud-tpu-monitoring-debugging to'
      ' deploy monitoring dashboard to view statistics and outlier mode of'
      ' GKE metrics.'
  )
  return None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_gke_debugging_dashboard(args) -> str | None:
  """Look up the GKE debugging (TPU logging) dashboard in the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The dashboard identifier as a string when exactly one matching dashboard
    is found; None when the lookup failed, was ambiguous, or found nothing.
  """
  lookup_failed, identifier = get_gke_dashboard(
      args, "displayName:'GKE - TPU Logging Dashboard'"
  )

  # The listing command errored or several dashboards matched the filter;
  # get_gke_dashboard has already reported the reason.
  if lookup_failed:
    return None

  if identifier:
    return str(identifier)

  # The listing succeeded but nothing matched: point at the deployment guide.
  xpk_print(
      'Follow https://github.com/google/cloud-tpu-monitoring-debugging to'
      ' deploy debugging dashboard to view stack traces collected in Cloud'
      ' Logging.'
  )
  return None
|
xpk/core/nap.py
CHANGED
|
@@ -14,29 +14,31 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from ..
|
|
17
|
+
from ..utils.console import xpk_print
|
|
18
|
+
from ..utils.file import write_tmp_file
|
|
19
|
+
from ..utils.objects import get_value_from_map
|
|
20
|
+
from .capacity import (
|
|
18
21
|
AUTOPROVISIONING_CONFIG_VALUE,
|
|
19
22
|
CAPACITY_TYPE_CONFIG_KEY,
|
|
20
|
-
CLUSTER_METADATA_CONFIGMAP,
|
|
21
|
-
CLUSTER_RESOURCES_CONFIGMAP,
|
|
22
23
|
RESERVATION_CONFIG_KEY,
|
|
23
|
-
AutoprovisioningConfig,
|
|
24
24
|
CapacityType,
|
|
25
|
-
get_all_nodepools_programmatic,
|
|
26
25
|
get_capacity_node_selectors_from_capacity_type,
|
|
27
26
|
get_capacity_type,
|
|
28
|
-
get_cluster_configmap,
|
|
29
|
-
get_total_chips_requested_from_args,
|
|
30
27
|
verify_reservation_exists,
|
|
31
|
-
zone_to_region,
|
|
32
28
|
)
|
|
33
|
-
from ..utils.objects import get_value_from_map
|
|
34
|
-
from ..utils.file import write_tmp_file
|
|
35
|
-
from ..utils.console import xpk_print
|
|
36
29
|
from .commands import run_command_with_updates, run_commands
|
|
30
|
+
from .gcloud_context import zone_to_region
|
|
31
|
+
from .nodepool import get_all_nodepools_programmatic
|
|
32
|
+
from .resources import (
|
|
33
|
+
CLUSTER_METADATA_CONFIGMAP,
|
|
34
|
+
CLUSTER_RESOURCES_CONFIGMAP,
|
|
35
|
+
AutoprovisioningConfig,
|
|
36
|
+
get_cluster_configmap,
|
|
37
|
+
)
|
|
38
|
+
from .scheduling import get_total_chips_requested_from_args
|
|
37
39
|
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
38
40
|
|
|
39
|
-
|
|
41
|
+
AUTOPROVISIONING_CONFIG_FILE = """
|
|
40
42
|
management:
|
|
41
43
|
autoRepair: true
|
|
42
44
|
autoUpgrade: true
|
|
@@ -44,8 +46,7 @@ autoprovisioningLocations:
|
|
|
44
46
|
{zones}
|
|
45
47
|
{resource_limits}
|
|
46
48
|
"""
|
|
47
|
-
|
|
48
|
-
autoprovisioning_resource_limits = """
|
|
49
|
+
AUTOPROVISIONING_RESOURCE_LIMITS = """
|
|
49
50
|
resourceLimits:
|
|
50
51
|
- resourceType: 'cpu'
|
|
51
52
|
{cpu_limits}
|
|
@@ -53,8 +54,7 @@ resourceLimits:
|
|
|
53
54
|
{memory_limits}
|
|
54
55
|
{custom_resource_type}
|
|
55
56
|
"""
|
|
56
|
-
|
|
57
|
-
autoprovisioning_custom_resource_type = """
|
|
57
|
+
AUTOPROVISIONING_CUSTOM_RESOURCE_TYPE = """
|
|
58
58
|
- resourceType: {resource_type}
|
|
59
59
|
minimum: {minimum}
|
|
60
60
|
maximum: {maximum}
|
|
@@ -218,19 +218,19 @@ def create_autoprovisioning_config(
|
|
|
218
218
|
' small, rescaling will not work well.'
|
|
219
219
|
)
|
|
220
220
|
|
|
221
|
-
custom_resource_string =
|
|
221
|
+
custom_resource_string = AUTOPROVISIONING_CUSTOM_RESOURCE_TYPE.format(
|
|
222
222
|
resource_type=system.gke_accelerator,
|
|
223
223
|
minimum=minimum,
|
|
224
224
|
maximum=maximum,
|
|
225
225
|
)
|
|
226
226
|
|
|
227
|
-
resource_limits =
|
|
227
|
+
resource_limits = AUTOPROVISIONING_RESOURCE_LIMITS.format(
|
|
228
228
|
cpu_limits=cpu_limits,
|
|
229
229
|
memory_limits=memory_limits,
|
|
230
230
|
custom_resource_type=custom_resource_string,
|
|
231
231
|
)
|
|
232
232
|
|
|
233
|
-
yml_string =
|
|
233
|
+
yml_string = AUTOPROVISIONING_CONFIG_FILE.format(
|
|
234
234
|
resource_limits=resource_limits,
|
|
235
235
|
zones=f'- {args.zone}',
|
|
236
236
|
)
|
|
@@ -266,7 +266,7 @@ def is_autoprovisioning_enabled(
|
|
|
266
266
|
return False, 0
|
|
267
267
|
|
|
268
268
|
return_code, autoprovisioning_value = get_value_from_map(
|
|
269
|
-
system.gke_accelerator, cluster_config_map
|
|
269
|
+
system.gke_accelerator, cluster_config_map, verbose=False
|
|
270
270
|
)
|
|
271
271
|
if return_code != 0:
|
|
272
272
|
xpk_print(
|
|
@@ -347,3 +347,15 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
|
|
|
347
347
|
return node_selector_args, return_code
|
|
348
348
|
|
|
349
349
|
return node_selector_args, return_code
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def get_cluster_provisioner(args) -> str:
  """Determine which provisioner is recorded for the cluster.

  Reads the `<cluster>-<CLUSTER_METADATA_CONFIGMAP>` config map and returns
  its 'provisioner' entry. Falls back to 'gcloud' when the config map is
  unavailable or carries no such entry.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The provisioner name; 'gcloud' by default.
  """
  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
  cluster_provisioner = 'gcloud'
  if cluster_config_map is not None:
    provisioner = cluster_config_map.get('provisioner')
    if provisioner is not None:
      cluster_provisioner = provisioner
  xpk_print(f'Cluster provisioner: {cluster_provisioner}')
  return cluster_provisioner
|
xpk/core/network.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..utils.console import xpk_print
|
|
18
|
+
from ..utils.file import write_tmp_file
|
|
19
|
+
from .capacity import H100_DEVICE_TYPE
|
|
20
|
+
from .commands import run_command_for_value, run_command_with_updates
|
|
21
|
+
from .gcloud_context import zone_to_region
|
|
22
|
+
from .system_characteristics import SystemCharacteristics
|
|
23
|
+
|
|
24
|
+
# cluster_network_yaml: the config when creating the network for a3 cluster
|
|
25
|
+
CLUSTER_NETWORK_YAML = """
|
|
26
|
+
apiVersion: networking.gke.io/v1
|
|
27
|
+
kind: Network
|
|
28
|
+
metadata:
|
|
29
|
+
name: vpc1
|
|
30
|
+
spec:
|
|
31
|
+
parametersRef:
|
|
32
|
+
group: networking.gke.io
|
|
33
|
+
kind: GKENetworkParamSet
|
|
34
|
+
name: vpc1
|
|
35
|
+
type: Device
|
|
36
|
+
---
|
|
37
|
+
apiVersion: networking.gke.io/v1
|
|
38
|
+
kind: Network
|
|
39
|
+
metadata:
|
|
40
|
+
name: vpc2
|
|
41
|
+
spec:
|
|
42
|
+
parametersRef:
|
|
43
|
+
group: networking.gke.io
|
|
44
|
+
kind: GKENetworkParamSet
|
|
45
|
+
name: vpc2
|
|
46
|
+
type: Device
|
|
47
|
+
---
|
|
48
|
+
apiVersion: networking.gke.io/v1
|
|
49
|
+
kind: Network
|
|
50
|
+
metadata:
|
|
51
|
+
name: vpc3
|
|
52
|
+
spec:
|
|
53
|
+
parametersRef:
|
|
54
|
+
group: networking.gke.io
|
|
55
|
+
kind: GKENetworkParamSet
|
|
56
|
+
name: vpc3
|
|
57
|
+
type: Device
|
|
58
|
+
---
|
|
59
|
+
apiVersion: networking.gke.io/v1
|
|
60
|
+
kind: Network
|
|
61
|
+
metadata:
|
|
62
|
+
name: vpc4
|
|
63
|
+
spec:
|
|
64
|
+
parametersRef:
|
|
65
|
+
group: networking.gke.io
|
|
66
|
+
kind: GKENetworkParamSet
|
|
67
|
+
name: vpc4
|
|
68
|
+
type: Device
|
|
69
|
+
---
|
|
70
|
+
apiVersion: networking.gke.io/v1
|
|
71
|
+
kind: GKENetworkParamSet
|
|
72
|
+
metadata:
|
|
73
|
+
name: vpc1
|
|
74
|
+
spec:
|
|
75
|
+
vpc: {cluster_name}-net-1
|
|
76
|
+
vpcSubnet: {cluster_name}-sub-1
|
|
77
|
+
deviceMode: NetDevice
|
|
78
|
+
---
|
|
79
|
+
apiVersion: networking.gke.io/v1
|
|
80
|
+
kind: GKENetworkParamSet
|
|
81
|
+
metadata:
|
|
82
|
+
name: vpc2
|
|
83
|
+
spec:
|
|
84
|
+
vpc: {cluster_name}-net-2
|
|
85
|
+
vpcSubnet: {cluster_name}-sub-2
|
|
86
|
+
deviceMode: NetDevice
|
|
87
|
+
---
|
|
88
|
+
apiVersion: networking.gke.io/v1
|
|
89
|
+
kind: GKENetworkParamSet
|
|
90
|
+
metadata:
|
|
91
|
+
name: vpc3
|
|
92
|
+
spec:
|
|
93
|
+
vpc: {cluster_name}-net-3
|
|
94
|
+
vpcSubnet: {cluster_name}-sub-3
|
|
95
|
+
deviceMode: NetDevice
|
|
96
|
+
---
|
|
97
|
+
apiVersion: networking.gke.io/v1
|
|
98
|
+
kind: GKENetworkParamSet
|
|
99
|
+
metadata:
|
|
100
|
+
name: vpc4
|
|
101
|
+
spec:
|
|
102
|
+
vpc: {cluster_name}-net-4
|
|
103
|
+
vpcSubnet: {cluster_name}-sub-4
|
|
104
|
+
deviceMode: NetDevice
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def create_cluster_network(args, index) -> int:
  """Ensure the cluster network `<cluster>-net-<index>` exists.

  An already listed network is reused; otherwise it is created with a custom
  subnet mode.

  Args:
    args: user provided arguments for running the command.
    index: index number for the network to be created.

  Returns:
    0 if successful and 1 otherwise.
  """
  known_networks, list_status = get_all_networks_programmatic(args)
  if list_status > 0:
    xpk_print('Listing all networks failed!')
    return list_status

  network_name = f'{args.cluster}-net-{index}'
  if network_name in known_networks:
    xpk_print(f'Reusing existing network {network_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} networks create'
      f' {network_name} --subnet-mode=custom --mtu=8244'
  )
  create_status = run_command_with_updates(
      create_command, 'Create Cluster Network', args, verbose=False
  )
  if create_status != 0:
    xpk_print(f'Create Cluster Network request returned ERROR {create_status}')
    return 1

  return 0
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def create_cluster_subnet(args, index) -> int:
  """Ensure the cluster subnet `<cluster>-<region>-sub-<index>` exists.

  An already listed subnet is reused; otherwise it is created inside
  `<cluster>-net-<index>` with the range 192.168.<index>.0/24.

  Args:
    args: user provided arguments for running the command.
    index: index number for the subnet to be created.

  Returns:
    0 if successful and 1 otherwise.
  """
  known_subnets, list_status = get_all_subnets_programmatic(args)
  if list_status > 0:
    xpk_print('Listing all subnets failed!')
    return list_status

  region = zone_to_region(args.zone)
  subnet_name = f'{args.cluster}-{region}-sub-{index}'
  if subnet_name in known_subnets:
    xpk_print(f'Reusing existing subnet {subnet_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} networks subnets create'
      f' {subnet_name} --network={args.cluster}-net-{index}'
      f' --region={region} --range=192.168.{index}.0/24'
  )
  create_status = run_command_with_updates(
      create_command, 'Create Cluster Subnet', args, verbose=False
  )
  if create_status != 0:
    xpk_print(f'Create Cluster Subnet request returned ERROR {create_status}')
    return 1

  return 0
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
  """Build the eight `<cluster_name>-gpunet-<i>-subnet` names, i = 0..7.

  Args:
    cluster_name: name of the cluster used as the subnet name prefix.

  Returns:
    List of eight subnet names in ascending index order.
  """
  subnet_names = []
  for subnet_index in range(8):
    subnet_names.append(f'{cluster_name}-gpunet-{subnet_index}-subnet')
  return subnet_names
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
  """Build `<cluster_name>-sub-1` plus eight `<cluster_name>-rdma-sub-<i>` names.

  Args:
    cluster_name: name of the cluster used as the subnet name prefix.

  Returns:
    List of nine subnet names: `-sub-1` first, then the rdma subnets for
    i = 0..7 in ascending order.
  """
  subnet_names = [f'{cluster_name}-sub-1']
  subnet_names.extend(
      f'{cluster_name}-rdma-sub-{rdma_index}' for rdma_index in range(8)
  )
  return subnet_names
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def create_cluster_firewall_rule(args, index) -> int:
  """Ensure the firewall rule `<cluster>-internal-<index>` exists.

  An already listed rule is reused; otherwise an ALLOW rule for tcp, udp and
  icmp from 192.168.0.0/16 is created on `<cluster>-net-<index>`.

  Args:
    args: user provided arguments for running the command.
    index: index number for the firewall rule to be created.

  Returns:
    0 if successful and 1 otherwise.
  """
  known_rules, list_status = get_all_firewall_rules_programmatic(args)
  if list_status > 0:
    xpk_print('Listing all firewall rules failed!')
    return list_status

  firewall_rule_name = f'{args.cluster}-internal-{index}'
  if firewall_rule_name in known_rules:
    xpk_print(f'Reusing existing firewall rule {firewall_rule_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} firewall-rules create'
      f' {firewall_rule_name} --network={args.cluster}-net-{index}'
      ' --action=ALLOW --rules=tcp:0-65535,udp:0-65535,icmp'
      ' --source-ranges=192.168.0.0/16'
  )
  create_status = run_command_with_updates(
      create_command, 'Create Cluster Firewall Rule', args, verbose=False
  )
  if create_status != 0:
    xpk_print(
        f'Create Cluster Firewall Rule request returned ERROR {create_status}'
    )
    return 1

  return 0
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def create_cluster_network_config(args) -> int:
  """Run the Create GKE Cluster Network Config request.

  Renders CLUSTER_NETWORK_YAML with the cluster name, writes it to a
  temporary file and applies it with `kubectl apply`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {str(tmp.file.name)}'

  return_code = run_command_with_updates(
      command, 'GKE Cluster Create Network Config', args
  )
  if return_code != 0:
    # Report the task that actually failed; the message used to name a
    # ConfigMap request, which this function never issues.
    xpk_print(
        'GKE Cluster Create Network Config request returned ERROR'
        f' {return_code}'
    )
    return 1

  return 0
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def set_up_cluster_network_for_gpu(args, system: SystemCharacteristics) -> int:
  """Set up GKE Cluster networks, subnets and firewall rules for A3/A3+.

  Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node,
  and there are 8 NICs for GPU-GPU bw and 1 NIC for host in an A3+ node.
  Accordingly, networks 1..4 are provisioned when the device type is
  H100_DEVICE_TYPE and networks 1..8 otherwise.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    0 if successful and 1 otherwise.
  """
  if system.device_type == H100_DEVICE_TYPE:
    num_networks = 5
  else:
    num_networks = 9

  # For every index: network first, then its subnet, then its firewall rule.
  provisioning_steps = (
      create_cluster_network,
      create_cluster_subnet,
      create_cluster_firewall_rule,
  )
  for network_index in range(1, num_networks):
    for provision in provisioning_steps:
      if provision(args, network_index) != 0:
        return 1
  return 0
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def delete_cluster_subnets(args) -> int:
  """Delete every subnet returned by get_all_subnets_programmatic.

  Deletion stops at the first subnet that fails to delete.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  subnets_to_delete, list_status = get_all_subnets_programmatic(args)
  if list_status > 0:
    xpk_print('Listing all subnets failed!')
    return list_status

  for subnet_name in subnets_to_delete:
    delete_command = (
        f'gcloud compute networks subnets delete {subnet_name}'
        f' --region={zone_to_region(args.zone)}'
        f' --project={args.project} --quiet'
    )
    delete_status = run_command_with_updates(
        delete_command, 'Delete Cluster Subnet', args, verbose=False
    )
    if delete_status != 0:
      xpk_print(f'Delete Cluster Subnet request returned ERROR {delete_status}')
      return 1

    xpk_print(f'Deleted existing subnet {subnet_name}')

  return 0
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def get_all_networks_programmatic(args) -> tuple[list[str], int]:
  """Gets all the networks associated with the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of networks and 0 if successful and 1 otherwise.
  """
  # Query args.project explicitly, as the create/delete commands do, so the
  # existence check looks at the same project the network is created in
  # rather than whatever project gcloud is configured with.
  command = (
      'gcloud compute networks list --format="csv[no-heading](name)"'
      f' --project={args.project}'
  )
  return_code, raw_network_output = run_command_for_value(
      command, 'Get All Networks', args
  )
  if return_code != 0:
    xpk_print(f'Get All Networks returned ERROR {return_code}')
    return [], 1

  # One network name per output line.
  return raw_network_output.splitlines(), 0
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def get_all_subnets_programmatic(args) -> tuple[list[str], int]:
  """List the subnets whose name matches `<cluster>-<region>-sub-*`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of subnets and 0 if successful and 1 otherwise.
  """
  name_pattern = f'{args.cluster}-{zone_to_region(args.zone)}-sub-*'
  list_command = (
      'gcloud compute networks subnets list'
      f' --filter=name~"{name_pattern}" --project={args.project}'
  )

  status, listing = run_command_for_value(
      list_command, 'Get All Subnets', args
  )
  if status != 0:
    xpk_print(f'Get All Subnets returned ERROR {status}')
    return [], 1

  # The first output line is skipped (presumably the header row of gcloud's
  # default table output — confirm); the name is the first space-separated
  # field of each remaining line.
  subnet_names = [row.split(' ')[0] for row in listing.splitlines()[1:]]
  return subnet_names, 0
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]:
  """Gets all the firewall rules associated with the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of firewall rules and 0 if successful and 1 otherwise.
  """
  # Query args.project explicitly, as the create command does, so the
  # existence check looks at the same project the rule is created in rather
  # than whatever project gcloud is configured with.
  command = (
      'gcloud compute firewall-rules list --format="csv[no-heading](name)"'
      f' --project={args.project}'
  )
  return_code, raw_firewall_rules_output = run_command_for_value(
      command, 'Get All Firewall Rules', args
  )
  if return_code != 0:
    xpk_print(f'Get All Firewall Rules returned ERROR {return_code}')
    return [], 1

  # One firewall rule name per output line.
  return raw_firewall_rules_output.splitlines(), 0
|