xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +280 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +326 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
- xpk-0.7.1.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/api/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
xpk/api/storage_crd.yaml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
apiVersion: apiextensions.k8s.io/v1
|
|
2
|
+
kind: CustomResourceDefinition
|
|
3
|
+
metadata:
|
|
4
|
+
name: storages.xpk.x-k8s.io
|
|
5
|
+
spec:
|
|
6
|
+
group: xpk.x-k8s.io
|
|
7
|
+
versions:
|
|
8
|
+
- name: v1
|
|
9
|
+
served: true
|
|
10
|
+
storage: true
|
|
11
|
+
schema:
|
|
12
|
+
openAPIV3Schema:
|
|
13
|
+
type: object
|
|
14
|
+
properties:
|
|
15
|
+
spec:
|
|
16
|
+
type: object
|
|
17
|
+
properties:
|
|
18
|
+
type:
|
|
19
|
+
type: string
|
|
20
|
+
cluster:
|
|
21
|
+
type: string
|
|
22
|
+
auto_mount:
|
|
23
|
+
type: boolean
|
|
24
|
+
mount_point:
|
|
25
|
+
type: string
|
|
26
|
+
readonly:
|
|
27
|
+
type: boolean
|
|
28
|
+
manifest:
|
|
29
|
+
type: string
|
|
30
|
+
pv:
|
|
31
|
+
type: string
|
|
32
|
+
pvc:
|
|
33
|
+
type: string
|
|
34
|
+
required:
|
|
35
|
+
- type
|
|
36
|
+
- cluster
|
|
37
|
+
- auto_mount
|
|
38
|
+
- mount_point
|
|
39
|
+
- readonly
|
|
40
|
+
- manifest
|
|
41
|
+
- pvc
|
|
42
|
+
- pv
|
|
43
|
+
x-kubernetes-validations:
|
|
44
|
+
- message: Value is immutable
|
|
45
|
+
rule: self == oldSelf
|
|
46
|
+
scope: Cluster
|
|
47
|
+
names:
|
|
48
|
+
plural: storages
|
|
49
|
+
singular: storage
|
|
50
|
+
kind: Storage
|
|
51
|
+
shortNames:
|
|
52
|
+
- stg
|
xpk/commands/batch.py
CHANGED
|
@@ -16,13 +16,16 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from argparse import Namespace
|
|
18
18
|
|
|
19
|
+
from ..core.cluster import create_xpk_k8s_service_account
|
|
20
|
+
from ..core.commands import run_command_for_value
|
|
21
|
+
from ..core.gcloud_context import add_zone_and_project
|
|
19
22
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
20
23
|
from ..utils.console import xpk_exit, xpk_print
|
|
21
|
-
from .
|
|
22
|
-
from ..core.
|
|
23
|
-
from
|
|
24
|
-
from ..core.commands import run_command_for_value
|
|
24
|
+
from .common import set_cluster_command
|
|
25
|
+
from ..core.kjob import AppProfileDefaults, JobTemplateDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
|
|
26
|
+
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
25
27
|
from .kind import set_local_cluster_command
|
|
28
|
+
import re
|
|
26
29
|
|
|
27
30
|
|
|
28
31
|
def batch(args: Namespace) -> None:
|
|
@@ -42,15 +45,30 @@ def batch(args: Namespace) -> None:
|
|
|
42
45
|
if set_cluster_command_code != 0:
|
|
43
46
|
xpk_exit(set_cluster_command_code)
|
|
44
47
|
|
|
48
|
+
err_code = prepare_kjob(args)
|
|
49
|
+
if err_code > 0:
|
|
50
|
+
xpk_exit(err_code)
|
|
51
|
+
create_xpk_k8s_service_account()
|
|
52
|
+
|
|
45
53
|
submit_job(args)
|
|
46
54
|
|
|
47
55
|
|
|
48
56
|
def submit_job(args: Namespace) -> None:
|
|
57
|
+
|
|
58
|
+
create_xpk_k8s_service_account()
|
|
59
|
+
|
|
49
60
|
cmd = (
|
|
50
61
|
'kubectl kjob create slurm'
|
|
51
62
|
f' --profile {AppProfileDefaults.NAME.value}'
|
|
52
63
|
f' --localqueue {LOCAL_QUEUE_NAME}'
|
|
64
|
+
f' --pod-template-annotation {Kueue_TAS_annotation}'
|
|
65
|
+
f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
|
|
66
|
+
' --first-node-ip'
|
|
53
67
|
)
|
|
68
|
+
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
69
|
+
gcsfuse_annotation = get_gcsfuse_annotation(args)
|
|
70
|
+
if gcsfuse_annotation is not None:
|
|
71
|
+
cmd += f' --pod-template-annotation {gcsfuse_annotation}'
|
|
54
72
|
|
|
55
73
|
if args.ignore_unknown_flags:
|
|
56
74
|
cmd += ' --ignore-unknown-flags'
|
|
@@ -102,8 +120,12 @@ def submit_job(args: Namespace) -> None:
|
|
|
102
120
|
if args.time is not None:
|
|
103
121
|
cmd += f' --time {args.time}'
|
|
104
122
|
|
|
105
|
-
return_code,
|
|
123
|
+
return_code, return_value = run_command_for_value(cmd, 'submit job', args)
|
|
106
124
|
|
|
107
125
|
if return_code != 0:
|
|
108
126
|
xpk_print(f'Running batch job returned ERROR {return_code}')
|
|
109
127
|
xpk_exit(return_code)
|
|
128
|
+
|
|
129
|
+
m = re.match(r'job\.batch/([-a-z0-9]+)', return_value)
|
|
130
|
+
if m:
|
|
131
|
+
xpk_print(f'Job name: {m.group(1)}')
|
xpk/commands/cluster.py
CHANGED
|
@@ -14,36 +14,28 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
)
|
|
22
|
-
from ..core.core import (
|
|
23
|
-
VERTEX_TENSORBOARD_FEATURE_FLAG,
|
|
24
|
-
add_zone_and_project,
|
|
25
|
-
create_cluster_configmaps,
|
|
26
|
-
create_cluster_network_config,
|
|
27
|
-
create_vertex_tensorboard,
|
|
28
|
-
delete_cluster_subnets,
|
|
17
|
+
from tabulate import tabulate
|
|
18
|
+
|
|
19
|
+
from ..core.capacity import H100_DEVICE_TYPE
|
|
20
|
+
from ..core.cluster import (
|
|
29
21
|
get_all_clusters_programmatic,
|
|
30
|
-
|
|
31
|
-
get_gke_node_pool_version,
|
|
32
|
-
get_gke_server_config,
|
|
33
|
-
h100_device_type,
|
|
22
|
+
get_cluster_credentials,
|
|
34
23
|
install_nccl_on_cluster,
|
|
35
|
-
run_gke_node_pool_create_command,
|
|
36
24
|
set_jobset_on_cluster,
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
25
|
+
setup_k8s_env,
|
|
26
|
+
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
27
|
+
update_cluster_with_workload_identity_if_necessary,
|
|
40
28
|
)
|
|
41
29
|
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
|
|
42
|
-
from ..core.
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
30
|
+
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
31
|
+
from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG
|
|
32
|
+
from ..core.gcloud_context import (
|
|
33
|
+
add_zone_and_project,
|
|
34
|
+
get_gke_control_plane_version,
|
|
35
|
+
get_gke_server_config,
|
|
36
|
+
zone_to_region,
|
|
46
37
|
)
|
|
38
|
+
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
|
|
47
39
|
from ..core.kueue import (
|
|
48
40
|
cluster_preheat_yml,
|
|
49
41
|
install_kueue_crs,
|
|
@@ -51,19 +43,28 @@ from ..core.kueue import (
|
|
|
51
43
|
wait_for_kueue_available,
|
|
52
44
|
)
|
|
53
45
|
from ..core.nap import enable_autoprovisioning_on_cluster
|
|
46
|
+
from ..core.network import (
|
|
47
|
+
create_cluster_network_config,
|
|
48
|
+
delete_cluster_subnets,
|
|
49
|
+
set_up_cluster_network_for_gpu,
|
|
50
|
+
)
|
|
51
|
+
from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
|
|
54
52
|
from ..core.ray import install_ray_cluster
|
|
53
|
+
from ..core.resources import create_cluster_configmaps
|
|
54
|
+
from ..core.storage import install_storage_crd
|
|
55
55
|
from ..core.system_characteristics import (
|
|
56
56
|
AcceleratorType,
|
|
57
57
|
AcceleratorTypeToAcceleratorCharacteristics,
|
|
58
58
|
SystemCharacteristics,
|
|
59
59
|
get_system_characteristics,
|
|
60
60
|
)
|
|
61
|
+
from ..core.vertex import create_vertex_tensorboard
|
|
61
62
|
from ..core.workload import get_workload_list
|
|
63
|
+
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
62
64
|
from ..utils.file import write_tmp_file
|
|
63
|
-
from ..utils.console import xpk_exit, xpk_print
|
|
64
65
|
from . import cluster_gcluster
|
|
65
|
-
|
|
66
|
-
from
|
|
66
|
+
from .common import set_cluster_command
|
|
67
|
+
from ..core.cluster import update_cluster_with_gcpfilestore_driver_if_necessary
|
|
67
68
|
|
|
68
69
|
|
|
69
70
|
def cluster_create(args) -> None:
|
|
@@ -115,10 +116,36 @@ def cluster_create(args) -> None:
|
|
|
115
116
|
xpk_exit(authorize_private_cluster_access_command_code)
|
|
116
117
|
|
|
117
118
|
# ToDo(roshanin@) - Re-enable CloudDNS on Pathways clusters conditionally.
|
|
119
|
+
# Enable WorkloadIdentity if not enabled already.
|
|
120
|
+
if (
|
|
121
|
+
args.enable_workload_identity
|
|
122
|
+
or args.enable_gcsfuse_csi_driver
|
|
123
|
+
or args.enable_gcpfilestore_csi_driver
|
|
124
|
+
):
|
|
125
|
+
update_cluster_command_code = (
|
|
126
|
+
update_cluster_with_workload_identity_if_necessary(args)
|
|
127
|
+
)
|
|
128
|
+
if update_cluster_command_code != 0:
|
|
129
|
+
xpk_exit(update_cluster_command_code)
|
|
118
130
|
|
|
119
|
-
|
|
120
|
-
if
|
|
121
|
-
|
|
131
|
+
# Enable GCSFuse CSI Driver if not enabled already.
|
|
132
|
+
if args.enable_gcsfuse_csi_driver:
|
|
133
|
+
update_cluster_command_code = (
|
|
134
|
+
update_cluster_with_gcsfuse_driver_if_necessary(args)
|
|
135
|
+
)
|
|
136
|
+
if update_cluster_command_code != 0:
|
|
137
|
+
xpk_exit(update_cluster_command_code)
|
|
138
|
+
|
|
139
|
+
if args.enable_gcpfilestore_csi_driver:
|
|
140
|
+
update_cluster_command_code = (
|
|
141
|
+
update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
142
|
+
)
|
|
143
|
+
if update_cluster_command_code != 0:
|
|
144
|
+
xpk_exit(update_cluster_command_code)
|
|
145
|
+
|
|
146
|
+
# Update Pathways clusters with CloudDNS if not enabled already.
|
|
147
|
+
|
|
148
|
+
get_cluster_credentials(args)
|
|
122
149
|
|
|
123
150
|
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
|
|
124
151
|
tensorboard_config = {}
|
|
@@ -134,7 +161,7 @@ def cluster_create(args) -> None:
|
|
|
134
161
|
if set_up_cluster_network_code != 0:
|
|
135
162
|
xpk_exit(set_up_cluster_network_code)
|
|
136
163
|
|
|
137
|
-
if system.device_type ==
|
|
164
|
+
if system.device_type == H100_DEVICE_TYPE:
|
|
138
165
|
xpk_print('Creating Network Config for cluster')
|
|
139
166
|
create_cluster_network_config_code = create_cluster_network_config(args)
|
|
140
167
|
if create_cluster_network_config_code != 0:
|
|
@@ -154,6 +181,24 @@ def cluster_create(args) -> None:
|
|
|
154
181
|
if run_gke_node_pool_create_command_code != 0:
|
|
155
182
|
xpk_exit(run_gke_node_pool_create_command_code)
|
|
156
183
|
|
|
184
|
+
# Provision node pools dynamically based on incoming workloads:
|
|
185
|
+
# Currently autoprovisioning is not supported with Pathways.
|
|
186
|
+
autoprovisioning_config = None
|
|
187
|
+
if not args.enable_pathways and args.enable_autoprovisioning:
|
|
188
|
+
xpk_print('Enabling Autoprovisioning')
|
|
189
|
+
autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
|
|
190
|
+
args, system
|
|
191
|
+
)
|
|
192
|
+
if return_code != 0:
|
|
193
|
+
xpk_exit(return_code)
|
|
194
|
+
|
|
195
|
+
xpk_print('Creating ConfigMap for cluster')
|
|
196
|
+
create_cluster_configmaps_code = create_cluster_configmaps(
|
|
197
|
+
args, system, tensorboard_config, autoprovisioning_config
|
|
198
|
+
)
|
|
199
|
+
if create_cluster_configmaps_code != 0:
|
|
200
|
+
xpk_exit(create_cluster_configmaps_code)
|
|
201
|
+
|
|
157
202
|
xpk_print(
|
|
158
203
|
'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
|
|
159
204
|
' globally available'
|
|
@@ -177,20 +222,12 @@ def cluster_create(args) -> None:
|
|
|
177
222
|
if err_code > 0:
|
|
178
223
|
xpk_exit(err_code)
|
|
179
224
|
|
|
180
|
-
xpk_print('Preparing kjob')
|
|
181
225
|
err_code = prepare_kjob(args)
|
|
182
226
|
if err_code > 0:
|
|
183
227
|
xpk_exit(err_code)
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
if not args.enable_pathways and args.enable_autoprovisioning:
|
|
188
|
-
xpk_print('Enabling Autoprovisioning')
|
|
189
|
-
autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
|
|
190
|
-
args, system
|
|
191
|
-
)
|
|
192
|
-
if return_code != 0:
|
|
193
|
-
xpk_exit(return_code)
|
|
228
|
+
|
|
229
|
+
k8s_client = setup_k8s_env(args)
|
|
230
|
+
install_storage_crd(k8s_client)
|
|
194
231
|
|
|
195
232
|
xpk_print('Wait for Kueue to be fully available')
|
|
196
233
|
wait_for_kueue_available_code = wait_for_kueue_available(args)
|
|
@@ -210,13 +247,6 @@ def cluster_create(args) -> None:
|
|
|
210
247
|
if install_nccl_code != 0:
|
|
211
248
|
xpk_exit(install_nccl_code)
|
|
212
249
|
|
|
213
|
-
xpk_print('Creating ConfigMap for cluster')
|
|
214
|
-
create_cluster_configmaps_code = create_cluster_configmaps(
|
|
215
|
-
args, system, tensorboard_config, autoprovisioning_config
|
|
216
|
-
)
|
|
217
|
-
if create_cluster_configmaps_code != 0:
|
|
218
|
-
xpk_exit(create_cluster_configmaps_code)
|
|
219
|
-
|
|
220
250
|
if args.enable_ray_cluster:
|
|
221
251
|
return_code = install_ray_cluster(args, system)
|
|
222
252
|
if return_code != 0:
|
|
@@ -249,7 +279,12 @@ def cluster_delete(args) -> None:
|
|
|
249
279
|
cluster_gcluster.cluster_delete(args)
|
|
250
280
|
xpk_exit(0)
|
|
251
281
|
|
|
282
|
+
set_cluster_command_code = set_cluster_command(args)
|
|
283
|
+
if set_cluster_command_code != 0:
|
|
284
|
+
xpk_exit(set_cluster_command_code)
|
|
285
|
+
|
|
252
286
|
run_gke_cluster_delete_command_code = run_gke_cluster_delete_command(args)
|
|
287
|
+
|
|
253
288
|
if run_gke_cluster_delete_command_code != 0:
|
|
254
289
|
xpk_exit(run_gke_cluster_delete_command_code)
|
|
255
290
|
xpk_print(f'GKE commands done! Cluster {args.cluster} deleted.\n')
|
|
@@ -270,9 +305,7 @@ def cluster_cacheimage(args) -> None:
|
|
|
270
305
|
)
|
|
271
306
|
add_zone_and_project(args)
|
|
272
307
|
|
|
273
|
-
|
|
274
|
-
if set_cluster_command_code != 0:
|
|
275
|
-
xpk_exit(set_cluster_command_code)
|
|
308
|
+
get_cluster_credentials(args)
|
|
276
309
|
system, return_code = get_system_characteristics(args)
|
|
277
310
|
|
|
278
311
|
if return_code > 0:
|
|
@@ -321,9 +354,7 @@ def cluster_describe(args) -> None:
|
|
|
321
354
|
xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
|
|
322
355
|
add_zone_and_project(args)
|
|
323
356
|
|
|
324
|
-
|
|
325
|
-
if set_cluster_command_code != 0:
|
|
326
|
-
xpk_exit(set_cluster_command_code)
|
|
357
|
+
get_cluster_credentials(args)
|
|
327
358
|
|
|
328
359
|
return_code, data_table = nodepools_build_table(args)
|
|
329
360
|
if return_code != 0:
|
|
@@ -752,33 +783,26 @@ def run_gke_cluster_create_command(
|
|
|
752
783
|
if args.enable_ray_cluster:
|
|
753
784
|
command += ' --addons RayOperator'
|
|
754
785
|
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
786
|
+
if (
|
|
787
|
+
args.enable_workload_identity
|
|
788
|
+
or args.enable_gcsfuse_csi_driver
|
|
789
|
+
or args.enable_gcpfilestore_csi_driver
|
|
790
|
+
):
|
|
791
|
+
command += f' --workload-pool={args.project}.svc.id.goog'
|
|
760
792
|
|
|
793
|
+
addons = []
|
|
794
|
+
if args.enable_gcsfuse_csi_driver:
|
|
795
|
+
addons.append('GcsFuseCsiDriver')
|
|
761
796
|
|
|
762
|
-
|
|
763
|
-
|
|
797
|
+
if args.enable_gcpfilestore_csi_driver:
|
|
798
|
+
addons.append('GcpFilestoreCsiDriver')
|
|
764
799
|
|
|
765
|
-
|
|
766
|
-
|
|
800
|
+
if len(addons) > 0:
|
|
801
|
+
addons_str = ','.join(addons)
|
|
802
|
+
command += f' --addons={addons_str}'
|
|
767
803
|
|
|
768
|
-
|
|
769
|
-
0 if successful and 1 otherwise.
|
|
770
|
-
"""
|
|
771
|
-
command = (
|
|
772
|
-
'gcloud container clusters get-credentials'
|
|
773
|
-
f' {args.cluster} --region={zone_to_region(args.zone)}'
|
|
774
|
-
f' --project={args.project} &&'
|
|
775
|
-
' kubectl config view && kubectl config set-context --current'
|
|
776
|
-
' --namespace=default'
|
|
777
|
-
)
|
|
778
|
-
task = f'get-credentials to cluster {args.cluster}'
|
|
779
|
-
return_code = run_command_with_updates_retry(
|
|
780
|
-
command, task, args, verbose=False
|
|
781
|
-
)
|
|
804
|
+
return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
|
|
782
805
|
if return_code != 0:
|
|
783
|
-
xpk_print(f'
|
|
784
|
-
|
|
806
|
+
xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
|
|
807
|
+
return 1
|
|
808
|
+
return 0
|
xpk/commands/cluster_gcluster.py
CHANGED
|
@@ -14,15 +14,28 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
from ..core.remote_state.remote_state_client import RemoteStateClient
|
|
20
|
+
from ..core.remote_state.fuse_remote_state import FuseStateClient
|
|
21
|
+
from ..core.blueprint.blueprint_generator import (
|
|
22
|
+
BlueprintGenerator,
|
|
23
|
+
BlueprintGeneratorOutput,
|
|
24
|
+
a3mega_device_type,
|
|
25
|
+
a3ultra_device_type,
|
|
26
|
+
supported_device_types,
|
|
27
|
+
)
|
|
28
|
+
from ..core.commands import run_command_for_value
|
|
29
|
+
from ..core.capacity import get_capacity_type
|
|
18
30
|
from ..core.docker_manager import DockerManager
|
|
31
|
+
from ..core.gcloud_context import zone_to_region
|
|
19
32
|
from ..core.gcluster_manager import GclusterManager
|
|
20
|
-
from ..core.core import zone_to_region, get_capacity_type
|
|
21
33
|
from ..utils.console import xpk_exit, xpk_print
|
|
22
|
-
from ..utils.network import all_IPs_cidr
|
|
23
34
|
from ..utils.file import ensure_directory_exists
|
|
35
|
+
from ..utils.network import all_IPs_cidr
|
|
24
36
|
from ..utils.objects import hash_string
|
|
25
|
-
import
|
|
37
|
+
from ..core.cluster import get_cluster_credentials
|
|
38
|
+
from ..core.kjob import apply_kjob_crds, prepare_kjob
|
|
26
39
|
|
|
27
40
|
blueprints_path = os.path.abspath('xpkclusters/blueprints')
|
|
28
41
|
gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
|
|
@@ -40,13 +53,22 @@ def cluster_create(args) -> None:
|
|
|
40
53
|
"""
|
|
41
54
|
check_gcloud_authenticated()
|
|
42
55
|
prepare_directories()
|
|
43
|
-
gcm = prepare_gcluster_manager()
|
|
44
56
|
region = zone_to_region(args.zone)
|
|
45
57
|
|
|
46
58
|
# unique_name uses shortened hash string, so still name collision is possible
|
|
47
59
|
unique_name = get_unique_name(args.project, region, args.cluster)
|
|
48
60
|
# prefix is to prevent name collisions for blueprints and also deployments by storing them in prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
|
|
49
61
|
prefix = get_prefix_path(args.project, region)
|
|
62
|
+
remote_state_client = None
|
|
63
|
+
if args.cluster_state_gcs_bucket is not None:
|
|
64
|
+
remote_state_client = FuseStateClient(
|
|
65
|
+
bucket=args.cluster_state_gcs_bucket,
|
|
66
|
+
state_directory=os.path.join(blueprints_path, prefix, unique_name),
|
|
67
|
+
prefix=prefix,
|
|
68
|
+
cluster=args.cluster,
|
|
69
|
+
deployment_name=unique_name,
|
|
70
|
+
)
|
|
71
|
+
gcm = prepare_gcluster_manager(remote_state_client)
|
|
50
72
|
|
|
51
73
|
bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix)
|
|
52
74
|
|
|
@@ -61,6 +83,18 @@ def cluster_create(args) -> None:
|
|
|
61
83
|
deployment_name=unique_name,
|
|
62
84
|
prefix=prefix,
|
|
63
85
|
)
|
|
86
|
+
if args.cluster_state_gcs_bucket is not None:
|
|
87
|
+
gcm.upload_state()
|
|
88
|
+
|
|
89
|
+
get_cluster_credentials(args)
|
|
90
|
+
|
|
91
|
+
err_code = apply_kjob_crds(args)
|
|
92
|
+
if err_code > 0:
|
|
93
|
+
xpk_exit(err_code)
|
|
94
|
+
|
|
95
|
+
err_code = prepare_kjob(args)
|
|
96
|
+
if err_code > 0:
|
|
97
|
+
xpk_exit(err_code)
|
|
64
98
|
|
|
65
99
|
xpk_exit(0)
|
|
66
100
|
|
|
@@ -76,15 +110,42 @@ def cluster_delete(args) -> None:
|
|
|
76
110
|
"""
|
|
77
111
|
check_gcloud_authenticated()
|
|
78
112
|
prepare_directories()
|
|
79
|
-
gcm = prepare_gcluster_manager()
|
|
80
113
|
region = zone_to_region(args.zone)
|
|
114
|
+
unique_name = get_unique_name(args.project, region, args.cluster)
|
|
115
|
+
# prefix is to prevent name collisions for blueprints and also deployments by storing them in prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
|
|
116
|
+
prefix = get_prefix_path(args.project, region)
|
|
117
|
+
remote_state_client = None
|
|
118
|
+
if args.cluster_state_gcs_bucket is not None:
|
|
119
|
+
remote_state_client = FuseStateClient(
|
|
120
|
+
bucket=args.cluster_state_gcs_bucket,
|
|
121
|
+
state_directory=os.path.join(blueprints_path, prefix, unique_name),
|
|
122
|
+
prefix=prefix,
|
|
123
|
+
cluster=args.cluster,
|
|
124
|
+
deployment_name=unique_name,
|
|
125
|
+
)
|
|
126
|
+
gcm = prepare_gcluster_manager(remote_state_client)
|
|
81
127
|
|
|
82
128
|
# unique_name uses shortened hash string, so still name collision is possible
|
|
83
129
|
unique_name = get_unique_name(args.project, region, args.cluster)
|
|
84
130
|
# prefix is to prevent name collisions for blueprints and also deployments by storing them in prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
|
|
85
|
-
|
|
131
|
+
prefix = get_prefix_path(args.project, region)
|
|
132
|
+
if args.cluster_state_gcs_bucket is not None:
|
|
133
|
+
gcm.download_state()
|
|
134
|
+
|
|
135
|
+
bp = BlueprintGeneratorOutput(
|
|
136
|
+
blueprint_file=os.path.join(blueprints_path, prefix, unique_name)
|
|
137
|
+
+ '.yaml',
|
|
138
|
+
blueprint_dependencies=os.path.join(
|
|
139
|
+
blueprints_path, prefix, unique_name
|
|
140
|
+
),
|
|
141
|
+
)
|
|
86
142
|
|
|
87
|
-
|
|
143
|
+
gcm.stage_files(
|
|
144
|
+
blueprint_file=bp.blueprint_file,
|
|
145
|
+
blueprint_dependencies=bp.blueprint_dependencies,
|
|
146
|
+
prefix=prefix,
|
|
147
|
+
)
|
|
148
|
+
gcm.destroy_deployment(deployment_name=unique_name, prefix=prefix)
|
|
88
149
|
|
|
89
150
|
xpk_exit(0)
|
|
90
151
|
|
|
@@ -127,18 +188,35 @@ def check_gcloud_authenticated():
|
|
|
127
188
|
xpk_exit(1)
|
|
128
189
|
|
|
129
190
|
|
|
130
|
-
def prepare_gcluster_manager(
|
|
191
|
+
def prepare_gcluster_manager(
|
|
192
|
+
remote_state_client: RemoteStateClient | None,
|
|
193
|
+
) -> GclusterManager:
|
|
131
194
|
dm = DockerManager(
|
|
132
195
|
working_dir=gcluster_working_dir, gcloud_cfg_path=gcloud_cfg_path
|
|
133
196
|
)
|
|
134
197
|
dm.initialize()
|
|
135
|
-
return GclusterManager(
|
|
198
|
+
return GclusterManager(
|
|
199
|
+
gcluster_command_runner=dm, remote_state_client=remote_state_client
|
|
200
|
+
)
|
|
136
201
|
|
|
137
202
|
|
|
138
203
|
def prepare_blueprint_generator() -> BlueprintGenerator:
|
|
139
204
|
return BlueprintGenerator(storage_path=blueprints_path)
|
|
140
205
|
|
|
141
206
|
|
|
207
|
+
def validate_state_gcs_bucket(args):
|
|
208
|
+
bucket_validate_cmd = (
|
|
209
|
+
f'gcloud storage buckets describe gs://{args.cluster_state_gcs_bucket}'
|
|
210
|
+
)
|
|
211
|
+
err_code, _ = run_command_for_value(
|
|
212
|
+
bucket_validate_cmd,
|
|
213
|
+
'Validate remote state bucket existence.',
|
|
214
|
+
global_args=args,
|
|
215
|
+
)
|
|
216
|
+
if err_code != 0:
|
|
217
|
+
xpk_exit(err_code)
|
|
218
|
+
|
|
219
|
+
|
|
142
220
|
def generate_blueprint(
|
|
143
221
|
blueprint_name, args, prefix=None
|
|
144
222
|
) -> BlueprintGeneratorOutput:
|
|
@@ -149,6 +227,9 @@ def generate_blueprint(
|
|
|
149
227
|
|
|
150
228
|
bpg = prepare_blueprint_generator()
|
|
151
229
|
|
|
230
|
+
if args.cluster_state_gcs_bucket is not None:
|
|
231
|
+
validate_state_gcs_bucket(args)
|
|
232
|
+
|
|
152
233
|
if args.device_type in supported_device_types:
|
|
153
234
|
if args.device_type == a3mega_device_type:
|
|
154
235
|
num_nodes = args.num_nodes if not args.num_nodes is None else 2
|
|
@@ -165,6 +246,7 @@ def generate_blueprint(
|
|
|
165
246
|
capacity_type=capacity_type,
|
|
166
247
|
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
|
|
167
248
|
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
|
|
249
|
+
gcs_bucket=args.cluster_state_gcs_bucket,
|
|
168
250
|
)
|
|
169
251
|
if args.device_type == a3ultra_device_type:
|
|
170
252
|
num_nodes = args.num_nodes if not args.num_nodes is None else 2
|
|
@@ -178,8 +260,10 @@ def generate_blueprint(
|
|
|
178
260
|
auth_cidr=all_IPs_cidr,
|
|
179
261
|
num_nodes=num_nodes,
|
|
180
262
|
reservation=args.reservation if args.reservation else None,
|
|
263
|
+
enable_filestore_csi_driver=args.enable_gcpfilestore_csi_driver,
|
|
181
264
|
capacity_type=capacity_type,
|
|
182
265
|
system_node_pool_machine_type=args.default_pool_cpu_machine_type,
|
|
183
266
|
system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
|
|
267
|
+
gcs_bucket=args.cluster_state_gcs_bucket,
|
|
184
268
|
)
|
|
185
269
|
return None
|
xpk/commands/common.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..core.commands import run_command_with_updates_retry
|
|
18
|
+
from ..core.gcloud_context import zone_to_region
|
|
19
|
+
from ..utils.console import xpk_print
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def set_cluster_command(args) -> int:
|
|
23
|
+
"""Run cluster configuration command to set the kubectl config.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
args: user provided arguments for running the command.
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
0 if successful and 1 otherwise.
|
|
30
|
+
"""
|
|
31
|
+
command = (
|
|
32
|
+
'gcloud container clusters get-credentials'
|
|
33
|
+
f' {args.cluster} --region={zone_to_region(args.zone)}'
|
|
34
|
+
f' --project={args.project} &&'
|
|
35
|
+
' kubectl config view && kubectl config set-context --current'
|
|
36
|
+
' --namespace=default'
|
|
37
|
+
)
|
|
38
|
+
task = f'get-credentials to cluster {args.cluster}'
|
|
39
|
+
return_code = run_command_with_updates_retry(
|
|
40
|
+
command, task, args, verbose=False
|
|
41
|
+
)
|
|
42
|
+
if return_code != 0:
|
|
43
|
+
xpk_print(f'{task} returned ERROR {return_code}')
|
|
44
|
+
return return_code
|
xpk/commands/config.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..core.config import XpkConfig
|
|
18
|
+
from ..utils.console import xpk_print
|
|
19
|
+
|
|
20
|
+
xpk_cfg = XpkConfig()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def set_config(args):
|
|
24
|
+
xpk_cfg.set(args.set_config_args[0], args.set_config_args[1])
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_config(args):
|
|
28
|
+
value = xpk_cfg.get(args.get_config_key[0])
|
|
29
|
+
xpk_print(value)
|