xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +280 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +326 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
- xpk-0.7.1.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/core/storage.py
ADDED
|
@@ -0,0 +1,581 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
from argparse import Namespace
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
import ruamel.yaml
|
|
23
|
+
from google.cloud import storage as gcp_storage
|
|
24
|
+
from kubernetes import client as k8s_client
|
|
25
|
+
from kubernetes import utils
|
|
26
|
+
from kubernetes.client import ApiClient
|
|
27
|
+
from kubernetes.client.models.v1_persistent_volume import V1PersistentVolume
|
|
28
|
+
from kubernetes.client.rest import ApiException
|
|
29
|
+
from kubernetes.utils import FailToCreateError
|
|
30
|
+
from tabulate import tabulate
|
|
31
|
+
|
|
32
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
33
|
+
from ..utils.file import ensure_directory_exists
|
|
34
|
+
from ..utils import templates
|
|
35
|
+
from .cluster import XPK_SA
|
|
36
|
+
|
|
37
|
+
# Shared ruamel YAML handler used for (de)serializing storage manifests.
yaml = ruamel.yaml.YAML()

# Paths relative to this module's directory (joined with dirname at call sites).
STORAGE_CRD_PATH = "/../api/storage_crd.yaml"
STORAGE_TEMPLATE_PATH = "/../templates/storage.yaml"
# Identifiers of the Storage custom resource definition served by the cluster.
XPK_API_GROUP_NAME = "xpk.x-k8s.io"
XPK_API_GROUP_VERSION = "v1"
STORAGE_CRD_KIND = "Storage"
STORAGE_CRD_PLURAL = "storages"
STORAGE_CRD_NAME = f"{XPK_API_GROUP_NAME}.{STORAGE_CRD_PLURAL}"
# Supported storage backend type identifiers.
GCS_FUSE_TYPE = "gcsfuse"
GCP_FILESTORE_TYPE = "gcpfilestore"
# Local directory where generated PV/PVC manifests are persisted.
MANIFESTS_PATH = os.path.abspath("xpkclusters/storage-manifests")
# Pod annotation that enables GKE's GCS Fuse sidecar injection.
GCS_FUSE_ANNOTATION = 'gke-gcsfuse/volumes: "true"'
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class Storage:
  """Represents a Storage custom resource in Kubernetes.

  Attributes:
    name: The name of the Storage resource.
    type: The type of storage (e.g., 'gcsfuse').
    auto_mount: Whether the storage should be automatically mounted to every workload.
    mount_point: The path on which a given storage should be mounted for a workload.
    readonly: Whether the storage is read-only.
    manifest: The path to a yaml file containing PersistentVolume and PersistentVolumeClaim for a given storage.
    pvc: The name of the PersistentVolumeClaim associated with the storage.
    pv: The name of the PersistentVolume associated with the storage.
    bucket: The name of the GCS Fuse bucket / GCP Filestore the PersistentVolume refers to.
  """

  name: str
  type: str
  auto_mount: bool
  mount_point: str
  readonly: bool
  manifest: str
  pvc: str
  pv: str
  bucket: str

  def __init__(self, data: dict):
    """Initializes a Storage object from a dictionary.

    Args:
      data: A dictionary containing the Storage resource definition, as
        returned by the Kubernetes custom objects API (deserialized JSON).
    """
    # Custom objects come back as plain dicts, not typed API models, so
    # `metadata` is annotated as dict (the original V1ObjectMeta annotation
    # was incorrect).
    metadata: dict = data.get("metadata", {})
    self.name = metadata.get("name")
    spec = data.get("spec", {})
    self.type: str = spec.get("type")
    self.auto_mount: bool = spec.get("auto_mount")
    # mount_point is a filesystem path string, not a bool (fixed annotation
    # to match the field declaration above).
    self.mount_point: str = spec.get("mount_point")
    self.readonly: bool = spec.get("readonly")
    self.manifest: str = spec.get("manifest")
    self.pvc: str = spec.get("pvc")
    self.pv: str = spec.get("pv")
    # Resolved from the PersistentVolume definition via the Kubernetes API.
    self.bucket: str = self._get_bucket()

  def fields_as_list(self) -> list[str]:
    """Returns a list of fields for display purposes (one table row).

    Returns:
      A list representing the Storage object's display fields.
    """
    return [
        self.name,
        self.type,
        self.auto_mount,
        self.mount_point,
        self.readonly,
        self.manifest,
    ]

  def _get_bucket(self) -> str:
    """Retrieves the bucket name from the PersistentVolume definition associated with the storage.

    Returns:
      The CSI volume handle (bucket / Filestore instance name), or "" if the
      PersistentVolume could not be read.
    """
    client = k8s_client.CoreV1Api()
    try:
      pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
      return pv.spec.csi.volume_handle
    except ApiException as e:
      # Best-effort: log and fall through to an empty bucket name.
      xpk_print(
          f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
      )
    return ""

  def get_mount_options(self) -> list[str]:
    """Retrieves the mount options for the PersistentVolume.

    Returns:
      A list of mount options, or [] if the PersistentVolume could not be
      read.
    """
    client = k8s_client.CoreV1Api()
    try:
      pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
      return pv.spec.mount_options
    except ApiException as e:
      xpk_print(
          f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
      )
    return []
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def list_storages(k8s_api_client: ApiClient) -> list[Storage]:
  """Lists all Storage custom resources in the cluster.

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.

  Returns:
    A list of Storage objects representing the Storage resources.
  """
  custom_api = k8s_client.CustomObjectsApi(k8s_api_client)
  try:
    response = custom_api.list_cluster_custom_object(
        group=XPK_API_GROUP_NAME,
        version=XPK_API_GROUP_VERSION,
        plural=STORAGE_CRD_PLURAL,
    )
  except ApiException as e:
    xpk_print(f"Kubernetes API exception while listing Storages: {e}")
    if e.status == 404:
      xpk_print("Storages not found, skipping")
      return []
    # Any error other than "not found" is fatal.
    xpk_exit(1)

  # Wrap each raw custom object dict in a Storage instance.
  return [Storage(item) for item in response["items"]]
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def get_auto_mount_storages(k8s_api_client: ApiClient) -> list[Storage]:
  """Retrieves all Storage resources that have --auto-mount flag set to true.

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.

  Returns:
    A list of Storage objects that have `auto_mount` set to True.
  """
  # The strict `is True` identity check is kept deliberately: only an actual
  # boolean True qualifies, not any truthy value.
  return [
      storage
      for storage in list_storages(k8s_api_client)
      if storage.auto_mount is True
  ]
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def get_auto_mount_gcsfuse_storages(k8s_api_client: ApiClient) -> list[Storage]:
  """Retrieves all GCS Fuse Storage resources that have --auto-mount flag set to true.

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.

  Returns:
    A list of GCS Fuse Storage objects that have `auto_mount` set to True.
  """
  # Narrow the auto-mounted set down to the GCS Fuse backend only.
  return [
      storage
      for storage in get_auto_mount_storages(k8s_api_client)
      if storage.type == GCS_FUSE_TYPE
  ]
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def get_storages(
    k8s_api_client: ApiClient, requested_storages: list[str]
) -> list[Storage]:
  """Retrieves a list of Storage resources by their names.

  Exits the program if any requested name does not match an existing Storage
  resource in the cluster.

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
    requested_storages: A list of Storage resource names to retrieve.

  Returns:
    A list of Storage objects matching the given names.
  """
  all_storages = list_storages(k8s_api_client)
  all_storage_names = {storage.name for storage in all_storages}

  # Validate every requested name before returning anything.
  for storage_name in requested_storages:
    if storage_name not in all_storage_names:
      xpk_print(
          f"Storage: {storage_name} not found. Choose one of the available"
          f" storages: {list(all_storage_names)}"
      )
      xpk_exit(1)

  # Preserve the ordering of `all_storages` when filtering.
  return [
      storage for storage in all_storages if storage.name in requested_storages
  ]
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def get_storages_to_mount(
    k8s_api_client: ApiClient, requested_storages: list[str]
) -> list[Storage]:
  """Retrieves a list of Storage resources by their names, including auto-mounted storages.

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
    requested_storages: A list of Storage resource names to retrieve.

  Returns:
    A list of Storage objects matching the given names plus any auto-mounted
    storages not already requested.
  """
  storages = get_storages(k8s_api_client, requested_storages)
  for auto_mounted_stg in get_auto_mount_storages(k8s_api_client):
    # Prevent duplicating storages that were explicitly requested.
    if auto_mounted_stg.name not in requested_storages:
      storages.append(auto_mounted_stg)

  return storages
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def get_storage(k8s_api_client: ApiClient, name: str) -> Storage:
  """Retrieves a specific Storage custom resource by its name.

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
    name: The name of the Storage resource to retrieve.

  Returns:
    A Storage object representing the retrieved Storage resource.
  """
  custom_api = k8s_client.CustomObjectsApi(k8s_api_client)
  try:
    raw_storage = custom_api.get_cluster_custom_object(
        name=name,
        group=XPK_API_GROUP_NAME,
        version=XPK_API_GROUP_VERSION,
        plural=STORAGE_CRD_PLURAL,
    )
  except ApiException as e:
    # Any lookup failure is fatal for callers of this helper.
    xpk_print(f"Kubernetes API exception while getting Storage {name}: {e}")
    xpk_exit(1)
  return Storage(raw_storage)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def install_storage_crd(k8s_api_client: ApiClient) -> None:
  """Installs the Storage custom resource definition (CRD) in the Kubernetes cluster.

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
  """
  xpk_print(f"Creating a new CRD: {STORAGE_CRD_NAME}")
  try:
    # The CRD manifest ships with the package, next to this module.
    utils.create_from_yaml(
        k8s_api_client,
        f"{os.path.dirname(__file__)}{STORAGE_CRD_PATH}",
        verbose=True,
    )
    xpk_print(f"Created a CRD: {STORAGE_CRD_NAME} successfully")
  except FailToCreateError as e:
    # FailToCreateError aggregates one ApiException per failed object.
    # A 409 (Conflict) means the CRD already exists and is harmless; the
    # for/else runs the fatal path only when no 409 was found.
    for api_exception in e.api_exceptions:
      if api_exception.status == 409:
        xpk_print(
            f"CRD: {STORAGE_CRD_NAME} already exists. Skipping its creation"
        )
        break
    else:
      xpk_print(f"Encountered error during installing Storage CRD: {e}")
      xpk_exit(1)
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def get_storage_volume_mounts_yaml(storages: list[Storage]) -> str:
  """Generates the YAML representation of the volumeMounts section for the given Storages.

  This function creates the YAML snippet that defines how the storage volumes
  should be mounted within a Pod's containers.

  Args:
    storages: A list of Storage objects.

  Returns:
    A string containing the YAML representation of the volumeMounts section.
  """
  yaml_str = ""
  for storage in storages:
    # NOTE(review): the leading whitespace embedded in this template must line
    # up with the placeholder's indentation in the workload template it is
    # spliced into -- confirm against the caller before changing it.
    yaml_str += f"""- name: {storage.pv}
  mountPath: {storage.mount_point}
  readOnly: {storage.readonly}
"""
  return yaml_str
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def get_storage_volumes_yaml(storages: list[Storage]) -> str:
  """Generates the YAML representation of the volumes section for the given Storages.

  This function creates the YAML snippet that defines the volumes to be
  mounted in a Pod, including the PersistentVolumeClaim associated with
  each Storage.

  Args:
    storages: A list of Storage objects.

  Returns:
    A string containing the YAML representation of the volumes section.
  """
  yaml_str = ""
  for storage in storages:
    # NOTE(review): the leading whitespace embedded in this template must line
    # up with the placeholder's indentation in the workload template it is
    # spliced into -- confirm against the caller before changing it.
    yaml_str += f"""- name: {storage.pv}
  persistentVolumeClaim:
    claimName: {storage.pvc}
    readOnly: {storage.readonly}
"""
  return yaml_str
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def get_storage_volume_mounts_yaml_for_gpu(storages: list[Storage]) -> str:
  """Generates the YAML representation of the volumeMounts section for the given Storages (GPU workload templates).

  This function creates the YAML snippet that defines how the storage volumes
  should be mounted within a Pod's containers.

  Args:
    storages: A list of Storage objects.

  Returns:
    A string containing the YAML representation of the volumeMounts section.
  """
  yaml_str = ""
  for storage in storages:
    # NOTE(review): this GPU variant likely exists because the GPU workload
    # template places the volumeMounts placeholder at a different indentation
    # level than the non-GPU template -- confirm the embedded whitespace
    # against the GPU template before changing it.
    yaml_str += f"""- name: {storage.pv}
  mountPath: {storage.mount_point}
  readOnly: {storage.readonly}
"""
  return yaml_str
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def get_storage_volumes_yaml_for_gpu(storages: list[Storage]) -> str:
  """Generates the YAML representation of the volumes section for the given Storages (GPU workload templates).

  This function creates the YAML snippet that defines the volumes to be
  mounted in a Pod, including the PersistentVolumeClaim associated with
  each Storage.

  Args:
    storages: A list of Storage objects.

  Returns:
    A string containing the YAML representation of the volumes section.
  """
  yaml_str = ""
  for storage in storages:
    # NOTE(review): this GPU variant likely exists because the GPU workload
    # template places the volumes placeholder at a different indentation
    # level than the non-GPU template -- confirm the embedded whitespace
    # against the GPU template before changing it.
    yaml_str += f"""- name: {storage.pv}
  persistentVolumeClaim:
    claimName: {storage.pvc}
    readOnly: {storage.readonly}
"""
  return yaml_str
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def get_storage_volumes_yaml_dict(storages: list[Storage]) -> list[dict]:
  """Builds the Pod `volumes` entries for the given Storages as plain dicts.

  Args:
    storages: A list of Storage objects.

  Returns:
    One volume definition per storage, each referencing the storage's
    PersistentVolumeClaim by name.
  """
  return [
      {
          "name": storage.pv,
          "persistentVolumeClaim": {
              "claimName": storage.pvc,
              "readOnly": storage.readonly,
          },
      }
      for storage in storages
  ]
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def add_bucket_iam_members(args: Namespace, storages: list[Storage]) -> None:
  """Adds IAM members to the GCS buckets associated with the given Storages.

  Grants the XPK service account access to each GCS Fuse bucket. The role
  (objectViewer vs objectUser) is chosen from the `readonly` attribute of
  each Storage object.

  Args:
    args: An argparse Namespace object containing command-line arguments.
    storages: A list of Storage objects.
  """
  storage_client = gcp_storage.Client()

  # Workload-identity principal of the XPK service account in the default
  # namespace; identical for every bucket, so build it once.
  member = (
      f"principal://iam.googleapis.com/projects/{args.project_number}/"
      f"locations/global/workloadIdentityPools/{args.project}.svc.id.goog/"
      f"subject/ns/default/sa/{XPK_SA}"
  )

  for storage in storages:
    if storage.type != GCS_FUSE_TYPE:
      continue
    role = (
        "roles/storage.objectViewer"
        if storage.readonly
        else "roles/storage.objectUser"
    )
    bucket = storage_client.bucket(storage.bucket)
    policy = bucket.get_iam_policy(requested_policy_version=3)
    policy.bindings.append({"role": role, "members": {member}})
    bucket.set_iam_policy(policy)
    xpk_print(f"Added {member} with role {role} to {storage.bucket}.")
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def print_storages_for_cluster(storages: list[Storage]) -> None:
  """Prints a human-readable table of the given Storage resources.

  Note: the caller is expected to pass storages already scoped to the
  cluster of interest; no filtering happens here (the previous docstring
  documented a nonexistent `cluster` parameter).

  Args:
    storages: A list of Storage objects to display.
  """
  headers = [
      "NAME",
      "TYPE",
      "AUTO MOUNT",
      "MOUNT POINT",
      "READONLY",
      "MANIFEST",
  ]
  # One table row per storage.
  storage_tab = [storage.fields_as_list() for storage in storages]

  print(
      tabulate(
          storage_tab,
          headers=headers,
      )
  )
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def save_manifest(args: Namespace, manifest: list[dict]):
  """Saves manifest to file in xpkclusters/storage-manifests.

  Args:
    args: An argparser Namespace object containing arguments for creating the
      Storage resource.
    manifest: A list of some of: PersistentVolume, PersistentVolumeClaim and
      StorageClass definitions

  Returns:
    manifest_path: Manifest file path
  """
  ensure_directory_exists(MANIFESTS_PATH)
  # File name encodes project/zone/cluster/storage name so manifests for
  # different targets never collide.
  file_name = (
      f"{args.project}-{args.zone}-{args.cluster}-{args.name}-manifest.yaml"
  )
  manifest_path = f"{MANIFESTS_PATH}/{file_name}"
  with open(manifest_path, "w", encoding="utf-8") as manifest_file:
    yaml.dump_all(manifest, manifest_file)
  return manifest_path
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def save_storage_crds(k8s_api_client: ApiClient, data: Any):
  """Saves a new Storage custom resource in the Kubernetes cluster.

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
    data: A dictionary containing data to save.
  """
  custom_api = k8s_client.CustomObjectsApi(k8s_api_client)
  # Storage objects are cluster-scoped, hence the cluster-level call.
  custom_api.create_cluster_custom_object(
      group=XPK_API_GROUP_NAME,
      version=XPK_API_GROUP_VERSION,
      plural=STORAGE_CRD_PLURAL,
      body=data,
  )
  xpk_print(f"Created {STORAGE_CRD_KIND} object: {data['metadata']['name']}")
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def fill_storage_template(
    template: dict, args: Namespace, manifest: list[dict], manifest_path: str
):
  """Populates the storage.yaml template with data.

  Args:
    template: A storage custom resource definition template.
    args: An argparse Namespace object containing the arguments for creating
      the Storage resource.
    manifest: A list of some of: PersistentVolume, PersistentVolumeClaim and
      StorageClass definitions.
    manifest_path: Path of the saved manifest file recorded in the spec.
  """
  template["metadata"]["name"] = args.name
  spec = {
      "auto_mount": args.auto_mount,
      "cluster": args.cluster,
      "mount_point": args.mount_point,
      "readonly": args.readonly,
      "type": args.type,
      "manifest": manifest_path,
  }

  # Record the PV/PVC names defined by the manifest; other kinds (e.g.
  # StorageClass) are ignored.
  for obj in manifest:
    kind = obj["kind"]
    if kind == "PersistentVolume":
      spec["pv"] = obj["metadata"]["name"]
    elif kind == "PersistentVolumeClaim":
      spec["pvc"] = obj["metadata"]["name"]

  template["spec"] = spec
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def create_storage_crds(
    k8s_api_client: ApiClient, args: Namespace, manifest: list[dict]
) -> None:
  """Creates a new Storage custom resource in the Kubernetes cluster.

  Loads the Storage template from its YAML file, persists the manifest,
  fills the template with values from the provided arguments, and creates
  the Storage object in the cluster.

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
    args: An argparse Namespace object containing the arguments for creating
      the Storage resource.
    manifest: A list of some of: PersistentVolume, PersistentVolumeClaim and
      StorageClass definitions.
  """
  try:
    storage_crd = templates.load(STORAGE_TEMPLATE_PATH)

    saved_manifest_path = save_manifest(args, manifest)
    fill_storage_template(storage_crd, args, manifest, saved_manifest_path)
    save_storage_crds(k8s_api_client, storage_crd)
  except ApiException as e:
    # 409 Conflict: the Storage already exists -- treat as a benign no-op.
    if e.status == 409:
      xpk_print(f"Storage: {args.name} already exists. Skipping its creation")
    else:
      xpk_print(f"Encountered error during storage creation: {e}")
      xpk_exit(1)
|
|
@@ -99,7 +99,44 @@ IN MaxText/accelerator_to_spec_map.py !!!!! """
|
|
|
99
99
|
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
|
100
100
|
UserFacingNameToSystemCharacteristics = {
|
|
101
101
|
# GPU system characteristics
|
|
102
|
-
#
|
|
102
|
+
# l4-$CHIPSc
|
|
103
|
+
'l4-1': SystemCharacteristics(
|
|
104
|
+
'N/A',
|
|
105
|
+
1,
|
|
106
|
+
'nvidia-l4',
|
|
107
|
+
'g2-standard-12',
|
|
108
|
+
1,
|
|
109
|
+
AcceleratorType['GPU'],
|
|
110
|
+
'l4-1',
|
|
111
|
+
),
|
|
112
|
+
'l4-2': SystemCharacteristics(
|
|
113
|
+
'N/A',
|
|
114
|
+
1,
|
|
115
|
+
'nvidia-l4',
|
|
116
|
+
'g2-standard-24',
|
|
117
|
+
2,
|
|
118
|
+
AcceleratorType['GPU'],
|
|
119
|
+
'l4-2',
|
|
120
|
+
),
|
|
121
|
+
'l4-4': SystemCharacteristics(
|
|
122
|
+
'N/A',
|
|
123
|
+
1,
|
|
124
|
+
'nvidia-l4',
|
|
125
|
+
'g2-standard-48',
|
|
126
|
+
4,
|
|
127
|
+
AcceleratorType['GPU'],
|
|
128
|
+
'l4-4',
|
|
129
|
+
),
|
|
130
|
+
'l4-8': SystemCharacteristics(
|
|
131
|
+
'N/A',
|
|
132
|
+
1,
|
|
133
|
+
'nvidia-l4',
|
|
134
|
+
'g2-standard-96',
|
|
135
|
+
8,
|
|
136
|
+
AcceleratorType['GPU'],
|
|
137
|
+
'l4-8',
|
|
138
|
+
),
|
|
139
|
+
# A100-40gb-$CHIPSc
|
|
103
140
|
'a100-40gb-1': SystemCharacteristics(
|
|
104
141
|
'N/A',
|
|
105
142
|
1,
|