xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +280 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +326 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
- xpk-0.7.1.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/core/pathways.py
CHANGED
|
@@ -14,13 +14,13 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from .cluster import XPK_SA
|
|
18
|
+
from ..core.docker_container import get_user_workload_container
|
|
19
|
+
from ..core.gcloud_context import zone_to_region
|
|
20
|
+
from ..core.nodepool import get_all_nodepools_programmatic
|
|
17
21
|
from ..utils.console import xpk_exit, xpk_print
|
|
18
|
-
from .
|
|
19
|
-
|
|
20
|
-
get_all_nodepools_programmatic,
|
|
21
|
-
get_user_workload_container,
|
|
22
|
-
zone_to_region,
|
|
23
|
-
)
|
|
22
|
+
from .config import AcceleratorType
|
|
23
|
+
from .storage import Storage, get_storage_volumes_yaml, GCS_FUSE_ANNOTATION
|
|
24
24
|
from .system_characteristics import SystemCharacteristics
|
|
25
25
|
|
|
26
26
|
PathwaysExpectedInstancesMap = {
|
|
@@ -41,9 +41,11 @@ def get_pathways_worker_args(args) -> str:
|
|
|
41
41
|
str: yaml containing arguments for the Pathways workers.
|
|
42
42
|
"""
|
|
43
43
|
yaml = """- --server_port=29001
|
|
44
|
-
|
|
45
|
-
|
|
44
|
+
- --resource_manager_address={rm_address}
|
|
45
|
+
- --gcs_scratch_location={args.pathways_gcs_location}"""
|
|
46
46
|
if args.use_pathways:
|
|
47
|
+
if args.custom_pathways_worker_args:
|
|
48
|
+
yaml = append_custom_pathways_args(yaml, args.custom_pathways_worker_args)
|
|
47
49
|
return yaml.format(args=args, rm_address=get_rm_address(args))
|
|
48
50
|
else:
|
|
49
51
|
return ''
|
|
@@ -58,15 +60,53 @@ def get_pathways_proxy_args(args) -> str:
|
|
|
58
60
|
str: yaml containing arguments for the Pathways proxy.
|
|
59
61
|
"""
|
|
60
62
|
yaml = """- --server_port=29000
|
|
61
|
-
|
|
62
|
-
|
|
63
|
+
- --resource_manager_address={rm_address}
|
|
64
|
+
- --gcs_scratch_location={args.pathways_gcs_location}"""
|
|
63
65
|
|
|
64
66
|
if args.use_pathways:
|
|
67
|
+
if args.custom_pathways_proxy_server_args:
|
|
68
|
+
yaml = append_custom_pathways_args(
|
|
69
|
+
yaml, args.custom_pathways_proxy_server_args
|
|
70
|
+
)
|
|
65
71
|
return yaml.format(args=args, rm_address=get_rm_address(args))
|
|
66
72
|
else:
|
|
67
73
|
return ''
|
|
68
74
|
|
|
69
75
|
|
|
76
|
+
def get_pathways_sidecar_container(args) -> str:
|
|
77
|
+
"""This is a sidecar container that runs the remote python server.
|
|
78
|
+
|
|
79
|
+
It is a special case of the initContainer (designated by restartPolicy:
|
|
80
|
+
Always)
|
|
81
|
+
See https://kubernetes.io/docs/concepts/workloads/pods/sidecar-containers/
|
|
82
|
+
for more details.
|
|
83
|
+
Args:
|
|
84
|
+
args: user provided arguments for running the command.
|
|
85
|
+
|
|
86
|
+
Returns:
|
|
87
|
+
str: yaml containing arguments for the Pathways sidecar container.
|
|
88
|
+
"""
|
|
89
|
+
yaml = """initContainers:
|
|
90
|
+
- name: remote-python-sidecar
|
|
91
|
+
image: {args.remote_python_sidecar_image}
|
|
92
|
+
imagePullPolicy: Always
|
|
93
|
+
securityContext:
|
|
94
|
+
privileged: true
|
|
95
|
+
volumeMounts:
|
|
96
|
+
- mountPath: /tmp # Shared volume mount with the main container.
|
|
97
|
+
name: shared-tmp
|
|
98
|
+
restartPolicy: Always
|
|
99
|
+
ports:
|
|
100
|
+
- containerPort: 50051
|
|
101
|
+
env:
|
|
102
|
+
- name: GRPC_SERVER_ADDRESS
|
|
103
|
+
value: '0.0.0.0:50051'"""
|
|
104
|
+
if args.use_pathways and args.remote_python_sidecar_image is not None:
|
|
105
|
+
return yaml.format(args=args)
|
|
106
|
+
else:
|
|
107
|
+
return ''
|
|
108
|
+
|
|
109
|
+
|
|
70
110
|
def add_pw_resource_flavors(args):
|
|
71
111
|
"""Add resource flavors required for Pathways enabled clusters."""
|
|
72
112
|
resource_flavor_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
|
|
@@ -105,9 +145,9 @@ def add_pw_resources_to_kueue(args):
|
|
|
105
145
|
- name: cpu-rm
|
|
106
146
|
resources:
|
|
107
147
|
- name: "cpu"
|
|
108
|
-
nominalQuota:
|
|
148
|
+
nominalQuota: 480
|
|
109
149
|
- name: "memory"
|
|
110
|
-
nominalQuota:
|
|
150
|
+
nominalQuota: 2000G
|
|
111
151
|
- name: cpu-proxy
|
|
112
152
|
resources:
|
|
113
153
|
- name: "cpu"
|
|
@@ -166,23 +206,17 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
|
|
|
166
206
|
# Set the job which determines the life of other Pathways jobs
|
|
167
207
|
args.targetReplicatedJob = 'proxy' if args.headless else 'main'
|
|
168
208
|
|
|
169
|
-
# Always report user code failures back to JobSet.
|
|
170
|
-
args.restart_on_user_code_failure = True
|
|
171
|
-
|
|
172
209
|
return True
|
|
173
210
|
|
|
174
211
|
|
|
175
212
|
def get_pathways_unified_query_link(args) -> str:
|
|
176
213
|
"""Get the unified query link for the pathways workload."""
|
|
177
|
-
pw_suffixes = ['main', 'rm', 'proxy']
|
|
178
|
-
pw_pod_names = [f'"{args.workload}-{suffix}-0"' for suffix in pw_suffixes]
|
|
179
|
-
pw_pod_names_query = '%20OR%20'.join(pw_pod_names + ['worker-0-0'])
|
|
180
214
|
query_params = (
|
|
181
215
|
'resource.type%3D"k8s_container"%0A'
|
|
182
216
|
f'resource.labels.project_id%3D"{args.project}"%0A'
|
|
183
217
|
f'resource.labels.location%3D"{zone_to_region(args.zone)}"%0A'
|
|
184
218
|
f'resource.labels.cluster_name%3D"{args.cluster}"%0A'
|
|
185
|
-
f'resource.labels.pod_name:{
|
|
219
|
+
f'resource.labels.pod_name:"{args.workload}-"%0A'
|
|
186
220
|
'severity>%3DDEFAULT'
|
|
187
221
|
)
|
|
188
222
|
|
|
@@ -198,11 +232,13 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
|
|
|
198
232
|
str: yaml containing arguments for the Pathways resource manager.
|
|
199
233
|
"""
|
|
200
234
|
yaml = """- --server_port=29001
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
235
|
+
- --gcs_scratch_location={args.pathways_gcs_location}
|
|
236
|
+
- --node_type=resource_manager
|
|
237
|
+
- --instance_count={instance_count}
|
|
238
|
+
- --instance_type={instance_type}"""
|
|
205
239
|
if args.use_pathways:
|
|
240
|
+
if args.custom_pathways_server_args:
|
|
241
|
+
yaml = append_custom_pathways_args(yaml, args.custom_pathways_server_args)
|
|
206
242
|
return yaml.format(
|
|
207
243
|
args=args,
|
|
208
244
|
instance_count=args.num_slices,
|
|
@@ -212,7 +248,34 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
|
|
|
212
248
|
return ''
|
|
213
249
|
|
|
214
250
|
|
|
215
|
-
def
|
|
251
|
+
def append_custom_pathways_args(yaml, custom_args) -> str:
  """Append custom Pathways args to the YAML with proper indentation.

  Every whitespace-separated token in `custom_args` becomes one extra
  `- <token>` list item. The indentation of the new items is copied from the
  second line of `yaml`.

  Args:
    yaml (string): existing yaml containing args.
    custom_args (string): whitespace-separated custom args to append.

  Returns:
    yaml (string): yaml with additional args appended. The input is returned
      unchanged when it has no non-empty second line to copy the indentation
      from (working that out would require looking at the entire YAML).
  """
  lines = yaml.split('\n')
  # A single-line YAML has no second line; indexing it would raise IndexError.
  if len(lines) < 2 or not lines[1]:
    return yaml
  second_line = lines[1]
  # Calculate the indentation based on the second line of existing YAML.
  indentation = ' ' * (len(second_line) - len(second_line.lstrip()))
  # split() without a separator drops the empty tokens produced by repeated,
  # leading or trailing whitespace, which would otherwise become empty items.
  extra_items = [
      '\n' + indentation + '- ' + arg for arg in custom_args.split()
  ]
  return yaml + ''.join(extra_items)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def get_user_workload_for_pathways(
|
|
274
|
+
args,
|
|
275
|
+
system: SystemCharacteristics,
|
|
276
|
+
pod_failure_policy,
|
|
277
|
+
storages: list[Storage],
|
|
278
|
+
) -> str:
|
|
216
279
|
"""
|
|
217
280
|
Create a user workload container for Pathways.
|
|
218
281
|
Don't create one for Pathways headless mode.
|
|
@@ -227,32 +290,48 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
|
|
|
227
290
|
Pathways server port as a YAML string
|
|
228
291
|
"""
|
|
229
292
|
user_workload_yaml = """- name: main
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
293
|
+
replicas: 1
|
|
294
|
+
template:
|
|
295
|
+
metadata:
|
|
296
|
+
labels:
|
|
297
|
+
xpk.google.com/workload: {args.workload}
|
|
298
|
+
spec:
|
|
299
|
+
backoffLimit: 0
|
|
300
|
+
completions: 1
|
|
301
|
+
parallelism: 1
|
|
302
|
+
{pod_failure_policy}
|
|
303
|
+
template:
|
|
304
|
+
metadata:
|
|
305
|
+
annotations:
|
|
306
|
+
{gcs_fuse_annotation}
|
|
307
|
+
spec:
|
|
308
|
+
containers:
|
|
242
309
|
{container}
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
310
|
+
serviceAccountName: {service_account}
|
|
311
|
+
nodeSelector:
|
|
312
|
+
cloud.google.com/gke-nodepool: cpu-user-np
|
|
313
|
+
hostNetwork: true
|
|
314
|
+
dnsPolicy: ClusterFirstWithHostNet
|
|
315
|
+
restartPolicy: Never
|
|
316
|
+
volumes:
|
|
317
|
+
- hostPath:
|
|
318
|
+
path: /tmp
|
|
319
|
+
type: DirectoryOrCreate
|
|
320
|
+
name: shared-tmp
|
|
321
|
+
{storage_volumes}"""
|
|
251
322
|
if args.headless:
|
|
252
323
|
return ''
|
|
253
324
|
else:
|
|
254
325
|
container, _ = get_user_workload_container(args, system)
|
|
255
|
-
|
|
326
|
+
storage_volumes = get_storage_volumes_yaml(storages)
|
|
327
|
+
return user_workload_yaml.format(
|
|
328
|
+
args=args,
|
|
329
|
+
container=container,
|
|
330
|
+
storage_volumes=storage_volumes,
|
|
331
|
+
pod_failure_policy=pod_failure_policy,
|
|
332
|
+
service_account=XPK_SA,
|
|
333
|
+
gcs_fuse_annotation=GCS_FUSE_ANNOTATION,
|
|
334
|
+
)
|
|
256
335
|
|
|
257
336
|
|
|
258
337
|
def get_rm_address(args) -> str:
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .remote_state_client import RemoteStateClient
|
|
18
|
+
from ...utils.gcs_utils import upload_directory_to_gcs, check_file_exists, download_bucket_to_dir, upload_file_to_gcs
|
|
19
|
+
from ...utils.console import xpk_print
|
|
20
|
+
from google.cloud.storage import Client
|
|
21
|
+
import os
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class FuseStateClient(RemoteStateClient):
  """FuseStateClient is a class for managing remote xpk state stored in GCS Fuse.

  State is kept in the bucket under
  `xpk_terraform_state/<prefix>/blueprints/`: the state directory is stored
  in a `<deployment_name>/` sub-path and the blueprint file next to it as
  `<deployment_name>.yaml`.
  """

  def __init__(
      self,
      bucket: str,
      state_directory: str,
      cluster: str,
      deployment_name: str,
      prefix: str,
  ) -> None:
    """Store the configuration and create a GCS storage client.

    Args:
      bucket: name of the bucket passed to the GCS helper functions.
      state_directory: local directory that is uploaded to / downloaded from
        the bucket.
      cluster: cluster name; stored on the instance but not read by the
        methods of this class.
      deployment_name: names the sub-path inside the bucket and the blueprint
        yaml file.
      prefix: path component placed directly under `xpk_terraform_state/`.
    """
    self.bucket = bucket
    self.state_dir = state_directory
    self.storage_client = Client()
    self.cluster = cluster
    self.prefix = prefix
    self.deployment_name = deployment_name

  def _get_bucket_path(self) -> str:
    """Return the bucket path under which the state directory is stored."""
    return f'{self._get_bucket_path_blueprint()}{self.deployment_name}/'

  def _get_bucket_path_blueprint(self) -> str:
    """Return the bucket path that holds blueprint files."""
    return f'xpk_terraform_state/{self.prefix}/blueprints/'

  def _get_deployment_filename(self) -> str:
    """Return the blueprint file name for this deployment."""
    return f'{self.deployment_name}.yaml'

  def _get_blueprint_object_path(self) -> str:
    """Return the full bucket path of the blueprint file."""
    return self._get_bucket_path_blueprint() + self._get_deployment_filename()

  def _get_blueprint_path(self) -> str:
    """Return the local blueprint path, located beside the state directory."""
    # Drop the last '/'-separated component of the state directory to get the
    # directory that contains it.
    blueprint_dir = '/'.join(self.state_dir.split('/')[:-1])
    return os.path.join(blueprint_dir, self.deployment_name) + '.yaml'

  def upload_state(self) -> None:
    """Upload the state directory, then the blueprint file, to the bucket."""
    xpk_print(
        f'Uploading dependencies from directory {self.state_dir} to bucket:'
        f' {self.bucket}. Path within bucket is: {self._get_bucket_path()}'
    )
    upload_directory_to_gcs(
        storage_client=self.storage_client,
        bucket_name=self.bucket,
        bucket_path=self._get_bucket_path(),
        source_directory=self.state_dir,
    )
    blueprint_bucket_path = self._get_blueprint_object_path()
    xpk_print(
        f'Uploading blueprint file: {self._get_blueprint_path()} to bucket'
        f' {self.bucket}. Path within bucket is: {blueprint_bucket_path}'
    )
    upload_file_to_gcs(
        storage_client=self.storage_client,
        bucket_name=self.bucket,
        bucket_path=blueprint_bucket_path,
        file=self._get_blueprint_path(),
    )

  def download_state(self) -> None:
    """Download the stored state from the bucket into the state directory."""
    xpk_print(
        f'Downloading from bucket: {self.bucket}, from path:'
        f' {self._get_bucket_path()} to directory: {self.state_dir}'
    )
    download_bucket_to_dir(
        self.storage_client,
        self.bucket,
        self._get_bucket_path(),
        destination_directory=self.state_dir,
    )

  def check_remote_state_exists(self) -> bool:
    """Return whether the blueprint file for this deployment is in the bucket."""
    return check_file_exists(
        self.storage_client,
        self.bucket,
        self._get_blueprint_object_path(),
    )
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class RemoteStateClient(ABC):
  """Abstract interface for clients that manage remote cluster state.

  Subclasses have to implement every method below before they can be
  instantiated.
  """

  @abstractmethod
  def upload_state(self) -> None:
    """Upload state to remote storage"""

  @abstractmethod
  def download_state(self) -> None:
    """Download state from remote storage"""

  @abstractmethod
  def check_remote_state_exists(self) -> bool:
    """Tell whether remote state exists; this base implementation says no."""
    return False
|
xpk/core/resources.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
|
|
19
|
+
from ..utils.console import xpk_print
|
|
20
|
+
from ..utils.file import write_tmp_file
|
|
21
|
+
from .capacity import (
|
|
22
|
+
AUTOPROVISIONING_CONFIG_MAXIMUM_KEY,
|
|
23
|
+
AUTOPROVISIONING_CONFIG_MINIMUM_KEY,
|
|
24
|
+
AUTOPROVISIONING_CONFIG_VALUE,
|
|
25
|
+
CAPACITY_TYPE_CONFIG_KEY,
|
|
26
|
+
RESERVATION_CONFIG_KEY,
|
|
27
|
+
CapacityType,
|
|
28
|
+
get_capacity_type,
|
|
29
|
+
)
|
|
30
|
+
from .commands import run_command_for_value, run_commands
|
|
31
|
+
from .config import XPK_CURRENT_VERSION
|
|
32
|
+
from .system_characteristics import AcceleratorType, get_system_characteristics_by_device_type, SystemCharacteristics
|
|
33
|
+
|
|
34
|
+
CLUSTER_RESOURCES_CONFIGMAP = 'resources-configmap'
|
|
35
|
+
CLUSTER_METADATA_CONFIGMAP = 'metadata-configmap'
|
|
36
|
+
|
|
37
|
+
CLUSTER_CONFIGMAP_YAML = """kind: ConfigMap
|
|
38
|
+
apiVersion: v1
|
|
39
|
+
metadata:
|
|
40
|
+
name: {name}
|
|
41
|
+
data:
|
|
42
|
+
{data}
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class AutoprovisioningConfig:
  """Values describing an autoprovisioning setup."""

  # NOTE(review): presumably the name of the generated autoprovisioning
  # config file - confirm with the code that builds this object.
  config_filename: str
  # Chip bounds of the autoprovisioning configuration.
  minimum_chips: int
  maximum_chips: int
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
  """Run the Get GKE Cluster ConfigMap request.

  Args:
    args: user provided arguments for running the command.
    configmap_name: name of the configmap.

  Returns:
    key:value pairs stored in cluster ConfigMap, or None when the kubectl
    command fails. An empty dict is returned when the command output holds no
    `map[...]` data.
  """
  command = (
      'kubectl get configmap'
      f' {configmap_name} -o=custom-columns="ConfigData:data" --no-headers=true'
  )

  return_code, return_value = run_command_for_value(
      command, 'GKE Cluster Get ConfigMap', args
  )
  if return_code != 0:
    xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
    return None

  config_map = {}
  return_value = return_value.strip()

  # Format of ConfigMap: map[key1:value1 key2:value2]
  marker = 'map['
  map_start = return_value.find(marker)
  if map_start == -1:
    # Nothing that looks like ConfigMap data was printed.
    return config_map

  entries = return_value[map_start + len(marker) :]
  if entries.endswith(']'):
    entries = entries[:-1]

  # split() skips empty tokens, so an empty map ("map[]") yields no entries.
  # NOTE: values that themselves contain whitespace cannot be told apart from
  # separate entries in this output format.
  for config in entries.split():
    # Split on the first ':' only so that values containing ':' stay intact.
    key, _, value = config.partition(':')
    config_map[key] = value
  return config_map
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def create_cluster_configmaps(
|
|
90
|
+
args,
|
|
91
|
+
system,
|
|
92
|
+
tensorboard_config: dict,
|
|
93
|
+
autoprovisioning_config: AutoprovisioningConfig | None,
|
|
94
|
+
) -> int:
|
|
95
|
+
"""Run the Create GKE Cluster ConfigMap request.
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
args: user provided arguments for running the command.
|
|
99
|
+
system: system characteristics.
|
|
100
|
+
tensorboard_config: map that contains Vertex Tensorboard name, id and location
|
|
101
|
+
autoprovisioning_config: Config used in autoprovisioning.
|
|
102
|
+
Returns:
|
|
103
|
+
0 if successful and 1 otherwise.
|
|
104
|
+
"""
|
|
105
|
+
configmap_yml = {}
|
|
106
|
+
|
|
107
|
+
# ConfigMap to store resources available in the cluster.
|
|
108
|
+
device_type = system.device_type
|
|
109
|
+
if system.accelerator_type == AcceleratorType['GPU']:
|
|
110
|
+
resources_data = f'{device_type}: "{int(args.num_nodes)}"'
|
|
111
|
+
elif (
|
|
112
|
+
not args.enable_pathways
|
|
113
|
+
and args.enable_autoprovisioning
|
|
114
|
+
and autoprovisioning_config
|
|
115
|
+
):
|
|
116
|
+
# Currently autoprovisioning is not supported with Pathways.
|
|
117
|
+
# Auto provisioning will have variable topologies for a gke accelerator type.
|
|
118
|
+
resources_data = (
|
|
119
|
+
f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
|
|
120
|
+
)
|
|
121
|
+
resources_data += (
|
|
122
|
+
f'\n {AUTOPROVISIONING_CONFIG_MINIMUM_KEY}:'
|
|
123
|
+
f' "{autoprovisioning_config.minimum_chips}"'
|
|
124
|
+
)
|
|
125
|
+
resources_data += (
|
|
126
|
+
f'\n {AUTOPROVISIONING_CONFIG_MAXIMUM_KEY}:'
|
|
127
|
+
f' "{autoprovisioning_config.maximum_chips}"'
|
|
128
|
+
)
|
|
129
|
+
else:
|
|
130
|
+
resources_data = (
|
|
131
|
+
f'{device_type}: "{int(args.num_slices) * system.vms_per_slice}"'
|
|
132
|
+
)
|
|
133
|
+
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
|
|
134
|
+
resources_yml = CLUSTER_CONFIGMAP_YAML.format(
|
|
135
|
+
args=args, name=resources_configmap_name, data=resources_data
|
|
136
|
+
)
|
|
137
|
+
configmap_yml[resources_configmap_name] = resources_yml
|
|
138
|
+
|
|
139
|
+
# ConfigMap to store cluster metadata.
|
|
140
|
+
# XPK Version.
|
|
141
|
+
metadata = f'xpk_version: {XPK_CURRENT_VERSION}'
|
|
142
|
+
# Vertex Tensorboard information
|
|
143
|
+
for key, value in tensorboard_config.items():
|
|
144
|
+
metadata += f'\n {key}: "{value}"'
|
|
145
|
+
# Capacity Type.
|
|
146
|
+
capacity_type, return_code = get_capacity_type(args)
|
|
147
|
+
if return_code != 0:
|
|
148
|
+
xpk_print('Unable to determine capacity type.')
|
|
149
|
+
return return_code
|
|
150
|
+
metadata += f'\n {CAPACITY_TYPE_CONFIG_KEY}: {capacity_type.name}'
|
|
151
|
+
# Reservation ID if applicable.
|
|
152
|
+
if capacity_type == CapacityType.RESERVATION:
|
|
153
|
+
metadata += f'\n {RESERVATION_CONFIG_KEY}: {args.reservation}'
|
|
154
|
+
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
|
|
155
|
+
metadata_yml = CLUSTER_CONFIGMAP_YAML.format(
|
|
156
|
+
args=args, name=metadata_configmap_name, data=metadata
|
|
157
|
+
)
|
|
158
|
+
configmap_yml[metadata_configmap_name] = metadata_yml
|
|
159
|
+
return create_or_update_cluster_configmap(configmap_yml)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
  """Apply every given ConfigMap manifest with `kubectl apply`.

  Args:
    configmap_yml: dict containing ConfigMap name and yml string.

  Returns:
    0 if successful, 1 otherwise.
  """
  apply_commands = []
  apply_task_names = []
  for name, manifest in configmap_yml.items():
    # Each manifest goes to its own temporary file that kubectl reads.
    manifest_file = write_tmp_file(manifest)
    apply_commands.append(f'kubectl apply -f {str(manifest_file.file.name)}')
    apply_task_names.append(f'ConfigMap CreateOrUpdate-{name}')

  status = run_commands(
      apply_commands,
      'GKE Cluster CreateOrUpdate ConfigMap(s)',
      apply_task_names,
  )
  if status == 0:
    return 0

  xpk_print(
      'GKE Cluster Create/Update ConfigMap(s) request returned ERROR'
      f' {status}'
  )
  return 1
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def check_cluster_resources(args, system) -> tuple[bool, bool]:
  """Check if cluster has resources of a specified device_type/gke_accelerator.
  This check will be skipped if <args.cluster>-<CLUSTER_RESOURCES_CONFIGMAP> ConfigMap doesn't exist for the cluster.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    Tuple of bool, bool
    True if resources in the cluster should be checked, False otherwise.
    True if device_type/gke_accelerator exists in the cluster, False otherwise.
  """
  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  resources_config_map = get_cluster_configmap(args, resources_configmap_name)
  if resources_config_map is None:
    # Report the name that was looked up; the lookup result itself is None
    # on this path and would print as "None".
    xpk_print(
        f'No ConfigMap exist for cluster with the name {resources_configmap_name}.'
        ' Cluster resources check will be skipped.'
    )
    return False, False

  # The ConfigMap may be keyed by either the device type or the GKE
  # accelerator name, so either one counts as a match.
  resource_found = (
      system.device_type in resources_config_map
      or system.gke_accelerator in resources_config_map
  )
  return True, resource_found
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
  """Derive SystemCharacteristics from the cluster resources ConfigMap.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The system characteristics of the first ConfigMap key that is recognised
    as a device type, or None when the ConfigMap cannot be read or no key
    matches.
  """
  configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  resources = get_cluster_configmap(args, configmap_name)
  if resources is None:
    return None

  # Keys are probed lazily and in order; the search stops at the first key
  # whose lookup reports result code 0.
  lookups = map(get_system_characteristics_by_device_type, resources)
  return next((system for system, code in lookups if code == 0), None)
|