xpk-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +131 -0
- xpk/commands/cluster.py +808 -0
- xpk/commands/cluster_gcluster.py +269 -0
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +243 -0
- xpk/commands/inspector.py +357 -0
- xpk/commands/job.py +199 -0
- xpk/commands/kind.py +283 -0
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +140 -0
- xpk/commands/storage.py +267 -0
- xpk/commands/version.py +27 -0
- xpk/commands/workload.py +889 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +62 -0
- xpk/core/blueprint/blueprint_generator.py +708 -0
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +200 -0
- xpk/core/commands.py +356 -0
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +176 -0
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +444 -0
- xpk/core/kueue.py +358 -0
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +361 -0
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +377 -0
- xpk/core/ray.py +222 -0
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +1432 -0
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +341 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +129 -0
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
- xpk/main.py +75 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +43 -0
- xpk/parser/cluster.py +662 -0
- xpk/parser/common.py +259 -0
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +135 -0
- xpk/parser/info.py +64 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +147 -0
- xpk/parser/kind.py +95 -0
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +59 -0
- xpk/parser/storage.py +316 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +726 -0
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +88 -0
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- xpk/utils/yaml.py +30 -0
- xpk-0.0.1.dist-info/LICENSE +202 -0
- xpk-0.0.1.dist-info/METADATA +1498 -0
- xpk-0.0.1.dist-info/RECORD +92 -0
- xpk-0.0.1.dist-info/WHEEL +5 -0
- xpk-0.0.1.dist-info/entry_points.txt +2 -0
- xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/commands/cluster_gcluster.py
ADDED
@@ -0,0 +1,269 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os

from ..core.remote_state.remote_state_client import RemoteStateClient
from ..core.remote_state.fuse_remote_state import FuseStateClient
from ..core.blueprint.blueprint_generator import (
    BlueprintGenerator,
    BlueprintGeneratorOutput,
    a3mega_device_type,
    a3ultra_device_type,
    supported_device_types,
)
from ..core.commands import run_command_for_value
from ..core.capacity import get_capacity_type
from ..core.docker_manager import DockerManager
from ..core.gcloud_context import zone_to_region
from ..core.gcluster_manager import GclusterManager
from ..utils.console import xpk_exit, xpk_print
from ..utils.file import ensure_directory_exists
from ..utils.network import all_IPs_cidr
from ..utils.objects import hash_string
from ..core.cluster import get_cluster_credentials
from ..core.kjob import apply_kjob_crds, prepare_kjob

blueprints_path = os.path.abspath('xpkclusters/blueprints')
gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
gcloud_cfg_path = os.path.expanduser('~/.config/gcloud')


def cluster_create(args) -> None:
  """Function around cluster creation using Cluster toolkit.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  check_gcloud_authenticated()
  prepare_directories()
  region = zone_to_region(args.zone)

  # unique_name uses shortened hash string, so still name collision is possible
  unique_name = get_unique_name(args.project, region, args.cluster)
  # prefix is to prevent name collisions for blueprints and also deployments by storing them in prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
  prefix = get_prefix_path(args.project, region)
  remote_state_client = None
  if args.cluster_state_gcs_bucket is not None:
    remote_state_client = FuseStateClient(
        bucket=args.cluster_state_gcs_bucket,
        state_directory=os.path.join(blueprints_path, prefix, unique_name),
        prefix=prefix,
        cluster=args.cluster,
        deployment_name=unique_name,
    )
  gcm = prepare_gcluster_manager(remote_state_client)

  bp = generate_blueprint(blueprint_name=unique_name, args=args, prefix=prefix)

  # staging: sending the blueprint file(s) to gcluster's working directory
  bp_staged_path = gcm.stage_files(
      blueprint_file=bp.blueprint_file,
      blueprint_dependencies=bp.blueprint_dependencies,
      prefix=prefix,
  )
  gcm.deploy(
      blueprint_path=bp_staged_path,
      deployment_name=unique_name,
      prefix=prefix,
  )
  if args.cluster_state_gcs_bucket is not None:
    gcm.upload_state()

  get_cluster_credentials(args)

  err_code = apply_kjob_crds(args)
  if err_code > 0:
    xpk_exit(err_code)

  err_code = prepare_kjob(args)
  if err_code > 0:
    xpk_exit(err_code)

  xpk_exit(0)


def cluster_delete(args) -> None:
  """Function around cluster delete for the clusters created by Cluster toolkit.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  check_gcloud_authenticated()
  prepare_directories()
  region = zone_to_region(args.zone)
  unique_name = get_unique_name(args.project, region, args.cluster)
  # prefix is to prevent name collisions for blueprints and also deployments by storing them in prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
  prefix = get_prefix_path(args.project, region)
  remote_state_client = None
  if args.cluster_state_gcs_bucket is not None:
    remote_state_client = FuseStateClient(
        bucket=args.cluster_state_gcs_bucket,
        state_directory=os.path.join(blueprints_path, prefix, unique_name),
        prefix=prefix,
        cluster=args.cluster,
        deployment_name=unique_name,
    )
  gcm = prepare_gcluster_manager(remote_state_client)

  # unique_name uses shortened hash string, so still name collision is possible
  unique_name = get_unique_name(args.project, region, args.cluster)
  # prefix is to prevent name collisions for blueprints and also deployments by storing them in prefix directory. Ex.: blueprints/{prefix}/cluster_name_hash
  prefix = get_prefix_path(args.project, region)
  if args.cluster_state_gcs_bucket is not None:
    gcm.download_state()

  bp = BlueprintGeneratorOutput(
      blueprint_file=os.path.join(blueprints_path, prefix, unique_name)
      + '.yaml',
      blueprint_dependencies=os.path.join(
          blueprints_path, prefix, unique_name
      ),
  )

  gcm.stage_files(
      blueprint_file=bp.blueprint_file,
      blueprint_dependencies=bp.blueprint_dependencies,
      prefix=prefix,
  )
  gcm.destroy_deployment(deployment_name=unique_name, prefix=prefix)

  xpk_exit(0)


def created_by_gcluster(args) -> bool:
  prepare_directories()
  region = zone_to_region(args.zone)
  unique_name = get_unique_name(args.project, region, args.cluster)
  prefix = get_prefix_path(args.project, region)
  bpg = prepare_blueprint_generator()
  return bpg.blueprint_exists(unique_name, prefix)


def get_unique_name(project_id, region, cluster_name):
  unique_string_hash = hash_string(
      input_string=f'{project_id}-{region}-{cluster_name}'.lower(), length=5
  )
  return f'{cluster_name}-{unique_string_hash}'


def get_prefix_path(project_id, region):
  return f'{project_id}-{region}'.lower()


def prepare_directories() -> None:
  ensure_directory_exists(blueprints_path)
  ensure_directory_exists(gcluster_working_dir)


def check_gcloud_authenticated():
  if not os.path.exists(gcloud_cfg_path):
    xpk_print(
        'Failed to find gcloud credential directory.'
        f' {gcloud_cfg_path} {blueprints_path} {gcluster_working_dir}'
    )
    xpk_print(
        'Please authenticate to gcloud ("gcloud auth application-default'
        ' login") and then run your command.'
    )
    xpk_exit(1)


def prepare_gcluster_manager(
    remote_state_client: RemoteStateClient | None,
) -> GclusterManager:
  dm = DockerManager(
      working_dir=gcluster_working_dir, gcloud_cfg_path=gcloud_cfg_path
  )
  dm.initialize()
  return GclusterManager(
      gcluster_command_runner=dm, remote_state_client=remote_state_client
  )


def prepare_blueprint_generator() -> BlueprintGenerator:
  return BlueprintGenerator(storage_path=blueprints_path)


def validate_state_gcs_bucket(args):
  bucket_validate_cmd = (
      f'gcloud storage buckets describe gs://{args.cluster_state_gcs_bucket}'
  )
  err_code, _ = run_command_for_value(
      bucket_validate_cmd,
      'Validate remote state bucket existence.',
      global_args=args,
  )
  if err_code != 0:
    xpk_exit(err_code)


def generate_blueprint(
    blueprint_name, args, prefix=None
) -> BlueprintGeneratorOutput:
  capacity_type, return_code = get_capacity_type(args)
  if return_code != 0:
    xpk_print('Capacity type is invalid.')
    xpk_exit(return_code)

  bpg = prepare_blueprint_generator()

  if args.cluster_state_gcs_bucket is not None:
    validate_state_gcs_bucket(args)

  if args.device_type in supported_device_types:
    if args.device_type == a3mega_device_type:
      num_nodes = args.num_nodes if not args.num_nodes is None else 2
      return bpg.generate_a3_mega_blueprint(
          blueprint_name=blueprint_name,
          prefix=prefix,
          cluster_name=args.cluster,
          region=zone_to_region(args.zone),
          project_id=args.project,
          zone=args.zone,
          auth_cidr=all_IPs_cidr,
          num_nodes=num_nodes,
          reservation=args.reservation if args.reservation else None,
          capacity_type=capacity_type,
          system_node_pool_machine_type=args.default_pool_cpu_machine_type,
          system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
          gcs_bucket=args.cluster_state_gcs_bucket,
      )
    if args.device_type == a3ultra_device_type:
      num_nodes = args.num_nodes if not args.num_nodes is None else 2
      return bpg.generate_a3_ultra_blueprint(
          blueprint_name=blueprint_name,
          prefix=prefix,
          cluster_name=args.cluster,
          region=zone_to_region(args.zone),
          project_id=args.project,
          zone=args.zone,
          auth_cidr=all_IPs_cidr,
          num_nodes=num_nodes,
          reservation=args.reservation if args.reservation else None,
          enable_filestore_csi_driver=args.enable_gcpfilestore_csi_driver,
          capacity_type=capacity_type,
          system_node_pool_machine_type=args.default_pool_cpu_machine_type,
          system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
          gcs_bucket=args.cluster_state_gcs_bucket,
      )
  return None
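A minimal usage sketch of the naming helpers above, assuming the wheel is installed and importable as `xpk`; the project, region, and cluster values are hypothetical, and the exact 5-character hash produced by `hash_string` is not shown in this diff.

```python
from xpk.commands.cluster_gcluster import get_prefix_path, get_unique_name

# Hypothetical inputs; zone_to_region('us-central1-a') is assumed to yield 'us-central1'.
project, region, cluster = 'my-project', 'us-central1', 'demo'

prefix = get_prefix_path(project, region)                 # 'my-project-us-central1'
unique_name = get_unique_name(project, region, cluster)   # 'demo-' + 5-char hash of project/region/cluster

# cluster_create()/cluster_delete() keep blueprints and gcluster state under
# xpkclusters/blueprints/{prefix}/{unique_name}, which FuseStateClient syncs with
# the optional args.cluster_state_gcs_bucket bucket.
print(prefix, unique_name)
```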
xpk/commands/common.py
ADDED
@@ -0,0 +1,44 @@
"""
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from ..core.commands import run_command_with_updates_retry
from ..core.gcloud_context import zone_to_region
from ..utils.console import xpk_print


def set_cluster_command(args) -> int:
  """Run cluster configuration command to set the kubectl config.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  command = (
      'gcloud container clusters get-credentials'
      f' {args.cluster} --region={zone_to_region(args.zone)}'
      f' --project={args.project} &&'
      ' kubectl config view && kubectl config set-context --current'
      ' --namespace=default'
  )
  task = f'get-credentials to cluster {args.cluster}'
  return_code = run_command_with_updates_retry(
      command, task, args, verbose=False
  )
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
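For illustration, this is the shell pipeline `set_cluster_command` assembles for hypothetical arguments (cluster `demo`, zone `us-central1-a`, project `my-project`), assuming `zone_to_region` maps the zone to `us-central1`; nothing is executed here, the string is simply what `run_command_with_updates_retry` would receive.

```python
# Composed command string for the hypothetical arguments above.
command = (
    'gcloud container clusters get-credentials demo --region=us-central1'
    ' --project=my-project && kubectl config view'
    ' && kubectl config set-context --current --namespace=default'
)
print(command)
```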
xpk/commands/config.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..core.config import XpkConfig
|
|
18
|
+
from ..utils.console import xpk_print
|
|
19
|
+
|
|
20
|
+
xpk_cfg = XpkConfig()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def set_config(args):
|
|
24
|
+
xpk_cfg.set(args.set_config_args[0], args.set_config_args[1])
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_config(args):
|
|
28
|
+
value = xpk_cfg.get(args.get_config_key[0])
|
|
29
|
+
xpk_print(value)
|
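A minimal sketch of how these handlers consume parsed arguments, assuming the wheel is installed; the key and value are hypothetical, and where `XpkConfig` persists them is defined in `xpk/core/config.py`, which is not shown in this section.

```python
from argparse import Namespace

from xpk.commands.config import get_config, set_config

# set_config reads a positional [key, value] pair; get_config reads a single-element key list.
set_config(Namespace(set_config_args=['default-zone', 'us-central1-a']))
get_config(Namespace(get_config_key=['default-zone']))  # prints the stored value via xpk_print
```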
xpk/commands/info.py
ADDED
@@ -0,0 +1,243 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import json
from argparse import Namespace

from tabulate import tabulate

from ..core.commands import run_command_for_value
from ..core.gcloud_context import add_zone_and_project
from ..core.kueue import verify_kueuectl
from ..utils.console import xpk_exit, xpk_print
from .common import set_cluster_command

table_fmt = 'plain'


def info(args: Namespace) -> None:
  """Provide info about localqueues, clusterqueues and their resources.

  Args:
    args: user provided arguments for running the command.
  Returns:
    None
  """
  add_zone_and_project(args)
  set_cluster_command_code = set_cluster_command(args)
  if set_cluster_command_code != 0:
    xpk_exit(set_cluster_command_code)

  verify_kueuectl(args)
  lq, cq = bool(args.localqueue), bool(args.clusterqueue)
  if not lq and not cq:
    lq, cq = True, True

  lqs, cqs = None, None
  if lq:
    lqs = run_kueuectl_list_localqueue(args)

  cqs = run_kueuectl_list_clusterqueue(args)
  quotas = get_nominal_quotas(cqs)

  if lq:
    print_formatted_lqs(lqs, quotas)

  if cq:
    print_formatted_cqs(cqs, quotas)


def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
  """Get quotas from clusterqueues.
  This function retrieves how much of resource in each flavor is assigned to cluster queue.
  It parses flavors of passed cluster queues.
  Args:
    - cqs - list of cluster queues.
  Returns:
    - dictionary of cluster queues resources quotas in format:
      {cq_name:{"flavorName:resourceName":quota}}
  """
  try:
    cq_list = json.loads(cqs)['items']
  except ValueError:
    xpk_print('Incorrect respone from list clusterqueue')
    xpk_print(cqs)
    xpk_exit(1)

  quotas = {}
  for cq in cq_list:
    spec = cq['spec']
    cq_name = cq['metadata']['name']
    quotas[cq_name] = {}
    for rg in spec['resourceGroups']:
      for flavor in rg['flavors']:
        name = flavor['name']
        for resource in flavor['resources']:
          key = f'{name}:{resource["name"]}'
          quotas[cq_name][key] = resource['nominalQuota']
  return quotas


def print_formatted_cqs(cqs: list[dict], nominalQuotas) -> None:
  try:
    cq_list = json.loads(cqs)['items']
  except ValueError:
    xpk_print('Incorrect respone from list clusterqueue')
    xpk_print(cqs)
    xpk_exit(1)

  cq_usages = parse_queue_lists(cq_list, nominalQuotas)

  xpk_print(
      'Cluster Queues usage \n',
      tabulate(cq_usages, headers='keys', tablefmt=table_fmt),
  )


def print_formatted_lqs(lqs: list[dict], nominalQuotas) -> None:
  try:
    lq_list = json.loads(lqs)['items']
  except ValueError:
    xpk_print('Incorrect respone from list localqueue')
    xpk_print(lqs)
    xpk_exit(1)

  lq_usages = parse_queue_lists(lq_list, nominalQuotas)
  xpk_print(
      'Local Queues usage \n',
      tabulate(lq_usages, headers='keys', tablefmt=table_fmt),
  )


def parse_queue_lists(
    qs: list[dict],
    flavor_resource_quotas: dict,
    reservation_key: str = 'flavorsReservation',
) -> list[dict]:
  qs_usage_list = []
  for q in qs:
    queue_name = q['metadata']['name']
    q_pending_workloads = q['status']['pendingWorkloads']
    q_admitted_workloads = q['status']['admittedWorkloads']
    q_status = {
        'QUEUE': queue_name,
        'ADMITTED_WORKLOADS': q_admitted_workloads,
        'PENDING_WORKLOADS': q_pending_workloads,
    }
    q_status.update(
        get_flavors_usage(q, reservation_key, flavor_resource_quotas)
    )
    qs_usage_list.append(q_status)
  return qs_usage_list


def get_flavors_resources_reservations(
    cq_name: str, flavors_res: list[dict]
) -> dict[str, dict[str, str]]:
  """Get usage of flavors resources.
  This function parser flavorsReservation section of clusterQueue of LocalQueue.
  Args:
    - cq_name - name of ClusterQueue to which flavors belong.
    - flavors_res - list of reservations made by flavors
  Returns:
    Dict containing usage of each resource in flavor for each flavor in cluster or local queue.
    Dict format: {cq_name: {{flavor:resource}:reservation}}
  """
  reservations = {}
  reservations[cq_name] = {}
  for flavor_name, flavor_resources_reservation_list in flavors_res.items():
    for resource in flavor_resources_reservation_list:
      reservations[cq_name][f'{flavor_name}:{resource["name"]}'] = resource[
          'total'
      ]

  return reservations


def get_flavors_usage(
    q_entry: dict, res_field: str, flavor_resource_quotas: dict
) -> list[dict]:
  """Parse q_entry to retrieve list of each resource usage in flavour.
  Args:
    q_entry - single entry into either LocalQueue or ClusterQueue structured as json
    flavor_resource_quotas - nominalQuota of flavors resource usage for each clusterqueue
  Returns:
    list of dicts where each list entry is in format (key, entry) where:
    - key is flavorName:resourceName
    - entry is flavorResourceReservation/flavorResourceQuota
  """
  status = q_entry['status']
  flavors_res = status[res_field]
  queue_type = q_entry['kind']

  flavors_res = {flavor['name']: flavor['resources'] for flavor in flavors_res}
  usage_fraction = {}
  cq_name = (
      q_entry['metadata']['name']
      if queue_type == 'ClusterQueue'
      else q_entry['spec']['clusterQueue']
  )

  reservations = get_flavors_resources_reservations(cq_name, flavors_res)

  for cq_name, cq_reservations in reservations.items():
    cq_nominal_quotas = flavor_resource_quotas[cq_name]

    for flavor_resource, flavor_resource_quota in cq_nominal_quotas.items():
      flavor_resource_reservation = cq_reservations[flavor_resource]
      usage_fraction[flavor_resource] = (
          f'{flavor_resource_reservation}/{flavor_resource_quota}'
      )
  return usage_fraction


def run_kueuectl_list_localqueue(args: Namespace) -> str:
  """Run the kueuectl list localqueue command.

  Args:
    args: user provided arguments for running the command.

  Returns:
    kueuectl list localqueue formatted as json string.
  """
  command = 'kubectl kueue list localqueue -o json'
  if args.namespace != '':
    command += f' --namespace {args.namespace}'
  return_code, val = run_command_for_value(command, 'list localqueue', args)

  if return_code != 0:
    xpk_print(f'Cluster info request returned ERROR {return_code}')
    xpk_exit(return_code)
  return val


def run_kueuectl_list_clusterqueue(args: Namespace) -> str:
  """Run the kueuectl list clusterqueue command.

  Args:
    args: user provided arguments for running the command.

  Returns:
    kueuectl list clusterqueue formatted as json string
  """
  command = 'kubectl kueue list clusterqueue -o json'

  return_code, val = run_command_for_value(command, 'list clusterqueue', args)

  if return_code != 0:
    xpk_print(f'Cluster info request returned ERROR {return_code}')
    xpk_exit(return_code)
  return val
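A small worked example of the quota map `get_nominal_quotas` builds from the JSON emitted by `kubectl kueue list clusterqueue -o json`, assuming the wheel and its `tabulate` dependency are installed; the queue, flavor, and quota values below are invented for illustration.

```python
import json

from xpk.commands.info import get_nominal_quotas

# A minimal stand-in for the kueuectl clusterqueue listing.
sample_cqs = json.dumps({
    'items': [{
        'kind': 'ClusterQueue',
        'metadata': {'name': 'cluster-queue'},
        'spec': {
            'resourceGroups': [{
                'flavors': [{
                    'name': 'on-demand',
                    'resources': [
                        {'name': 'cpu', 'nominalQuota': '120'},
                        {'name': 'memory', 'nominalQuota': '480Gi'},
                    ],
                }],
            }],
        },
    }],
})

# -> {'cluster-queue': {'on-demand:cpu': '120', 'on-demand:memory': '480Gi'}}
print(get_nominal_quotas(sample_cqs))
```

`get_flavors_usage` then pairs each `flavor:resource` reservation from the queue's `flavorsReservation` status with these quotas and reports it as `reservation/quota` in the tabulated output.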