xpk 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +131 -0
- xpk/commands/cluster.py +808 -0
- xpk/commands/cluster_gcluster.py +269 -0
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +243 -0
- xpk/commands/inspector.py +357 -0
- xpk/commands/job.py +199 -0
- xpk/commands/kind.py +283 -0
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +140 -0
- xpk/commands/storage.py +267 -0
- xpk/commands/version.py +27 -0
- xpk/commands/workload.py +889 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +62 -0
- xpk/core/blueprint/blueprint_generator.py +708 -0
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +200 -0
- xpk/core/commands.py +356 -0
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +176 -0
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +444 -0
- xpk/core/kueue.py +358 -0
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +361 -0
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +377 -0
- xpk/core/ray.py +222 -0
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +1432 -0
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +341 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +129 -0
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
- xpk/main.py +75 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +43 -0
- xpk/parser/cluster.py +662 -0
- xpk/parser/common.py +259 -0
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +135 -0
- xpk/parser/info.py +64 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +147 -0
- xpk/parser/kind.py +95 -0
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +59 -0
- xpk/parser/storage.py +316 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +726 -0
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +88 -0
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- xpk/utils/yaml.py +30 -0
- {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/METADATA +456 -32
- xpk-0.7.0.dist-info/RECORD +92 -0
- {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/WHEEL +1 -1
- xpk-0.7.0.dist-info/entry_points.txt +2 -0
- xpk-0.5.0.dist-info/RECORD +0 -7
- xpk-0.5.0.dist-info/entry_points.txt +0 -2
- xpk.py +0 -7282
- {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/LICENSE +0 -0
- {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue.py
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from argparse import Namespace
|
|
18
|
+
|
|
19
|
+
import packaging
|
|
20
|
+
from packaging.version import Version
|
|
21
|
+
|
|
22
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
23
|
+
from ..utils.file import write_tmp_file
|
|
24
|
+
from .commands import (
|
|
25
|
+
run_command_for_value,
|
|
26
|
+
run_command_with_updates,
|
|
27
|
+
run_command_with_updates_retry,
|
|
28
|
+
)
|
|
29
|
+
from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
|
|
30
|
+
from .resources import AutoprovisioningConfig
|
|
31
|
+
from .scheduling import (
|
|
32
|
+
create_accelerator_label,
|
|
33
|
+
create_machine_label,
|
|
34
|
+
get_total_chips_requested_from_args,
|
|
35
|
+
)
|
|
36
|
+
from .system_characteristics import (
|
|
37
|
+
AcceleratorTypeToAcceleratorCharacteristics,
|
|
38
|
+
SystemCharacteristics,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Kueue release tag: selects the manifests URL applied by
# install_kueue_on_cluster and is compared against v0.9.0 there.
KUEUE_VERSION = 'v0.10.0'
# Names substituted into cluster_set_crd_yaml for the ClusterQueue and the
# LocalQueue (the LocalQueue also references the ClusterQueue by name).
CLUSTER_QUEUE_NAME = 'cluster-queue'
LOCAL_QUEUE_NAME = 'multislice-queue'
# Value passed as --timeout to `kubectl wait` in wait_for_kueue_available.
WAIT_FOR_KUEUE_TIMEOUT = '5m'

# Rebinds packaging's module-level VERSION_PATTERN to a strict 'vX.Y.Z' form.
# NOTE(review): confirm that Version() reads this attribute at call time; if
# packaging compiles its regex at import, this assignment does not change how
# the Version(...) comparisons in this module parse their input.
packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'
|
|
47
|
+
|
|
48
|
+
cluster_set_crd_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
|
|
49
|
+
kind: ResourceFlavor
|
|
50
|
+
metadata:
|
|
51
|
+
name: {cluster_hardware_name}
|
|
52
|
+
spec:
|
|
53
|
+
nodeLabels:
|
|
54
|
+
{accelerator_label}
|
|
55
|
+
{machine_label}
|
|
56
|
+
---
|
|
57
|
+
{pw_resource_flavors}
|
|
58
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
59
|
+
kind: ClusterQueue
|
|
60
|
+
metadata:
|
|
61
|
+
name: {cluster_queue_name}
|
|
62
|
+
spec:
|
|
63
|
+
preemption:
|
|
64
|
+
reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
|
|
65
|
+
withinClusterQueue: LowerPriority
|
|
66
|
+
namespaceSelector: {{}} # match all.
|
|
67
|
+
resourceGroups:
|
|
68
|
+
{covered_resources_config}
|
|
69
|
+
{pw_resources_kueue}
|
|
70
|
+
---
|
|
71
|
+
apiVersion: kueue.x-k8s.io/v1beta1
|
|
72
|
+
kind: LocalQueue
|
|
73
|
+
metadata:
|
|
74
|
+
namespace: default
|
|
75
|
+
name: {local_queue_name}
|
|
76
|
+
spec:
|
|
77
|
+
clusterQueue: {cluster_queue_name}
|
|
78
|
+
---
|
|
79
|
+
apiVersion: scheduling.k8s.io/v1
|
|
80
|
+
kind: PriorityClass
|
|
81
|
+
metadata:
|
|
82
|
+
name: very-low
|
|
83
|
+
value: 100
|
|
84
|
+
globalDefault: false
|
|
85
|
+
description: "Very Low"
|
|
86
|
+
---
|
|
87
|
+
apiVersion: scheduling.k8s.io/v1
|
|
88
|
+
kind: PriorityClass
|
|
89
|
+
metadata:
|
|
90
|
+
name: low
|
|
91
|
+
value: 250
|
|
92
|
+
globalDefault: false
|
|
93
|
+
description: "Low"
|
|
94
|
+
---
|
|
95
|
+
apiVersion: scheduling.k8s.io/v1
|
|
96
|
+
kind: PriorityClass
|
|
97
|
+
metadata:
|
|
98
|
+
name: medium
|
|
99
|
+
value: 500
|
|
100
|
+
globalDefault: false
|
|
101
|
+
description: "Medium"
|
|
102
|
+
---
|
|
103
|
+
apiVersion: scheduling.k8s.io/v1
|
|
104
|
+
kind: PriorityClass
|
|
105
|
+
metadata:
|
|
106
|
+
name: high
|
|
107
|
+
value: 750
|
|
108
|
+
globalDefault: false
|
|
109
|
+
description: "High"
|
|
110
|
+
---
|
|
111
|
+
apiVersion: scheduling.k8s.io/v1
|
|
112
|
+
kind: PriorityClass
|
|
113
|
+
metadata:
|
|
114
|
+
name: very-high
|
|
115
|
+
value: 1000
|
|
116
|
+
globalDefault: false
|
|
117
|
+
description: "Very High"
|
|
118
|
+
"""
|
|
119
|
+
|
|
120
|
+
cluster_preheat_yml = """
|
|
121
|
+
apiVersion: apps/v1
|
|
122
|
+
kind: DaemonSet
|
|
123
|
+
metadata:
|
|
124
|
+
name: {cachekey}
|
|
125
|
+
labels:
|
|
126
|
+
k8s-app: {cachekey}
|
|
127
|
+
spec:
|
|
128
|
+
selector:
|
|
129
|
+
matchLabels:
|
|
130
|
+
k8s-app: {cachekey}
|
|
131
|
+
updateStrategy:
|
|
132
|
+
type: RollingUpdate
|
|
133
|
+
template:
|
|
134
|
+
metadata:
|
|
135
|
+
labels:
|
|
136
|
+
name: {cachekey}
|
|
137
|
+
k8s-app: {cachekey}
|
|
138
|
+
spec:
|
|
139
|
+
affinity:
|
|
140
|
+
nodeAffinity:
|
|
141
|
+
requiredDuringSchedulingIgnoredDuringExecution:
|
|
142
|
+
nodeSelectorTerms:
|
|
143
|
+
- matchExpressions:
|
|
144
|
+
- key: {nodeSelectorKey}
|
|
145
|
+
operator: Exists
|
|
146
|
+
tolerations:
|
|
147
|
+
- operator: "Exists"
|
|
148
|
+
containers:
|
|
149
|
+
- image: {image_name}
|
|
150
|
+
name: {cachekey}
|
|
151
|
+
command: [ "sleep", "inf" ]
|
|
152
|
+
"""
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def verify_kueuectl(args: Namespace) -> None:
  """Verify that the kueuectl kubectl plugin is installed.

  Runs `kubectl kueue version`. When the command fails, prints installation
  instructions and calls xpk_exit with the command's return code.

  Args:
    args: user provided arguments.

  Returns:
    None
  """
  xpk_print('Verifying kueuectl installation')

  command = 'kubectl kueue version'
  task = 'Verify kueuectl installation on cluster'
  verify_kueuectl_installed_code, _ = run_command_for_value(command, task, args)

  if verify_kueuectl_installed_code == 0:
    xpk_print('kueuectl found')
    return

  # Any non-zero return code is treated as "plugin not installed".
  xpk_print(
      'kueuectl not found. Please follow'
      ' https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/'
      ' to install kueuectl.'
  )
  xpk_exit(verify_kueuectl_installed_code)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def delete_multikueueconfigs_definitions(args) -> int:
  """Delete the multikueueconfigs.kueue.x-k8s.io CRD from the cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Return code of the kubectl command; non-zero is logged as an error.
  """
  # --ignore-not-found keeps the deletion idempotent: an already-absent CRD
  # (e.g. when re-running an interrupted upgrade) is not reported as a failure.
  command = (
      'kubectl delete crd multikueueconfigs.kueue.x-k8s.io'
      ' --ignore-not-found'
  )
  task = 'Delete multikueueconfigs crds'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def delete_multikueueclusters_definitions(args) -> int:
  """Delete the multikueueclusters.kueue.x-k8s.io CRD from the cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Return code of the kubectl command; non-zero is logged as an error.
  """
  # --ignore-not-found keeps the deletion idempotent: an already-absent CRD
  # (e.g. when re-running an interrupted upgrade) is not reported as a failure.
  command = (
      'kubectl delete crd multikueueclusters.kueue.x-k8s.io'
      ' --ignore-not-found'
  )
  task = 'Delete multikueueclusters crds'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def get_kueue_version(args) -> tuple[int, str]:
  """Read the Kueue version from the output of `kubectl kueue version`.

  The version is taken from the second output line (handled here as the
  server version line): everything after that line's last ':'.

  Args:
    args: user provided arguments for running the command.

  Returns:
    A (return_code, version) pair. The version is '' when the command fails
    or when its output has fewer than two lines; in the latter case the
    return code is 1.
  """
  command = 'kubectl kueue version'
  task = 'Get kueue version on server'
  return_code, val = run_command_for_value(command, task, args)
  if return_code != 0:
    return return_code, ''
  lines = val.splitlines()
  # Without a second line there is nothing to parse; this also covers empty
  # output, which would otherwise raise IndexError on lines[1].
  if len(lines) < 2:
    return 1, ''
  server_version_line = lines[1]
  manager_image_version = server_version_line.split(':')[-1].strip()
  return return_code, manager_image_version
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def install_kueue_on_cluster(args) -> int:
  """Install Kueue on the cluster.

  When an installed Kueue older than v0.9.0 is detected and KUEUE_VERSION is
  v0.9.0 or newer, the multikueueclusters and multikueueconfigs CRDs are
  deleted first. The KUEUE_VERSION release manifests are then applied
  server-side.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful, otherwise the non-zero return code of the failing step.
  """
  err_code, kueue_version_installed = get_kueue_version(args)
  if err_code == 0:
    try:
      upgrading_from_pre_0_9 = Version(kueue_version_installed) < Version(
          'v0.9.0'
      ) and Version(KUEUE_VERSION) >= Version('v0.9.0')
    except packaging.version.InvalidVersion:
      # The reported tag is not a parsable version; do not crash the install,
      # just skip the cleanup that only applies to a known pre-0.9.0 version.
      xpk_print(
          'Could not parse installed Kueue version'
          f' "{kueue_version_installed}". Skipping pre-0.9.0 upgrade cleanup.'
      )
      upgrading_from_pre_0_9 = False

    if upgrading_from_pre_0_9:
      xpk_print('Upgrading kueue on cluster from version < 0.9.0.')
      upgrade_code = delete_multikueueclusters_definitions(args)
      if upgrade_code != 0:
        return upgrade_code
      upgrade_code = delete_multikueueconfigs_definitions(args)
      if upgrade_code != 0:
        return upgrade_code

  command = (
      'kubectl apply --server-side --force-conflicts -f'
      f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml'
  )
  task = 'Set Kueue On Cluster'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def wait_for_kueue_available(args: Namespace) -> int:
  """Run `kubectl wait` on the Kueue controller manager deployment.

  Waits for the `available` condition of deploy/kueue-controller-manager in
  the kueue-system namespace, bounded by WAIT_FOR_KUEUE_TIMEOUT.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Return code of the kubectl command; non-zero is logged as an error.
  """
  task = 'Wait for Kueue to be available'
  wait_command = (
      'kubectl wait deploy/kueue-controller-manager -nkueue-system'
      f' --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}'
  )
  exit_status = run_command_with_updates(wait_command, task, args)
  if exit_status == 0:
    return exit_status
  xpk_print(f'{task} returned ERROR {exit_status}')
  return exit_status
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def install_kueue_crs(
    args,
    system: SystemCharacteristics,
    autoprovisioning_config: AutoprovisioningConfig | None,
) -> int:
  """Render the Kueue custom resources manifest and apply it with kubectl.

  Fills cluster_set_crd_yaml with the flavor name, node labels, quota
  configuration, Pathways additions and queue names, writes the result to a
  temporary file and runs `kubectl apply -f` on it.

  Args:
    args: user provided arguments for running the command.
    system: system level arguments.
    autoprovisioning_config: Autoprovisioning config to configure kueue with if
      autoprovisioning is enabled.

  Returns:
    Return code of the kubectl apply command; non-zero is logged as an error.
  """
  accelerator_type = system.accelerator_type
  # Default flavor name: "<num_slices>x<device_type>".
  flavor_name = f'{args.num_slices}x{system.device_type}'
  resource_type = AcceleratorTypeToAcceleratorCharacteristics[
      accelerator_type
  ].resource_type

  autoprovisioning_enabled = bool(autoprovisioning_config)
  if autoprovisioning_enabled:
    # Quota comes from the autoprovisioning maximum, and the flavor is named
    # after the GKE accelerator instead of the slice layout.
    total_chips = autoprovisioning_config.maximum_chips
    flavor_name = f'{system.gke_accelerator}'
  else:
    # Quota comes from the topology the user asked for.
    total_chips = get_total_chips_requested_from_args(args, system)

  template_fields = {
      'covered_resources_config': get_kueue_covered_resources_config(
          cluster_hardware_name=flavor_name,
          resource_type=resource_type,
          total_chips=total_chips,
      ),
      'cluster_hardware_name': flavor_name,
      'accelerator_label': create_accelerator_label(accelerator_type, system),
      'machine_label': create_machine_label(
          accelerator_type, system, autoprovisioning_enabled
      ),
      'pw_resource_flavors': add_pw_resource_flavors(args),
      'pw_resources_kueue': add_pw_resources_to_kueue(args),
      'cluster_queue_name': CLUSTER_QUEUE_NAME,
      'local_queue_name': LOCAL_QUEUE_NAME,
  }
  manifest = cluster_set_crd_yaml.format(**template_fields)

  # Keep the returned object bound to a name while kubectl runs.
  # NOTE(review): presumably the temp file lives only as long as this object;
  # confirm against write_tmp_file.
  tmp = write_tmp_file(manifest)
  manifest_path = str(tmp.file.name)

  task = 'Applying Kueue Custom Resources'
  apply_command = f'kubectl apply -f {manifest_path}'
  exit_status = run_command_with_updates_retry(apply_command, task, args)
  if exit_status != 0:
    xpk_print(f'{task} returned ERROR {exit_status}')
  return exit_status
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def get_kueue_covered_resources_config(
|
|
333
|
+
cluster_hardware_name, resource_type, total_chips
|
|
334
|
+
) -> str:
|
|
335
|
+
"""Gets Kueue covered resources configuration.
|
|
336
|
+
|
|
337
|
+
Args:
|
|
338
|
+
cluster_hardware_name: cluster hardware name.
|
|
339
|
+
resource_type: resource type of tpu or gpu.
|
|
340
|
+
total_chips: total number of chips for the specific resource type.
|
|
341
|
+
|
|
342
|
+
Returns:
|
|
343
|
+
A string of Kueue covered resources configuration.
|
|
344
|
+
"""
|
|
345
|
+
config_format = """
|
|
346
|
+
- coveredResources: ["{resource_type}"]
|
|
347
|
+
flavors:
|
|
348
|
+
- name: {cluster_hardware_name}
|
|
349
|
+
resources:
|
|
350
|
+
- name: "{resource_type}"
|
|
351
|
+
nominalQuota: {total_chips}
|
|
352
|
+
"""
|
|
353
|
+
config_string = config_format.format(
|
|
354
|
+
cluster_hardware_name=cluster_hardware_name,
|
|
355
|
+
resource_type=resource_type,
|
|
356
|
+
total_chips=total_chips,
|
|
357
|
+
)
|
|
358
|
+
return config_string
|
xpk/core/monitoring.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..utils.console import xpk_print
|
|
18
|
+
from .commands import run_command_for_value
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_gke_dashboard(args, dashboard_filter) -> tuple[bool, str | None]:
  """Get the identifier of GKE dashboard deployed in the project.

  Args:
    args: user provided arguments for running the command.
    dashboard_filter: expression passed to the --filter flag of
      'gcloud monitoring dashboards list'.

  Returns:
    bool:
      True if 'gcloud monitoring dashboards list' returned an error or
      multiple dashboards with same filter exist in the project,
      False otherwise.
    str:
      identifier of dashboard if deployed in project (the last '/'-separated
      segment of the listed name),
      None otherwise.
  """
  command = (
      'gcloud monitoring dashboards list'
      f' --project={args.project} --filter="{dashboard_filter}"'
      ' --format="value(name)" --verbosity=error'
  )

  return_code, return_value = run_command_for_value(
      command, 'GKE Dashboard List', args
  )

  if return_code != 0:
    xpk_print(
        f'GKE Dashboard List request returned ERROR {return_code}. If there is'
        ' a permissions error, please check'
        ' https://github.com/google/xpk/blob/main/README.md#roles-needed-based-on-permission-errors'
        ' for possible solutions.'
    )
    return True, None

  # One dashboard name per non-blank output line. Blank or whitespace-only
  # output therefore counts as "no dashboard" instead of a silent error.
  dashboards = [
      line.strip() for line in (return_value or '').splitlines() if line.strip()
  ]

  if not dashboards:
    xpk_print(
        f'No dashboard with {dashboard_filter} found in the'
        f' project:{args.project}.'
    )
    return False, None

  if len(dashboards) > 1:
    xpk_print(
        f'Multiple dashboards with same {dashboard_filter} exist in the'
        f' project:{args.project}. Delete all but one dashboard deployed using'
        ' https://github.com/google/cloud-tpu-monitoring-debugging.'
    )
    return True, None

  return False, dashboards[0].split('/')[-1]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_gke_outlier_dashboard(args) -> str | None:
  """Look up the 'GKE - TPU Monitoring Dashboard' in the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str:
      identifier of outlier dashboard if deployed in project,
      None otherwise.
  """
  lookup_failed, found_id = get_gke_dashboard(
      args, "displayName:'GKE - TPU Monitoring Dashboard'"
  )

  # The listing failed or matched several dashboards; get_gke_dashboard has
  # already printed the reason.
  if lookup_failed:
    return None

  if found_id:
    return str(found_id)

  # The listing succeeded but nothing matched: point at the deployment guide.
  xpk_print(
      'Follow https://github.com/google/cloud-tpu-monitoring-debugging to'
      ' deploy monitoring dashboard to view statistics and outlier mode of'
      ' GKE metrics.'
  )
  return None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_gke_debugging_dashboard(args) -> str | None:
  """Look up the 'GKE - TPU Logging Dashboard' in the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str:
      identifier of debugging dashboard if deployed in project,
      None otherwise.
  """
  lookup_failed, found_id = get_gke_dashboard(
      args, "displayName:'GKE - TPU Logging Dashboard'"
  )

  # The listing failed or matched several dashboards; get_gke_dashboard has
  # already printed the reason.
  if lookup_failed:
    return None

  if found_id:
    return str(found_id)

  # The listing succeeded but nothing matched: point at the deployment guide.
  xpk_print(
      'Follow https://github.com/google/cloud-tpu-monitoring-debugging to'
      ' deploy debugging dashboard to view stack traces collected in Cloud'
      ' Logging.'
  )
  return None
|