xpk 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +109 -0
- xpk/commands/cluster.py +784 -0
- xpk/commands/cluster_gcluster.py +185 -0
- xpk/commands/info.py +245 -0
- xpk/commands/inspector.py +363 -0
- xpk/commands/job.py +197 -0
- xpk/commands/kind.py +253 -0
- xpk/commands/shell.py +120 -0
- xpk/commands/version.py +39 -0
- xpk/commands/workload.py +692 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +61 -0
- xpk/core/blueprint/blueprint_generator.py +652 -0
- xpk/core/cluster_private.py +197 -0
- xpk/core/commands.py +352 -0
- xpk/core/core.py +2824 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/gcluster_manager.py +158 -0
- xpk/core/kjob.py +205 -0
- xpk/core/kueue.py +352 -0
- xpk/core/nap.py +349 -0
- xpk/core/pathways.py +298 -0
- xpk/core/ray.py +222 -0
- xpk/core/system_characteristics.py +1395 -0
- xpk/core/workload.py +133 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +109 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
- xpk/main.py +73 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +184 -0
- xpk/parser/cluster.py +621 -0
- xpk/parser/common.py +71 -0
- xpk/parser/core.py +109 -0
- xpk/parser/info.py +63 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +126 -0
- xpk/parser/kind.py +94 -0
- xpk/parser/shell.py +50 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +684 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +85 -0
- xpk/utils/yaml.py +30 -0
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/METADATA +301 -28
- xpk-0.6.0.dist-info/RECORD +57 -0
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
- xpk-0.6.0.dist-info/entry_points.txt +2 -0
- xpk-0.5.0.dist-info/RECORD +0 -7
- xpk-0.5.0.dist-info/entry_points.txt +0 -2
- xpk.py +0 -7282
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue.py
ADDED
@@ -0,0 +1,352 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from argparse import Namespace
from packaging.version import Version
import packaging
from ..utils.file import write_tmp_file
from ..utils.console import xpk_print, xpk_exit
from .commands import run_command_with_updates, run_command_with_updates_retry, run_command_for_value
from .core import (
    AutoprovisioningConfig,
    create_accelerator_label,
    create_machine_label,
    get_total_chips_requested_from_args,
)
from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
from .system_characteristics import (
    AcceleratorTypeToAcceleratorCharacteristics,
    SystemCharacteristics,
)

KUEUE_VERSION = 'v0.10.0'
CLUSTER_QUEUE_NAME = 'cluster-queue'
LOCAL_QUEUE_NAME = 'multislice-queue'
WAIT_FOR_KUEUE_TIMEOUT = '5m'

packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'

cluster_set_crd_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: {cluster_hardware_name}
spec:
  nodeLabels:
    {accelerator_label}
    {machine_label}
---
{pw_resource_flavors}
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: {cluster_queue_name}
spec:
  preemption:
    reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
    withinClusterQueue: LowerPriority
  namespaceSelector: {{}} # match all.
  resourceGroups:
  {covered_resources_config}
  {pw_resources_kueue}
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: default
  name: {local_queue_name}
spec:
  clusterQueue: {cluster_queue_name}
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-low
value: 100
globalDefault: false
description: "Very Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: low
value: 250
globalDefault: false
description: "Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: medium
value: 500
globalDefault: false
description: "Medium"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high
value: 750
globalDefault: false
description: "High"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-high
value: 1000
globalDefault: false
description: "Very High"
"""

cluster_preheat_yml = """
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {cachekey}
  labels:
    k8s-app: {cachekey}
spec:
  selector:
    matchLabels:
      k8s-app: {cachekey}
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: {cachekey}
        k8s-app: {cachekey}
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: {nodeSelectorKey}
                operator: Exists
      tolerations:
      - operator: "Exists"
      containers:
      - image: {image_name}
        name: {cachekey}
        command: [ "sleep", "inf" ]
"""


def verify_kueuectl(args: Namespace) -> None:
  """Verify if kueuectl is installed.
  Args:
    args: user provided arguments.
  Returns:
    None
  """
  xpk_print('Veryfing kueuectl installation')

  command = 'kubectl kueue version'
  task = 'Verify kueuectl installation on cluster'
  verify_kueuectl_installed_code, _ = run_command_for_value(command, task, args)

  if verify_kueuectl_installed_code == 0:
    xpk_print('kueuectl found')

  if verify_kueuectl_installed_code != 0:
    xpk_print(
        'kueuectl not found. Please follow'
        ' https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/'
        ' to install kueuectl.'
    )
    xpk_exit(verify_kueuectl_installed_code)


def delete_multikueueconfigs_definitions(args) -> int:
  command = 'kubectl delete crd multikueueconfigs.kueue.x-k8s.io'
  task = 'Delete multikueueconfigs crds'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code


def delete_multikueueclusters_definitions(args) -> int:
  command = 'kubectl delete crd multikueueclusters.kueue.x-k8s.io'
  task = 'Delete multikueueclusters crds'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code


def get_kueue_version(args) -> (int, str):
  command = 'kubectl kueue version'
  task = 'Get kueue version on server'
  return_code, val = run_command_for_value(command, task, args)
  if return_code != 0:
    return return_code, ''
  lines = val.splitlines()
  if len(lines) == 1:
    return 1, ''
  server_version_line = lines[1]
  manager_image_version = server_version_line.split(':')[-1]
  return return_code, manager_image_version


def install_kueue_on_cluster(args) -> int:
  """Install Kueue on the cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """

  err_code, kueue_version_installed = get_kueue_version(args)
  if err_code == 0:
    if Version(kueue_version_installed) < Version('v0.9.0') and Version(
        KUEUE_VERSION
    ) >= Version('v0.9.0'):
      xpk_print('Upgrading kueue on cluster from version < 0.9.0.')
      upgrade_code = delete_multikueueclusters_definitions(args)
      if upgrade_code != 0:
        return upgrade_code
      upgrade_code = delete_multikueueconfigs_definitions(args)
      if upgrade_code != 0:
        return upgrade_code

  command = (
      'kubectl apply --server-side --force-conflicts -f'
      f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml'
  )
  task = 'Set Kueue On Cluster'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code


def wait_for_kueue_available(args: Namespace) -> int:
  """Wait for Kueue to be fully available.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  command = (
      'kubectl wait deploy/kueue-controller-manager -nkueue-system'
      f' --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}'
  )
  task = 'Wait for Kueue to be available'
  return_code = run_command_with_updates(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code


def install_kueue_crs(
    args,
    system: SystemCharacteristics,
    autoprovisioning_config: AutoprovisioningConfig | None,
) -> int:
  """Install Kueue Custom Resources.

  Args:
    args: user provided arguments for running the command.
    system: system level arguments.
    autoprovisioning_config: Autoprovisioning config to configure kueue with if
      autoprovisioning is enabled.

  Returns:
    0 if successful and 1 otherwise.
  """
  device_type = system.device_type
  cluster_hardware_name = f'{args.num_slices}x{device_type}'
  resource_type = AcceleratorTypeToAcceleratorCharacteristics[
      system.accelerator_type
  ].resource_type

  autoprovisioning_enabled = False
  if autoprovisioning_config:
    # Determine total resources available based on autoprovisioning max chips.
    autoprovisioning_enabled = True
    total_chips = autoprovisioning_config.maximum_chips
    cluster_hardware_name = f'{system.gke_accelerator}'
  else:
    # Determine total chips based on user specified topology.
    total_chips = get_total_chips_requested_from_args(args, system)

  covered_resources_config = get_kueue_covered_resources_config(
      cluster_hardware_name=cluster_hardware_name,
      resource_type=resource_type,
      total_chips=total_chips,
  )
  yml_string = cluster_set_crd_yaml.format(
      system=system,
      cluster_hardware_name=cluster_hardware_name,
      accelerator_label=create_accelerator_label(
          system.accelerator_type, system
      ),
      machine_label=create_machine_label(
          system.accelerator_type, system, autoprovisioning_enabled
      ),
      covered_resources_config=covered_resources_config,
      resource_type=AcceleratorTypeToAcceleratorCharacteristics[
          system.accelerator_type
      ].resource_type,
      pw_resource_flavors=add_pw_resource_flavors(args),
      pw_resources_kueue=add_pw_resources_to_kueue(args),
      cluster_queue_name=CLUSTER_QUEUE_NAME,
      local_queue_name=LOCAL_QUEUE_NAME,
  )

  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {str(tmp.file.name)}'

  task = 'Applying Kueue Custom Resources'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code


def get_kueue_covered_resources_config(
    cluster_hardware_name, resource_type, total_chips
) -> str:
  """Gets Kueue covered resources configuration.

  Args:
    cluster_hardware_name: cluster hardware name.
    resource_type: resource type of tpu or gpu.
    total_chips: total number of chips for the specific resource type.

  Returns:
    A string of Kueue covered resources configuration.
  """
  config_format = """
  - coveredResources: ["{resource_type}"]
    flavors:
    - name: {cluster_hardware_name}
      resources:
      - name: "{resource_type}"
        nominalQuota: {total_chips}
  """
  config_string = config_format.format(
      cluster_hardware_name=cluster_hardware_name,
      resource_type=resource_type,
      total_chips=total_chips,
  )
  return config_string
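
For reference, a minimal standalone sketch of how the covered-resources template at the end of this file renders before it is substituted into cluster_set_crd_yaml and applied with kubectl. The template text mirrors the diff above; the hardware name, resource type, and chip count are hypothetical example values, not taken from the package.

# Sketch only: renders the ClusterQueue coveredResources block the same way
# get_kueue_covered_resources_config() does. Values below are hypothetical.
config_format = """
  - coveredResources: ["{resource_type}"]
    flavors:
    - name: {cluster_hardware_name}
      resources:
      - name: "{resource_type}"
        nominalQuota: {total_chips}
  """

print(config_format.format(
    cluster_hardware_name='2xv4-8',   # f'{args.num_slices}x{device_type}'
    resource_type='google.com/tpu',   # hypothetical TPU resource name
    total_chips=8,                    # total chips requested across slices
))
# The rendered block slots into the resourceGroups section of the ClusterQueue,
# giving cluster-queue a nominalQuota of 8 "google.com/tpu" chips on the
# 2xv4-8 ResourceFlavor.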
xpk/core/nap.py
ADDED
@@ -0,0 +1,349 @@
"""
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from ..core.core import (
    AUTOPROVISIONING_CONFIG_VALUE,
    CAPACITY_TYPE_CONFIG_KEY,
    CLUSTER_METADATA_CONFIGMAP,
    CLUSTER_RESOURCES_CONFIGMAP,
    RESERVATION_CONFIG_KEY,
    AutoprovisioningConfig,
    CapacityType,
    get_all_nodepools_programmatic,
    get_capacity_node_selectors_from_capacity_type,
    get_capacity_type,
    get_cluster_configmap,
    get_total_chips_requested_from_args,
    verify_reservation_exists,
    zone_to_region,
)
from ..utils.objects import get_value_from_map
from ..utils.file import write_tmp_file
from ..utils.console import xpk_print
from .commands import run_command_with_updates, run_commands
from .system_characteristics import AcceleratorType, SystemCharacteristics

autoprovisioning_config_file = """
management:
  autoRepair: true
  autoUpgrade: true
autoprovisioningLocations:
  {zones}
{resource_limits}
"""

autoprovisioning_resource_limits = """
resourceLimits:
  - resourceType: 'cpu'
{cpu_limits}
  - resourceType: 'memory'
{memory_limits}
{custom_resource_type}
"""

autoprovisioning_custom_resource_type = """
  - resourceType: {resource_type}
    minimum: {minimum}
    maximum: {maximum}
"""


def enable_autoprovisioning_on_cluster(
    args, system: SystemCharacteristics | None
) -> tuple[AutoprovisioningConfig | None, int]:
  """Enable autoprovisioning on the cluster.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    Autoprovisioning Config or None.
    0 if successful and 1 otherwise.
  """
  if not system:
    return None, 1

  # TODO(@vbarr): Disable NAP if they call xpk cluster create again without --enable-autoprovisioning.
  # TODO(@vbarr): Support Pathways.
  # TODO(@vbarr): Support timeout period for idle np before they are deleted.
  # TODO(@vbarr): Support for hot idle configuration (timeout period is infinity).
  return_code = 0
  if system.accelerator_type == AcceleratorType['CPU']:
    xpk_print("Error: XPK NAP doesn't support Accelerators of Types: CPUs.")
    return None, 1

  autoprovisioning_config, return_code = create_autoprovisioning_config(
      args, system
  )
  if return_code != 0 or not autoprovisioning_config:
    xpk_print('Unable to create autoprovisioning config.')
    return autoprovisioning_config, return_code

  command = (
      'gcloud container clusters update'
      f' {args.cluster} --project={args.project}'
      f' --region={zone_to_region(args.zone)} --enable-autoprovisioning'
      ' --autoprovisioning-config-file'
      f' {autoprovisioning_config.config_filename}'
  )
  task = 'Update cluster with autoprovisioning enabled'
  return_code = run_command_with_updates(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} request returned ERROR {return_code}')
    return autoprovisioning_config, return_code

  # Update created accelerator node pools to support autoprovisioning.
  existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
  if return_code != 0:
    xpk_print('Listing all node pools failed!')
    return autoprovisioning_config, return_code

  desired_node_pool_names = [
      f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices)
  ]

  commands = []
  task_names = []
  for node_pool_name in desired_node_pool_names:
    if node_pool_name not in existing_node_pool_names:
      # Ignore node pools that are not created yet, and not of the accelerator type.
      continue
    commands.append(
        f'gcloud container node-pools update {node_pool_name}'
        f' --cluster {args.cluster}'
        f' --project={args.project}'
        f' --region={zone_to_region(args.zone)}'
        ' --enable-autoprovisioning'
        ' --enable-autoscaling'
    )
    task_name = (
        f'Update node pool {node_pool_name} with autoprovisioning support.'
    )
    task_names.append(task_name)

  for i, command in enumerate(commands):
    xpk_print(f'To complete {task_names[i]} we are executing {command}')
  max_return_code = run_commands(
      commands,
      'Update node pools with autoprovisioning support',
      task_names,
      dry_run=args.dry_run,
  )
  if max_return_code != 0:
    xpk_print(
        'Update node pools with autoprovisioning support returned ERROR:'
        f' {max_return_code}'
    )
    return None, max_return_code
  return autoprovisioning_config, return_code


def create_autoprovisioning_config(
    args, system: SystemCharacteristics
) -> tuple[AutoprovisioningConfig | None, int]:
  """Create autoprovisioning config based on template file and user args

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    tuple[AutoprovisioningConfig, int]
    AutoprovisioningConfig: config used to enable autoprovisioning
    int: return code
  """

  # CPU Limits and Memory Limits are for user jobs only. The default node pool
  # is not controlled by NAP.
  cpu_limits = """
    minimum: 1
    maximum: 10000
  """
  memory_limits = """
    minimum: 1
    maximum: 10000
  """

  # By default, the maximum chips is set to be the current number of resources used
  # in the cluster. The minimum is set to zero.
  minimum = 0
  maximum = get_total_chips_requested_from_args(args, system)
  xpk_print(f'Default Chips quota is minimum: {minimum}, maximum: {maximum}.')

  # Check for user overrides.
  if args.autoprovisioning_min_chips:
    minimum = args.autoprovisioning_min_chips
    xpk_print(
        f'User provided min chip quota of {minimum}. Overriding defaults.'
    )
  if args.autoprovisioning_max_chips:
    maximum = args.autoprovisioning_max_chips
    xpk_print(
        f'User provided max chip quota of {maximum}. Overriding defaults.'
    )

  # Check for edge cases in min and max chip values.
  if minimum < 0:
    xpk_print(
        f'Error: Minimum chips is set to {minimum}, and must be zero or'
        ' greater.'
    )
    return None, 1
  if maximum <= minimum or maximum < 0:
    xpk_print(
        f'Error: Maximum chips is set to {maximum}, and must be greater than'
        f' zero and greater or equal to minimum: {minimum}.Use'
        ' --autoprovisioning-max-chips=$MAX_CHIPS to adjust.'
    )
    return None, 1
  xpk_print(
      f'Chips quota is minimum: {minimum}, maximum: {maximum}. XPK will'
      f' autoprovision {maximum - minimum} chips based on incoming workload'
      f' requests, keeping at least {minimum} available at all times, and'
      f' maximum of {maximum}. If the difference ({maximum - minimum} chips) is'
      ' small, rescaling will not work well.'
  )

  custom_resource_string = autoprovisioning_custom_resource_type.format(
      resource_type=system.gke_accelerator,
      minimum=minimum,
      maximum=maximum,
  )

  resource_limits = autoprovisioning_resource_limits.format(
      cpu_limits=cpu_limits,
      memory_limits=memory_limits,
      custom_resource_type=custom_resource_string,
  )

  yml_string = autoprovisioning_config_file.format(
      resource_limits=resource_limits,
      zones=f'- {args.zone}',
  )
  autoprovisioning_config = AutoprovisioningConfig(
      config_filename=write_tmp_file(yml_string).name,
      minimum_chips=minimum,
      maximum_chips=maximum,
  )
  return autoprovisioning_config, 0


def is_autoprovisioning_enabled(
    args, system: SystemCharacteristics
) -> tuple[bool, int]:
  """Determine if autoprovisioning is enabled.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    bool is true if autoprovisioning is enabled, false otherwise.
    int of 0 if successful and 1 otherwise.
  """
  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)

  if cluster_config_map is None:
    xpk_print(
        f'Unable to find config map: {resources_configmap_name}.'
        ' Autoprovisioning is not enabled.'
    )
    return False, 0

  return_code, autoprovisioning_value = get_value_from_map(
      system.gke_accelerator, cluster_config_map
  )
  if return_code != 0:
    xpk_print(
        'gke_accelerator type not found in config map:'
        f' {resources_configmap_name}. Autoprovisioning is not enabled.'
    )
    return False, 0

  if autoprovisioning_value == AUTOPROVISIONING_CONFIG_VALUE:
    xpk_print('Autoprovisioning is Enabled.')
    return True, 0
  else:
    xpk_print(
        'Error: Autoprovisioning not enabled but should be so exiting xpk.'
        f' Value should be {AUTOPROVISIONING_CONFIG_VALUE} but instead found'
        f' value of {autoprovisioning_value}'
    )
    return False, 1


def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
  """Determine the capacity type when autoprovisioning is enabled.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Tuple with string of autoprovisioning node selector args and
    int of 0 if successful and 1 otherwise.
  """
  return_code = 0
  node_selector_args = ''
  # If the user doesn't specify args, then use the cluster settings.
  capacity_type, return_code = get_capacity_type(args)
  capacity_type_str = capacity_type.name
  if return_code != 0:
    xpk_print('Unable to get capacity type.')
    return node_selector_args, return_code

  if capacity_type_str == CapacityType.UNKNOWN.name:
    # Use default settings from cluster creation.
    metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
    cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)

    # Error out if the metadata config map doesn't exist, and is attempting to use
    # autoprovisioning.
    if cluster_config_map is None:
      xpk_print(
          'Unable to find config map. Please specify a capacity type'
          ' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue'
          ' to use autoprovisioning (--enable-autoprovisioning).'
      )
      return node_selector_args, 1

    return_code, capacity_type_str = get_value_from_map(
        CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
    )
    if return_code != 0:
      return node_selector_args, return_code

    if capacity_type_str == CapacityType.RESERVATION.name:
      return_code, args.reservation = get_value_from_map(
          RESERVATION_CONFIG_KEY, cluster_config_map
      )
      if return_code != 0:
        return node_selector_args, return_code
      return_code = verify_reservation_exists(args)
      if return_code > 0:
        xpk_print('Unable to verify reservation name saved in config map.')
        return node_selector_args, return_code

  # Check if reservation id is valid. Shared function with cluster creation.
  node_selector_args, return_code = (
      get_capacity_node_selectors_from_capacity_type(args, capacity_type_str)
  )
  if return_code != 0:
    xpk_print('Unable to get node selectors from capacity type.')
    return node_selector_args, return_code

  return node_selector_args, return_code
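
For context, a small standalone sketch of the accelerator resource-limit entry that create_autoprovisioning_config() renders into the node-auto-provisioning config file handed to gcloud. The template text mirrors the diff above; the accelerator name and chip limits are hypothetical example values, not taken from the package.

# Sketch only: renders the custom resourceLimits entry the same way
# create_autoprovisioning_config() does. Values below are hypothetical.
autoprovisioning_custom_resource_type = """
  - resourceType: {resource_type}
    minimum: {minimum}
    maximum: {maximum}
"""

print(autoprovisioning_custom_resource_type.format(
    resource_type='tpu-v4-podslice',  # hypothetical system.gke_accelerator
    minimum=0,                        # default minimum chips
    maximum=16,                       # e.g. --autoprovisioning-max-chips=16
))
# Output (one entry under resourceLimits in the rendered config):
#   - resourceType: tpu-v4-podslice
#     minimum: 0
#     maximum: 16
# The full file also carries cpu/memory resourceLimits and
# autoprovisioningLocations, is written to a temp file, and is passed to
# `gcloud container clusters update ... --autoprovisioning-config-file <file>`.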