xpk 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/commands/__init__.py +15 -0
  3. xpk/commands/batch.py +109 -0
  4. xpk/commands/cluster.py +784 -0
  5. xpk/commands/cluster_gcluster.py +185 -0
  6. xpk/commands/info.py +245 -0
  7. xpk/commands/inspector.py +363 -0
  8. xpk/commands/job.py +197 -0
  9. xpk/commands/kind.py +253 -0
  10. xpk/commands/shell.py +120 -0
  11. xpk/commands/version.py +39 -0
  12. xpk/commands/workload.py +692 -0
  13. xpk/core/__init__.py +15 -0
  14. xpk/core/blueprint/__init__.py +15 -0
  15. xpk/core/blueprint/blueprint_definitions.py +61 -0
  16. xpk/core/blueprint/blueprint_generator.py +652 -0
  17. xpk/core/cluster_private.py +197 -0
  18. xpk/core/commands.py +352 -0
  19. xpk/core/core.py +2824 -0
  20. xpk/core/docker_manager.py +308 -0
  21. xpk/core/gcluster_manager.py +158 -0
  22. xpk/core/kjob.py +205 -0
  23. xpk/core/kueue.py +352 -0
  24. xpk/core/nap.py +349 -0
  25. xpk/core/pathways.py +298 -0
  26. xpk/core/ray.py +222 -0
  27. xpk/core/system_characteristics.py +1395 -0
  28. xpk/core/workload.py +133 -0
  29. xpk/core/workload_decorators/__init__.py +15 -0
  30. xpk/core/workload_decorators/rdma_decorator.py +109 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
  32. xpk/main.py +73 -0
  33. xpk/parser/__init__.py +15 -0
  34. xpk/parser/batch.py +184 -0
  35. xpk/parser/cluster.py +621 -0
  36. xpk/parser/common.py +71 -0
  37. xpk/parser/core.py +109 -0
  38. xpk/parser/info.py +63 -0
  39. xpk/parser/inspector.py +65 -0
  40. xpk/parser/job.py +126 -0
  41. xpk/parser/kind.py +94 -0
  42. xpk/parser/shell.py +50 -0
  43. xpk/parser/validators.py +39 -0
  44. xpk/parser/version.py +23 -0
  45. xpk/parser/workload.py +684 -0
  46. xpk/utils/__init__.py +15 -0
  47. xpk/utils/console.py +55 -0
  48. xpk/utils/file.py +82 -0
  49. xpk/utils/network.py +168 -0
  50. xpk/utils/objects.py +85 -0
  51. xpk/utils/yaml.py +30 -0
  52. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/METADATA +301 -28
  53. xpk-0.6.0.dist-info/RECORD +57 -0
  54. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
  55. xpk-0.6.0.dist-info/entry_points.txt +2 -0
  56. xpk-0.5.0.dist-info/RECORD +0 -7
  57. xpk-0.5.0.dist-info/entry_points.txt +0 -2
  58. xpk.py +0 -7282
  59. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
  60. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue.py ADDED
@@ -0,0 +1,352 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from argparse import Namespace
18
+ from packaging.version import Version
19
+ import packaging
20
+ from ..utils.file import write_tmp_file
21
+ from ..utils.console import xpk_print, xpk_exit
22
+ from .commands import run_command_with_updates, run_command_with_updates_retry, run_command_for_value
23
+ from .core import (
24
+ AutoprovisioningConfig,
25
+ create_accelerator_label,
26
+ create_machine_label,
27
+ get_total_chips_requested_from_args,
28
+ )
29
+ from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
30
+ from .system_characteristics import (
31
+ AcceleratorTypeToAcceleratorCharacteristics,
32
+ SystemCharacteristics,
33
+ )
34
+
35
# Kueue release tag installed by install_kueue_on_cluster.
KUEUE_VERSION = 'v0.10.0'
# Names of the Kueue queue objects created by install_kueue_crs.
CLUSTER_QUEUE_NAME = 'cluster-queue'
LOCAL_QUEUE_NAME = 'multislice-queue'
# Timeout passed to `kubectl wait` in wait_for_kueue_available.
WAIT_FOR_KUEUE_TIMEOUT = '5m'

# NOTE(review): reassigning packaging.version.VERSION_PATTERN after import has
# no effect on Version() parsing — packaging compiles its regex at import time,
# and Version already tolerates a leading 'v'. Presumably intended to restrict
# accepted version strings; confirm before removing.
packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'
41
+
42
# Manifest template applied by install_kueue_crs. Defines, in order: a
# ResourceFlavor keyed on the cluster hardware, a ClusterQueue with quota for
# that flavor (plus optional Pathways flavors/resources spliced in via the
# pw_* placeholders), a default-namespace LocalQueue, and five PriorityClasses
# used by workloads. `{{}}` renders as a literal `{}` under str.format.
# NOTE(review): indentation reconstructed from YAML structure — confirm against
# the shipped file before relying on exact bytes.
cluster_set_crd_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: {cluster_hardware_name}
spec:
  nodeLabels:
    {accelerator_label}
    {machine_label}
---
{pw_resource_flavors}
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: {cluster_queue_name}
spec:
  preemption:
    reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
    withinClusterQueue: LowerPriority
  namespaceSelector: {{}} # match all.
  resourceGroups:
  {covered_resources_config}
  {pw_resources_kueue}
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: default
  name: {local_queue_name}
spec:
  clusterQueue: {cluster_queue_name}
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-low
value: 100
globalDefault: false
description: "Very Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: low
value: 250
globalDefault: false
description: "Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: medium
value: 500
globalDefault: false
description: "Medium"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high
value: 750
globalDefault: false
description: "High"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-high
value: 1000
globalDefault: false
description: "Very High"
"""
113
+
114
# DaemonSet template used to pre-pull ({image_name}) onto every node matching
# {nodeSelectorKey}; the container just sleeps so the image stays cached.
# NOTE(review): indentation reconstructed from YAML structure — confirm against
# the shipped file before relying on exact bytes.
cluster_preheat_yml = """
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {cachekey}
  labels:
    k8s-app: {cachekey}
spec:
  selector:
    matchLabels:
      k8s-app: {cachekey}
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: {cachekey}
        k8s-app: {cachekey}
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: {nodeSelectorKey}
                operator: Exists
      tolerations:
      - operator: "Exists"
      containers:
      - image: {image_name}
        name: {cachekey}
        command: [ "sleep", "inf" ]
"""
147
+
148
+
149
def verify_kueuectl(args: Namespace) -> None:
  """Verify if kueuectl is installed.

  Runs `kubectl kueue version`; on failure prints installation instructions
  and exits the process via xpk_exit with the command's return code.

  Args:
    args: user provided arguments.

  Returns:
    None
  """
  # Fixed typo in the progress message ('Veryfing' -> 'Verifying').
  xpk_print('Verifying kueuectl installation')

  command = 'kubectl kueue version'
  task = 'Verify kueuectl installation on cluster'
  verify_kueuectl_installed_code, _ = run_command_for_value(command, task, args)

  if verify_kueuectl_installed_code == 0:
    xpk_print('kueuectl found')
  else:
    xpk_print(
        'kueuectl not found. Please follow'
        ' https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/'
        ' to install kueuectl.'
    )
    xpk_exit(verify_kueuectl_installed_code)
172
+
173
+
174
def delete_multikueueconfigs_definitions(args) -> int:
  """Delete the MultiKueueConfig CRD from the cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and the command's error code otherwise.
  """
  task = 'Delete multikueueconfigs crds'
  return_code = run_command_with_updates_retry(
      'kubectl delete crd multikueueconfigs.kueue.x-k8s.io', task, args
  )
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
181
+
182
+
183
def delete_multikueueclusters_definitions(args) -> int:
  """Delete the MultiKueueCluster CRD from the cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and the command's error code otherwise.
  """
  task = 'Delete multikueueclusters crds'
  return_code = run_command_with_updates_retry(
      'kubectl delete crd multikueueclusters.kueue.x-k8s.io', task, args
  )
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
190
+
191
+
192
def get_kueue_version(args) -> tuple[int, str]:
  """Get the version of the Kueue manager installed on the cluster.

  Parses `kubectl kueue version` output: the second line reports the manager
  image, whose tag (text after the last ':') is the version, e.g. 'v0.10.0'.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Tuple of (return code, version string). The version string is empty when
    the command fails or its output has no server-version line.
  """
  command = 'kubectl kueue version'
  task = 'Get kueue version on server'
  return_code, val = run_command_for_value(command, task, args)
  if return_code != 0:
    return return_code, ''
  lines = val.splitlines()
  # Guard with `< 2` rather than `== 1`: empty output (zero lines) previously
  # slipped past the check and raised IndexError on lines[1].
  if len(lines) < 2:
    return 1, ''
  server_version_line = lines[1]
  manager_image_version = server_version_line.split(':')[-1]
  return return_code, manager_image_version
204
+
205
+
206
def install_kueue_on_cluster(args) -> int:
  """Install Kueue on the cluster.

  If an existing installation older than v0.9.0 is being upgraded to v0.9.0+,
  the MultiKueue CRDs are deleted first before applying the new manifests.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  err_code, kueue_version_installed = get_kueue_version(args)
  if err_code == 0:
    crosses_v09_boundary = (
        Version(kueue_version_installed) < Version('v0.9.0')
        and Version(KUEUE_VERSION) >= Version('v0.9.0')
    )
    if crosses_v09_boundary:
      xpk_print('Upgrading kueue on cluster from version < 0.9.0.')
      # Order matters: clusters CRD first, then configs CRD.
      for delete_crd in (
          delete_multikueueclusters_definitions,
          delete_multikueueconfigs_definitions,
      ):
        upgrade_code = delete_crd(args)
        if upgrade_code != 0:
          return upgrade_code

  task = 'Set Kueue On Cluster'
  return_code = run_command_with_updates_retry(
      'kubectl apply --server-side --force-conflicts -f'
      f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml',
      task,
      args,
  )
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
238
+
239
+
240
def wait_for_kueue_available(args: Namespace) -> int:
  """Wait for Kueue to be fully available.

  Blocks until the kueue-controller-manager deployment reports the
  'available' condition or WAIT_FOR_KUEUE_TIMEOUT elapses.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  task = 'Wait for Kueue to be available'
  wait_command = (
      'kubectl wait deploy/kueue-controller-manager -nkueue-system'
      f' --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}'
  )
  return_code = run_command_with_updates(wait_command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
258
+
259
+
260
def install_kueue_crs(
    args,
    system: SystemCharacteristics,
    autoprovisioning_config: AutoprovisioningConfig | None,
) -> int:
  """Install Kueue Custom Resources.

  Renders the ResourceFlavor / ClusterQueue / LocalQueue / PriorityClass
  manifest template for the requested hardware and applies it with kubectl.

  Args:
    args: user provided arguments for running the command.
    system: system level arguments.
    autoprovisioning_config: Autoprovisioning config to configure kueue with if
      autoprovisioning is enabled.

  Returns:
    0 if successful and 1 otherwise.
  """
  resource_type = AcceleratorTypeToAcceleratorCharacteristics[
      system.accelerator_type
  ].resource_type

  autoprovisioning_enabled = bool(autoprovisioning_config)
  if autoprovisioning_enabled:
    # With autoprovisioning, the quota is bounded by the configured maximum
    # chips and the flavor is keyed on the accelerator, not the topology.
    total_chips = autoprovisioning_config.maximum_chips
    cluster_hardware_name = f'{system.gke_accelerator}'
  else:
    # Otherwise the quota follows the user-specified topology.
    total_chips = get_total_chips_requested_from_args(args, system)
    cluster_hardware_name = f'{args.num_slices}x{system.device_type}'

  yml_string = cluster_set_crd_yaml.format(
      system=system,
      cluster_hardware_name=cluster_hardware_name,
      accelerator_label=create_accelerator_label(
          system.accelerator_type, system
      ),
      machine_label=create_machine_label(
          system.accelerator_type, system, autoprovisioning_enabled
      ),
      covered_resources_config=get_kueue_covered_resources_config(
          cluster_hardware_name=cluster_hardware_name,
          resource_type=resource_type,
          total_chips=total_chips,
      ),
      resource_type=resource_type,
      pw_resource_flavors=add_pw_resource_flavors(args),
      pw_resources_kueue=add_pw_resources_to_kueue(args),
      cluster_queue_name=CLUSTER_QUEUE_NAME,
      local_queue_name=LOCAL_QUEUE_NAME,
  )

  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {str(tmp.file.name)}'

  task = 'Applying Kueue Custom Resources'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
324
+
325
+
326
def get_kueue_covered_resources_config(
    cluster_hardware_name, resource_type, total_chips
) -> str:
  """Gets Kueue covered resources configuration.

  Renders one resourceGroups entry for the ClusterQueue manifest, granting
  `total_chips` nominal quota of `resource_type` on the named flavor.

  Args:
    cluster_hardware_name: cluster hardware name.
    resource_type: resource type of tpu or gpu.
    total_chips: total number of chips for the specific resource type.

  Returns:
    A string of Kueue covered resources configuration.
  """
  template = """
  - coveredResources: ["{resource_type}"]
    flavors:
    - name: {cluster_hardware_name}
      resources:
      - name: "{resource_type}"
        nominalQuota: {total_chips}
  """
  return template.format(
      resource_type=resource_type,
      cluster_hardware_name=cluster_hardware_name,
      total_chips=total_chips,
  )
xpk/core/nap.py ADDED
@@ -0,0 +1,349 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..core.core import (
18
+ AUTOPROVISIONING_CONFIG_VALUE,
19
+ CAPACITY_TYPE_CONFIG_KEY,
20
+ CLUSTER_METADATA_CONFIGMAP,
21
+ CLUSTER_RESOURCES_CONFIGMAP,
22
+ RESERVATION_CONFIG_KEY,
23
+ AutoprovisioningConfig,
24
+ CapacityType,
25
+ get_all_nodepools_programmatic,
26
+ get_capacity_node_selectors_from_capacity_type,
27
+ get_capacity_type,
28
+ get_cluster_configmap,
29
+ get_total_chips_requested_from_args,
30
+ verify_reservation_exists,
31
+ zone_to_region,
32
+ )
33
+ from ..utils.objects import get_value_from_map
34
+ from ..utils.file import write_tmp_file
35
+ from ..utils.console import xpk_print
36
+ from .commands import run_command_with_updates, run_commands
37
+ from .system_characteristics import AcceleratorType, SystemCharacteristics
38
+
39
# Template for the NAP config file handed to
# `gcloud container clusters update --autoprovisioning-config-file`.
# {zones} is a YAML list entry (e.g. '- us-central2-b'); {resource_limits}
# is rendered from autoprovisioning_resource_limits below.
# NOTE(review): indentation reconstructed from YAML structure — confirm
# against the shipped file before relying on exact bytes.
autoprovisioning_config_file = """
management:
  autoRepair: true
  autoUpgrade: true
autoprovisioningLocations:
  {zones}
{resource_limits}
"""

# resourceLimits block: fixed cpu/memory limits plus one accelerator-specific
# entry built from autoprovisioning_custom_resource_type.
autoprovisioning_resource_limits = """
resourceLimits:
  - resourceType: 'cpu'
    {cpu_limits}
  - resourceType: 'memory'
    {memory_limits}
  {custom_resource_type}
"""

# Per-accelerator resource limit ({resource_type} is the gke_accelerator name,
# e.g. a TPU/GPU resource; minimum/maximum are chip counts).
autoprovisioning_custom_resource_type = """
  - resourceType: {resource_type}
    minimum: {minimum}
    maximum: {maximum}
"""
62
+
63
+
64
def enable_autoprovisioning_on_cluster(
    args, system: SystemCharacteristics | None
) -> tuple[AutoprovisioningConfig | None, int]:
  """Enable node auto-provisioning (NAP) on the cluster.

  Three-step sequence: (1) build the autoprovisioning config file,
  (2) `gcloud container clusters update --enable-autoprovisioning` with it,
  (3) mark each existing `{cluster}-np-{i}` node pool as autoprovisioned.
  Returns early with the current error code if any step fails.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    Autoprovisioning Config or None.
    0 if successful and 1 otherwise.
  """
  if not system:
    return None, 1

  # TODO(@vbarr): Disable NAP if they call xpk cluster create again without --enable-autoprovisioning.
  # TODO(@vbarr): Support Pathways.
  # TODO(@vbarr): Support timeout period for idle np before they are deleted.
  # TODO(@vbarr): Support for hot idle configuration (timeout period is infinity).
  return_code = 0
  if system.accelerator_type == AcceleratorType['CPU']:
    xpk_print("Error: XPK NAP doesn't support Accelerators of Types: CPUs.")
    return None, 1

  autoprovisioning_config, return_code = create_autoprovisioning_config(
      args, system
  )
  if return_code != 0 or not autoprovisioning_config:
    xpk_print('Unable to create autoprovisioning config.')
    return autoprovisioning_config, return_code

  command = (
      'gcloud container clusters update'
      f' {args.cluster} --project={args.project}'
      f' --region={zone_to_region(args.zone)} --enable-autoprovisioning'
      ' --autoprovisioning-config-file'
      f' {autoprovisioning_config.config_filename}'
  )
  task = 'Update cluster with autoprovisioning enabled'
  return_code = run_command_with_updates(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} request returned ERROR {return_code}')
    return autoprovisioning_config, return_code

  # Update created accelerator node pools to support autoprovisioning.
  existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
  if return_code != 0:
    xpk_print('Listing all node pools failed!')
    return autoprovisioning_config, return_code

  # Node pools created by xpk follow the '{cluster}-np-{slice}' naming scheme.
  desired_node_pool_names = [
      f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices)
  ]

  commands = []
  task_names = []
  for node_pool_name in desired_node_pool_names:
    if node_pool_name not in existing_node_pool_names:
      # Ignore node pools that are not created yet, and not of the accelerator type.
      continue
    commands.append(
        f'gcloud container node-pools update {node_pool_name}'
        f' --cluster {args.cluster}'
        f' --project={args.project}'
        f' --region={zone_to_region(args.zone)}'
        ' --enable-autoprovisioning'
        ' --enable-autoscaling'
    )
    task_name = (
        f'Update node pool {node_pool_name} with autoprovisioning support.'
    )
    task_names.append(task_name)

  # The per-pool updates run as a batch; max_return_code aggregates failures.
  for i, command in enumerate(commands):
    xpk_print(f'To complete {task_names[i]} we are executing {command}')
  max_return_code = run_commands(
      commands,
      'Update node pools with autoprovisioning support',
      task_names,
      dry_run=args.dry_run,
  )
  if max_return_code != 0:
    xpk_print(
        'Update node pools with autoprovisioning support returned ERROR:'
        f' {max_return_code}'
    )
    return None, max_return_code
  return autoprovisioning_config, return_code
153
+
154
+
155
def create_autoprovisioning_config(
    args, system: SystemCharacteristics
) -> tuple[AutoprovisioningConfig | None, int]:
  """Create autoprovisioning config based on template file and user args.

  Computes the chip quota range (defaults derived from the requested
  topology, overridable via --autoprovisioning-min-chips /
  --autoprovisioning-max-chips), validates it, renders the NAP config
  template, and writes it to a temporary file.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    tuple[AutoprovisioningConfig, int]
      AutoprovisioningConfig: config used to enable autoprovisioning, or
        None when validation fails.
      int: return code, 0 if successful and 1 otherwise.
  """

  # CPU Limits and Memory Limits are for user jobs only. The default node pool
  # is not controlled by NAP.
  cpu_limits = """
    minimum: 1
    maximum: 10000
  """
  memory_limits = """
    minimum: 1
    maximum: 10000
  """

  # By default, the maximum chips is set to be the current number of resources used
  # in the cluster. The minimum is set to zero.
  minimum = 0
  maximum = get_total_chips_requested_from_args(args, system)
  xpk_print(f'Default Chips quota is minimum: {minimum}, maximum: {maximum}.')

  # Check for user overrides.
  if args.autoprovisioning_min_chips:
    minimum = args.autoprovisioning_min_chips
    xpk_print(
        f'User provided min chip quota of {minimum}. Overriding defaults.'
    )
  if args.autoprovisioning_max_chips:
    maximum = args.autoprovisioning_max_chips
    xpk_print(
        f'User provided max chip quota of {maximum}. Overriding defaults.'
    )

  # Check for edge cases in min and max chip values.
  if minimum < 0:
    xpk_print(
        f'Error: Minimum chips is set to {minimum}, and must be zero or'
        ' greater.'
    )
    return None, 1
  if maximum <= minimum or maximum < 0:
    # Message fixed to match the check above: maximum must be strictly
    # greater than minimum (previously it claimed 'greater or equal' and was
    # missing a space before 'Use').
    xpk_print(
        f'Error: Maximum chips is set to {maximum}, and must be greater than'
        f' zero and greater than minimum: {minimum}. Use'
        ' --autoprovisioning-max-chips=$MAX_CHIPS to adjust.'
    )
    return None, 1
  xpk_print(
      f'Chips quota is minimum: {minimum}, maximum: {maximum}. XPK will'
      f' autoprovision {maximum - minimum} chips based on incoming workload'
      f' requests, keeping at least {minimum} available at all times, and'
      f' maximum of {maximum}. If the difference ({maximum - minimum} chips) is'
      ' small, rescaling will not work well.'
  )

  custom_resource_string = autoprovisioning_custom_resource_type.format(
      resource_type=system.gke_accelerator,
      minimum=minimum,
      maximum=maximum,
  )

  resource_limits = autoprovisioning_resource_limits.format(
      cpu_limits=cpu_limits,
      memory_limits=memory_limits,
      custom_resource_type=custom_resource_string,
  )

  yml_string = autoprovisioning_config_file.format(
      resource_limits=resource_limits,
      zones=f'- {args.zone}',
  )
  autoprovisioning_config = AutoprovisioningConfig(
      config_filename=write_tmp_file(yml_string).name,
      minimum_chips=minimum,
      maximum_chips=maximum,
  )
  return autoprovisioning_config, 0
243
+
244
+
245
def is_autoprovisioning_enabled(
    args, system: SystemCharacteristics
) -> tuple[bool, int]:
  """Determine if autoprovisioning is enabled.

  Looks up the system's gke_accelerator entry in the cluster resources config
  map and compares it against the autoprovisioning marker value.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    bool is true if autoprovisioning is enabled, false otherwise.
    int of 0 if successful and 1 otherwise.
  """
  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  config_map = get_cluster_configmap(args, resources_configmap_name)

  # A missing config map just means autoprovisioning was never set up;
  # that is not an error.
  if config_map is None:
    xpk_print(
        f'Unable to find config map: {resources_configmap_name}.'
        ' Autoprovisioning is not enabled.'
    )
    return False, 0

  return_code, autoprovisioning_value = get_value_from_map(
      system.gke_accelerator, config_map
  )
  if return_code != 0:
    xpk_print(
        'gke_accelerator type not found in config map:'
        f' {resources_configmap_name}. Autoprovisioning is not enabled.'
    )
    return False, 0

  # A present-but-wrong value is an error: the caller expected
  # autoprovisioning to be configured.
  if autoprovisioning_value != AUTOPROVISIONING_CONFIG_VALUE:
    xpk_print(
        'Error: Autoprovisioning not enabled but should be so exiting xpk.'
        f' Value should be {AUTOPROVISIONING_CONFIG_VALUE} but instead found'
        f' value of {autoprovisioning_value}'
    )
    return False, 1

  xpk_print('Autoprovisioning is Enabled.')
  return True, 0
288
+
289
+
290
def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
  """Determine the capacity type when autoprovisioning is enabled.

  If the user did not specify a capacity type, falls back to the value stored
  in the cluster metadata config map. For reservations, this function has a
  side effect: it writes the stored reservation id back onto args.reservation
  so downstream node-selector generation can use it.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Tuple with string of autoprovisioning node selector args and
    int of 0 if successful and 1 otherwise.
  """
  return_code = 0
  node_selector_args = ''
  # If the user doesn't specify args, then use the cluster settings.
  capacity_type, return_code = get_capacity_type(args)
  capacity_type_str = capacity_type.name
  if return_code != 0:
    xpk_print('Unable to get capacity type.')
    return node_selector_args, return_code

  if capacity_type_str == CapacityType.UNKNOWN.name:
    # Use default settings from cluster creation.
    metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
    cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)

    # Error out if the metadata config map doesn't exist, and is attempting to use
    # autoprovisioning.
    if cluster_config_map is None:
      xpk_print(
          'Unable to find config map. Please specify a capacity type'
          ' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue'
          ' to use autoprovisioning (--enable-autoprovisioning).'
      )
      return node_selector_args, 1

    # Replace UNKNOWN with the capacity type recorded at cluster creation.
    return_code, capacity_type_str = get_value_from_map(
        CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
    )
    if return_code != 0:
      return node_selector_args, return_code

    if capacity_type_str == CapacityType.RESERVATION.name:
      # Side effect: restore the stored reservation id onto args.reservation.
      return_code, args.reservation = get_value_from_map(
          RESERVATION_CONFIG_KEY, cluster_config_map
      )
      if return_code != 0:
        return node_selector_args, return_code
      return_code = verify_reservation_exists(args)
      if return_code > 0:
        xpk_print('Unable to verify reservation name saved in config map.')
        return node_selector_args, return_code

  # Check if reservation id is valid. Shared function with cluster creation.
  node_selector_args, return_code = (
      get_capacity_node_selectors_from_capacity_type(args, capacity_type_str)
  )
  if return_code != 0:
    xpk_print('Unable to get node selectors from capacity type.')
    return node_selector_args, return_code

  return node_selector_args, return_code