xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. xpk-0.0.1.dist-info/LICENSE +202 -0
  88. xpk-0.0.1.dist-info/METADATA +1498 -0
  89. xpk-0.0.1.dist-info/RECORD +92 -0
  90. xpk-0.0.1.dist-info/WHEEL +5 -0
  91. xpk-0.0.1.dist-info/entry_points.txt +2 -0
  92. xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/core/nap.py ADDED
@@ -0,0 +1,361 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..utils.console import xpk_print
18
+ from ..utils.file import write_tmp_file
19
+ from ..utils.objects import get_value_from_map
20
+ from .capacity import (
21
+ AUTOPROVISIONING_CONFIG_VALUE,
22
+ CAPACITY_TYPE_CONFIG_KEY,
23
+ RESERVATION_CONFIG_KEY,
24
+ CapacityType,
25
+ get_capacity_node_selectors_from_capacity_type,
26
+ get_capacity_type,
27
+ verify_reservation_exists,
28
+ )
29
+ from .commands import run_command_with_updates, run_commands
30
+ from .gcloud_context import zone_to_region
31
+ from .nodepool import get_all_nodepools_programmatic
32
+ from .resources import (
33
+ CLUSTER_METADATA_CONFIGMAP,
34
+ CLUSTER_RESOURCES_CONFIGMAP,
35
+ AutoprovisioningConfig,
36
+ get_cluster_configmap,
37
+ )
38
+ from .scheduling import get_total_chips_requested_from_args
39
+ from .system_characteristics import AcceleratorType, SystemCharacteristics
40
+
41
# Outer template for the autoprovisioning config file. The {zones} and
# {resource_limits} placeholders are filled in by
# create_autoprovisioning_config via str.format.
# NOTE(review): leading whitespace inside these template literals appears to
# have been stripped by this rendering; confirm the YAML nesting against the
# packaged file before relying on the text shown here.
+ AUTOPROVISIONING_CONFIG_FILE = """
42
+ management:
43
+ autoRepair: true
44
+ autoUpgrade: true
45
+ autoprovisioningLocations:
46
+ {zones}
47
+ {resource_limits}
48
+ """
# resourceLimits section: fixed 'cpu' and 'memory' entries whose bounds come
# from {cpu_limits} / {memory_limits}, followed by the rendered accelerator
# entry in {custom_resource_type}.
49
+ AUTOPROVISIONING_RESOURCE_LIMITS = """
50
+ resourceLimits:
51
+ - resourceType: 'cpu'
52
+ {cpu_limits}
53
+ - resourceType: 'memory'
54
+ {memory_limits}
55
+ {custom_resource_type}
56
+ """
# One resourceLimits entry; create_autoprovisioning_config formats it with
# system.gke_accelerator as {resource_type} and the chip quota as
# {minimum} / {maximum}.
57
+ AUTOPROVISIONING_CUSTOM_RESOURCE_TYPE = """
58
+ - resourceType: {resource_type}
59
+ minimum: {minimum}
60
+ maximum: {maximum}
61
+ """
62
+
63
+
64
def enable_autoprovisioning_on_cluster(
    args, system: SystemCharacteristics | None
) -> tuple[AutoprovisioningConfig | None, int]:
  """Turn on node auto-provisioning for the cluster and its node pools.

  Creates an autoprovisioning config, applies it to the cluster with
  `gcloud container clusters update`, and then updates every
  `{cluster}-np-{slice}` node pool that already exists.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics, or None.

  Returns:
    Tuple of (autoprovisioning config or None, return code). The return code
    is 0 on success and non-zero otherwise. When the node pool updates fail,
    the config element is None.
  """
  if not system:
    return None, 1

  # TODO(@vbarr): Disable NAP if they call xpk cluster create again without --enable-autoprovisioning.
  # TODO(@vbarr): Support Pathways.
  # TODO(@vbarr): Support timeout period for idle np before they are deleted.
  # TODO(@vbarr): Support for hot idle configuration (timeout period is infinity).
  if system.accelerator_type == AcceleratorType['CPU']:
    xpk_print("Error: XPK NAP doesn't support Accelerators of Types: CPUs.")
    return None, 1

  config, status = create_autoprovisioning_config(args, system)
  if status != 0 or not config:
    xpk_print('Unable to create autoprovisioning config.')
    return config, status

  # Step 1: enable autoprovisioning on the cluster itself.
  cluster_command = (
      'gcloud container clusters update'
      f' {args.cluster} --project={args.project}'
      f' --region={zone_to_region(args.zone)} --enable-autoprovisioning'
      ' --autoprovisioning-config-file'
      f' {config.config_filename}'
  )
  cluster_task = 'Update cluster with autoprovisioning enabled'
  status = run_command_with_updates(cluster_command, cluster_task, args)
  if status != 0:
    xpk_print(f'{cluster_task} request returned ERROR {status}')
    return config, status

  # Step 2: update the accelerator node pools that were already created.
  present_pools, status = get_all_nodepools_programmatic(args)
  if status != 0:
    xpk_print('Listing all node pools failed!')
    return config, status

  wanted_pools = [
      f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices)
  ]
  # Pools that are not created yet (and so not of the accelerator type) are
  # skipped.
  pools_to_update = [name for name in wanted_pools if name in present_pools]

  pool_commands = [
      f'gcloud container node-pools update {name}'
      f' --cluster {args.cluster}'
      f' --project={args.project}'
      f' --region={zone_to_region(args.zone)}'
      ' --enable-autoprovisioning'
      ' --enable-autoscaling'
      for name in pools_to_update
  ]
  pool_tasks = [
      f'Update node pool {name} with autoprovisioning support.'
      for name in pools_to_update
  ]

  for pool_task, pool_command in zip(pool_tasks, pool_commands):
    xpk_print(f'To complete {pool_task} we are executing {pool_command}')

  worst_code = run_commands(
      pool_commands,
      'Update node pools with autoprovisioning support',
      pool_tasks,
      dry_run=args.dry_run,
  )
  if worst_code != 0:
    xpk_print(
        'Update node pools with autoprovisioning support returned ERROR:'
        f' {worst_code}'
    )
    return None, worst_code
  return config, status
153
+
154
+
155
+ def create_autoprovisioning_config(
156
+ args, system: SystemCharacteristics
157
+ ) -> tuple[AutoprovisioningConfig | None, int]:
158
+ """Create autoprovisioning config based on template file and user args
159
+
160
+ Args:
161
+ args: user provided arguments for running the command.
162
+ system: system characteristics.
163
+
164
+ Returns:
165
+ tuple[AutoprovisioningConfig, int]
166
+ AutoprovisioningConfig: config used to enable autoprovisioning
167
+ int: return code
168
+ """
# Overview: derive the accelerator chip quota (minimum/maximum), render the
# nested YAML templates, write the result with write_tmp_file, and return
# (AutoprovisioningConfig, 0). Quota validation failures return (None, 1).
169
+
170
+ # CPU Limits and Memory Limits are for user jobs only. The default node pool
171
+ # is not controlled by NAP.
# NOTE(review): the two limit literals below are spliced verbatim into the
# YAML template, so their internal whitespace matters; indentation appears
# stripped in this rendering. Confirm against the packaged file.
172
+ cpu_limits = """
173
+ minimum: 1
174
+ maximum: 10000
175
+ """
176
+ memory_limits = """
177
+ minimum: 1
178
+ maximum: 10000
179
+ """
180
+
181
+ # By default, the maximum chips is set to be the current number of resources used
182
+ # in the cluster. The minimum is set to zero.
183
+ minimum = 0
184
+ maximum = get_total_chips_requested_from_args(args, system)
185
+ xpk_print(f'Default Chips quota is minimum: {minimum}, maximum: {maximum}.')
186
+
# NOTE(review): both overrides below use a truthiness test, so a falsy value
# (None, or an explicit 0) leaves the default in place instead of overriding.
187
+ # Check for user overrides.
188
+ if args.autoprovisioning_min_chips:
189
+ minimum = args.autoprovisioning_min_chips
190
+ xpk_print(
191
+ f'User provided min chip quota of {minimum}. Overriding defaults.'
192
+ )
193
+ if args.autoprovisioning_max_chips:
194
+ maximum = args.autoprovisioning_max_chips
195
+ xpk_print(
196
+ f'User provided max chip quota of {maximum}. Overriding defaults.'
197
+ )
198
+
199
+ # Check for edge cases in min and max chip values.
200
+ if minimum < 0:
201
+ xpk_print(
202
+ f'Error: Minimum chips is set to {minimum}, and must be zero or'
203
+ ' greater.'
204
+ )
205
+ return None, 1
# NOTE(review): this condition rejects maximum == minimum, while the message
# below says "greater or equal to minimum"; the message is also missing a
# space before "Use". Confirm which behavior is intended.
206
+ if maximum <= minimum or maximum < 0:
207
+ xpk_print(
208
+ f'Error: Maximum chips is set to {maximum}, and must be greater than'
209
+ f' zero and greater or equal to minimum: {minimum}.Use'
210
+ ' --autoprovisioning-max-chips=$MAX_CHIPS to adjust.'
211
+ )
212
+ return None, 1
213
+ xpk_print(
214
+ f'Chips quota is minimum: {minimum}, maximum: {maximum}. XPK will'
215
+ f' autoprovision {maximum - minimum} chips based on incoming workload'
216
+ f' requests, keeping at least {minimum} available at all times, and'
217
+ f' maximum of {maximum}. If the difference ({maximum - minimum} chips) is'
218
+ ' small, rescaling will not work well.'
219
+ )
220
+
# Templates are rendered inside-out: the accelerator entry first, then the
# resourceLimits section that embeds it, then the full config file text.
221
+ custom_resource_string = AUTOPROVISIONING_CUSTOM_RESOURCE_TYPE.format(
222
+ resource_type=system.gke_accelerator,
223
+ minimum=minimum,
224
+ maximum=maximum,
225
+ )
226
+
227
+ resource_limits = AUTOPROVISIONING_RESOURCE_LIMITS.format(
228
+ cpu_limits=cpu_limits,
229
+ memory_limits=memory_limits,
230
+ custom_resource_type=custom_resource_string,
231
+ )
232
+
# Only args.zone is listed under autoprovisioningLocations.
233
+ yml_string = AUTOPROVISIONING_CONFIG_FILE.format(
234
+ resource_limits=resource_limits,
235
+ zones=f'- {args.zone}',
236
+ )
# The rendered text is written via write_tmp_file; the returned object's
# .name is stored as config_filename alongside the validated chip bounds.
237
+ autoprovisioning_config = AutoprovisioningConfig(
238
+ config_filename=write_tmp_file(yml_string).name,
239
+ minimum_chips=minimum,
240
+ maximum_chips=maximum,
241
+ )
242
+ return autoprovisioning_config, 0
243
+
244
+
245
def is_autoprovisioning_enabled(
    args, system: SystemCharacteristics
) -> tuple[bool, int]:
  """Report whether autoprovisioning is enabled for the system's accelerator.

  Looks up `system.gke_accelerator` in the cluster resources config map and
  compares the stored value against AUTOPROVISIONING_CONFIG_VALUE.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    Tuple of (enabled, return code). A missing config map or a missing
    accelerator key yields (False, 0); a value other than
    AUTOPROVISIONING_CONFIG_VALUE yields (False, 1).
  """
  configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  config_map = get_cluster_configmap(args, configmap_name)
  if config_map is None:
    xpk_print(
        f'Unable to find config map: {configmap_name}.'
        ' Autoprovisioning is not enabled.'
    )
    return False, 0

  lookup_code, stored_value = get_value_from_map(
      system.gke_accelerator, config_map, verbose=False
  )
  if lookup_code != 0:
    xpk_print(
        'gke_accelerator type not found in config map:'
        f' {configmap_name}. Autoprovisioning is not enabled.'
    )
    return False, 0

  # The accelerator key exists but holds an unexpected value: report an error.
  if stored_value != AUTOPROVISIONING_CONFIG_VALUE:
    xpk_print(
        'Error: Autoprovisioning not enabled but should be so exiting xpk.'
        f' Value should be {AUTOPROVISIONING_CONFIG_VALUE} but instead found'
        f' value of {stored_value}'
    )
    return False, 1

  xpk_print('Autoprovisioning is Enabled.')
  return True, 0
288
+
289
+
290
def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
  """Determine the capacity type when autoprovisioning is enabled.

  If the user did not specify a capacity type, the defaults saved in the
  cluster metadata config map are used instead. When that saved type is a
  reservation, `args.reservation` is overwritten with the saved reservation
  name and the reservation is verified.

  Args:
    args: user provided arguments for running the command. May be mutated
      (`args.reservation`) as described above.

  Returns:
    Tuple with string of autoprovisioning node selector args and
    int of 0 if successful and non-zero otherwise.
  """
  node_selector_args = ''
  capacity_type, return_code = get_capacity_type(args)
  if return_code != 0:
    xpk_print('Unable to get capacity type.')
    return node_selector_args, return_code
  # Read the name only after the lookup is known to have succeeded.
  capacity_type_str = capacity_type.name

  # If the user doesn't specify args, then use the cluster settings.
  if capacity_type_str == CapacityType.UNKNOWN.name:
    # Use default settings from cluster creation.
    metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
    cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)

    # Error out if the metadata config map doesn't exist, and is attempting to
    # use autoprovisioning.
    if cluster_config_map is None:
      xpk_print(
          'Unable to find config map. Please specify a capacity type'
          ' (--on-demand, --spot, --reservation=$RESERVATION_ID) to continue'
          ' to use autoprovisioning (--enable-autoprovisioning).'
      )
      return node_selector_args, 1

    return_code, capacity_type_str = get_value_from_map(
        CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
    )
    if return_code != 0:
      return node_selector_args, return_code

    if capacity_type_str == CapacityType.RESERVATION.name:
      return_code, args.reservation = get_value_from_map(
          RESERVATION_CONFIG_KEY, cluster_config_map
      )
      if return_code != 0:
        return node_selector_args, return_code
      return_code = verify_reservation_exists(args)
      if return_code > 0:
        xpk_print('Unable to verify reservation name saved in config map.')
        return node_selector_args, return_code

  # Check if reservation id is valid. Shared function with cluster creation.
  node_selector_args, return_code = (
      get_capacity_node_selectors_from_capacity_type(args, capacity_type_str)
  )
  if return_code != 0:
    xpk_print('Unable to get node selectors from capacity type.')
  return node_selector_args, return_code
350
+
351
+
352
def get_cluster_provisioner(args) -> str:
  """Return the provisioner recorded in the cluster metadata config map.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The value stored under 'provisioner' in the metadata config map, or
    'gcloud' when the config map is missing or has no such entry.
  """
  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
  cluster_provisioner = 'gcloud'
  if cluster_config_map is not None:
    provisioner = cluster_config_map.get('provisioner')
    if provisioner is not None:
      cluster_provisioner = provisioner
  xpk_print(f'Cluster provisioner: {cluster_provisioner}')
  return cluster_provisioner