xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. xpk-0.0.1.dist-info/LICENSE +202 -0
  88. xpk-0.0.1.dist-info/METADATA +1498 -0
  89. xpk-0.0.1.dist-info/RECORD +92 -0
  90. xpk-0.0.1.dist-info/WHEEL +5 -0
  91. xpk-0.0.1.dist-info/entry_points.txt +2 -0
  92. xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/core/kjob.py ADDED
@@ -0,0 +1,444 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..core.blueprint.blueprint_generator import get_subnetworks_for_a3mega, get_subnetworks_for_a3ultra
18
+ from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
19
+ from argparse import Namespace
20
+ import yaml
21
+ from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
22
+ from ..utils.console import xpk_print, xpk_exit
23
+
24
+ from ..utils import templates
25
+ from kubernetes import client as k8s_client
26
+ from kubernetes.client import ApiClient
27
+ from kubernetes.client.rest import ApiException
28
+ from .cluster import setup_k8s_env, XPK_SA, DEFAULT_NAMESPACE
29
+ from .storage import get_auto_mount_storages, get_auto_mount_gcsfuse_storages
30
+ from .commands import run_command_for_value, run_kubectl_apply, run_command_with_updates
31
+ from .config import XpkConfig, KJOB_SHELL_IMAGE, KJOB_SHELL_INTERACTIVE_COMMAND, KJOB_SHELL_WORKING_DIRECTORY, KJOB_BATCH_IMAGE, KJOB_BATCH_WORKING_DIRECTORY
32
+ from .resources import get_cluster_system_characteristics, SystemCharacteristics, AcceleratorType
33
+ from enum import Enum
34
+
35
+ from ..core.workload_decorators import tcpxo_decorator
36
+
37
+ from ..core.workload_decorators import rdma_decorator
38
+
39
# Coordinates of the kjob VolumeBundle custom resource (API group, version and
# plural resource name) used when creating VolumeBundle objects.
KJOB_API_GROUP_NAME = "kjobctl.x-k8s.io"
KJOB_API_GROUP_VERSION = "v1alpha1"
KJOB_API_VOLUME_BUNDLE_PLURAL = "volumebundles"
# Template path handed to templates.load() when building a VolumeBundle.
# NOTE(review): verify that templates/volume_bundle.yaml is actually shipped
# with the package - confirm against the distribution's file list.
VOLUME_BUNDLE_TEMPLATE_PATH = "/../templates/volume_bundle.yaml"
43
+
44
+
45
class AppProfileDefaults(Enum):
  """Default settings of the ApplicationProfile object created by xpk."""

  # Name under which the ApplicationProfile is created.
  NAME = "xpk-def-app-profile"
47
+
48
+
49
class JobTemplateDefaults(Enum):
  """Default settings of the JobTemplate object created by xpk.

  Members are read through `.value`. PARALLELISM and COMPLETIONS share the
  value 1, so Enum treats COMPLETIONS as an alias of PARALLELISM; `.value`
  still yields 1 for both, but iterating the enum lists only PARALLELISM.
  """

  # Name under which the JobTemplate is created.
  NAME = "xpk-def-batch"
  # Values substituted into the job spec's parallelism / completions fields.
  PARALLELISM = 1
  COMPLETIONS = 1
  # Container settings used when the xpk config does not override them.
  CONTAINER_NAME = "xpk-batch-container"
  IMAGE = "ubuntu:22.04"
  WORKING_DIRECTORY = "/"
56
+
57
+
58
class PodTemplateDefaults(Enum):
  """Default settings of the PodTemplate object created by xpk.

  Members are read through `.value`.
  """

  # Name under which the PodTemplate is created.
  NAME = "xpk-def-pod"
  # Container settings used when the xpk config does not override them.
  CONTAINER_NAME = "xpk-interactive-container"
  IMAGE = "busybox:1.28"
  WORKING_DIRECTORY = "/"
  # Command substituted into the container's `command` list.
  INTERACTIVE_COMMAND = "/bin/sh"
64
+
65
+
66
+ job_template_yaml = """
67
+ apiVersion: kjobctl.x-k8s.io/v1alpha1
68
+ kind: JobTemplate
69
+ metadata:
70
+ name: {name}
71
+ namespace: default
72
+ template:
73
+ spec:
74
+ parallelism: {parallelism}
75
+ completions: {completions}
76
+ completionMode: Indexed
77
+ template:
78
+ spec:
79
+ dnsPolicy: ClusterFirstWithHostNet
80
+ tolerations:
81
+ - operator: "Exists"
82
+ key: nvidia.com/gpu
83
+ containers:
84
+ - name: {container_name}
85
+ image: {image}
86
+ workingDir: {working_directory}
87
+ {resources}
88
+ {node_selector}
89
+ priorityClassName: {priority}
90
+ restartPolicy: OnFailure
91
+ serviceAccountName: {service_account}
92
+ """
93
+ job_node_selector_template = """
94
+ nodeSelector:
95
+ cloud.google.com/gke-accelerator: {gpu_name}
96
+ """
97
+ job_resources_template = """
98
+ resources:
99
+ limits:
100
+ nvidia.com/gpu: {gpu_per_node}
101
+ """
102
+
103
+ app_profile_yaml = """
104
+ apiVersion: kjobctl.x-k8s.io/v1alpha1
105
+ kind: ApplicationProfile
106
+ metadata:
107
+ name: {name}
108
+ namespace: default
109
+ spec:
110
+ supportedModes:
111
+ - name: Slurm
112
+ template: {batch_template}
113
+ requiredFlags: []
114
+ - name: Interactive
115
+ template: {interactive_template}
116
+ volumeBundles: {volume_bundles}
117
+ """
118
+
119
+ pod_template_yaml = """
120
+ apiVersion: v1
121
+ kind: PodTemplate
122
+ metadata:
123
+ name: {name}
124
+ namespace: default
125
+ template:
126
+ spec:
127
+ tolerations:
128
+ - effect: NoSchedule
129
+ key: components.gke.io/gke-managed-components
130
+ operator: Equal
131
+ value: "true"
132
+ containers:
133
+ - name: {container_name}
134
+ image: {image}
135
+ command: [{interactive_command}]
136
+ workingDir: {working_directory}
137
+ initContainers:
138
+ - name: init
139
+ image: {image}
140
+ command: ['/bin/mkdir', '-p', '{working_directory}']
141
+ serviceAccountName: {service_account}
142
+ """
143
+
144
# Annotations expressed as single "key=value" strings.
Kueue_TAS_annotation = (
    "kueue.x-k8s.io/podset-preferred-topology"
    "=cloud.google.com/gce-topology-host"
)

default_interface_annotation = "networking.gke.io/default-interface=eth0"
147
+
148
+
149
def get_a3ultra_pod_template_annotations(args: Namespace) -> tuple[str, str]:
  """Build the pod template annotations for an a3ultra cluster.

  Args:
    args - user provided arguments; args.cluster is used to look up the
      cluster's subnetworks.
  Returns:
    tuple of two "key=value" strings: the default interface annotation and the
    interfaces annotation, whose value is wrapped as $'...'.
  """
  key, value = rdma_decorator.get_interfaces_entry(
      get_subnetworks_for_a3ultra(args.cluster)
  )
  interfaces_annotation = f"{key}=$'{value}'"
  return default_interface_annotation, interfaces_annotation
159
+
160
+
161
def get_a3mega_pod_template_annotations(
    args: Namespace,
) -> tuple[str, str, str]:
  """Build the pod template annotations for an a3mega cluster.

  Nothing is applied here; the annotation strings are only assembled and
  returned to the caller.

  Args:
    args - user provided arguments; args.cluster is used to look up the
      cluster's subnetworks.
  Returns:
    tuple of three "key=value" strings, in order: the tcpxo daemon annotation,
    the interfaces annotation (both values wrapped as $'...') and the default
    interface annotation.
  """
  subnetworks = get_subnetworks_for_a3mega(args.cluster)
  daemon_key, daemon_paths = get_tcpxo_deamon_entry()
  nic_key, nic_value = tcpxo_decorator.get_interfaces_entry(subnetworks)
  return (
      f"{daemon_key}=$'{daemon_paths}'",
      f"{nic_key}=$'{nic_value}'",
      default_interface_annotation,
  )
173
+
174
+
175
def verify_kjob_installed(args: Namespace) -> int:
  """Check if kjob is installed. If not, print installation guidance.

  Runs `kubectl-kjob help` and treats a zero return code as proof that kjob is
  available.

  Args:
    args - user provided arguments.
  Returns:
    0 if kjob is installed, otherwise the non-zero return code of the check
    command.
  """
  command = "kubectl-kjob help"
  task = "Verify kjob installation "
  return_code, _ = run_command_for_value(command, task, args)

  if return_code == 0:
    xpk_print("kjob found")
    return 0

  # Any non-zero code means the plugin could not be run; point the user at the
  # install docs and propagate the code unchanged.
  xpk_print(
      " kjob not found. Please follow"
      " https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md"
      " to install kjob."
  )
  return return_code
198
+
199
+
200
def get_pod_template_interactive_command() -> str:
  """Gets the PodTemplate's interactive command.

  The value stored in the xpk config under KJOB_SHELL_INTERACTIVE_COMMAND is
  used when it is set and non-empty; otherwise the default from
  PodTemplateDefaults is used.

  Returns:
    str - PodTemplate's interactive command
  """
  configured = XpkConfig().get(KJOB_SHELL_INTERACTIVE_COMMAND)
  if configured is not None and len(configured) != 0:
    return configured
  return PodTemplateDefaults.INTERACTIVE_COMMAND.value
214
+
215
+
216
def create_app_profile_instance(
    args: Namespace, volume_bundles: list[str]
) -> int:
  """Create new AppProfile instance on cluster with default settings.

  The profile references the default JobTemplate and PodTemplate names.

  Args:
    args - user provided arguments
    volume_bundles - volume bundle names; the list is formatted directly into
      the manifest's volumeBundles field
  Returns:
    exit_code > 0 if creating AppProfile fails, 0 otherwise
  """
  manifest = app_profile_yaml.format(
      name=AppProfileDefaults.NAME.value,
      batch_template=JobTemplateDefaults.NAME.value,
      interactive_template=PodTemplateDefaults.NAME.value,
      volume_bundles=volume_bundles,
  )
  return run_kubectl_apply(
      yml_string=manifest,
      task="Creating AppProfile",
      args=args,
  )
236
+
237
+
238
def decorate_job_template_with_gpu(yml_string: str, gpu_type: str) -> str:
  """Apply the GPU-specific decorator to a JobTemplate manifest.

  The manifest's top-level "template" entry is passed through the tcpxo
  decorator for H100_MEGA_DEVICE_TYPE or the rdma decorator for
  H200_DEVICE_TYPE. Any other gpu_type leaves the template as parsed.

  Args:
    yml_string - JobTemplate manifest as a YAML string
    gpu_type - device type of the cluster's GPUs
  Returns:
    the manifest re-serialized to YAML, keys kept in their original order
  """
  # Parse once; only the "template" entry is swapped for its decorated form.
  job_template_dict = yaml.safe_load(yml_string)
  job_spec = job_template_dict["template"]
  if gpu_type == H100_MEGA_DEVICE_TYPE:
    job_spec = tcpxo_decorator.decorate_kjob_template(job_spec)
  elif gpu_type == H200_DEVICE_TYPE:
    job_spec = rdma_decorator.decorate_kjob_template(job_spec)
  job_template_dict["template"] = job_spec
  return yaml.dump(job_template_dict, sort_keys=False)
247
+
248
+
249
def create_job_template_instance(
    args: Namespace,
    system: SystemCharacteristics | None,
    service_account: str,
) -> int:
  """Create new JobTemplate instance on cluster with default settings.

  Image and working directory come from the xpk config when set and non-empty,
  otherwise from JobTemplateDefaults. For GPU systems the manifest also gets a
  GPU resource limit, a node selector and the device-specific decoration.

  Args:
    args - user provided arguments; args.priority is used when present,
      otherwise the priority is "medium"
    system - characteristics of the cluster's system, or None if unknown
    service_account - service account name written into the pod spec
  Returns:
    exit_code > 0 if creating JobTemplate fails, 0 otherwise
  """
  config = XpkConfig()

  image = config.get(KJOB_BATCH_IMAGE)
  if image is None or len(image) == 0:
    image = JobTemplateDefaults.IMAGE.value

  workdir = config.get(KJOB_BATCH_WORKING_DIRECTORY)
  if workdir is None or len(workdir) == 0:
    workdir = JobTemplateDefaults.WORKING_DIRECTORY.value

  runs_on_gpu = (
      system is not None and system.accelerator_type == AcceleratorType["GPU"]
  )

  # Both fragments stay empty unless the cluster runs on GPUs.
  resources = ""
  node_selector = ""
  if runs_on_gpu:
    resources = job_resources_template.format(gpu_per_node=system.chips_per_vm)
    node_selector = job_node_selector_template.format(
        gpu_name=system.gke_accelerator
    )

  yml_string = job_template_yaml.format(
      name=JobTemplateDefaults.NAME.value,
      parallelism=JobTemplateDefaults.PARALLELISM.value,
      completions=JobTemplateDefaults.COMPLETIONS.value,
      container_name=JobTemplateDefaults.CONTAINER_NAME.value,
      image=image,
      working_directory=workdir,
      resources=resources,
      node_selector=node_selector,
      priority=getattr(args, "priority", "medium"),
      service_account=service_account,
  )
  if runs_on_gpu:
    yml_string = decorate_job_template_with_gpu(yml_string, system.device_type)

  return run_kubectl_apply(
      yml_string,
      task="Creating JobTemplate",
      args=args,
  )
301
+
302
+
303
def create_pod_template_instance(args: Namespace, service_account: str) -> int:
  """Create new PodTemplate instance on cluster with default settings.

  Image and working directory come from the xpk config when set and non-empty,
  otherwise from PodTemplateDefaults.

  Args:
    args - user provided arguments
    service_account - service account name written into the pod spec
  Returns:
    exit_code > 0 if creating PodTemplate fails, 0 otherwise
  """
  config = XpkConfig()

  image = config.get(KJOB_SHELL_IMAGE)
  if image is None or len(image) == 0:
    image = PodTemplateDefaults.IMAGE.value

  workdir = config.get(KJOB_SHELL_WORKING_DIRECTORY)
  if workdir is None or len(workdir) == 0:
    workdir = PodTemplateDefaults.WORKING_DIRECTORY.value

  manifest = pod_template_yaml.format(
      name=PodTemplateDefaults.NAME.value,
      container_name=PodTemplateDefaults.CONTAINER_NAME.value,
      image=image,
      working_directory=workdir,
      interactive_command=get_pod_template_interactive_command(),
      service_account=service_account,
  )
  return run_kubectl_apply(
      yml_string=manifest,
      task="Creating PodTemplate",
      args=args,
  )
331
+
332
+
333
def prepare_kjob(args: Namespace) -> int:
  """Create the default kjob JobTemplate, PodTemplate and AppProfile.

  The objects are created in that order and the first failure stops the
  sequence. The xpk service account is used only when at least one auto-mount
  storage exists; otherwise the service account name is left empty.

  Args:
    args - user provided arguments
  Returns:
    exit_code > 0 of the first creation step that fails, otherwise the result
    of creating the AppProfile
  """
  system = get_cluster_system_characteristics(args)

  api_client = setup_k8s_env(args)
  storages = get_auto_mount_storages(api_client)
  service_account = XPK_SA if len(storages) > 0 else ""

  err_code = create_job_template_instance(args, system, service_account)
  if err_code > 0:
    return err_code

  err_code = create_pod_template_instance(args, service_account)
  if err_code > 0:
    return err_code

  # Every auto-mount storage is exposed to the profile by name.
  bundle_names = [storage.name for storage in storages]
  return create_app_profile_instance(args, bundle_names)
354
+
355
+
356
def apply_kjob_crds(args: Namespace) -> int:
  """Apply kjob CRDs on cluster.

  Pipes the output of `kubectl kjob printcrds` into a server-side
  `kubectl apply`.

  Args:
    args - user provided arguments
  Returns:
    0 if the CRDs were applied, otherwise the non-zero return code of the
    command
  """
  command = "kubectl kjob printcrds | kubectl apply --server-side -f -"
  task = "Create kjob CRDs on cluster"
  return_code = run_command_with_updates(command, task, args)
  if return_code == 0:
    xpk_print("Creating kjob CRDs succeeded")
    return 0
  xpk_print(f"{task} returned ERROR {return_code}")
  return return_code
375
+
376
+
377
def create_volume_bundle_instance(
    k8s_api_client: ApiClient,
    name: str,
    manifest: list[dict],
    readonly: bool,
    mount_point: str,
) -> None:
  """
  Creates a new VolumeBundle resource in the Kubernetes cluster.

  The VolumeBundle template is loaded from VOLUME_BUNDLE_TEMPLATE_PATH, named,
  and given one volume plus one container volume mount for every
  PersistentVolumeClaim found in the manifest. An already existing VolumeBundle
  (HTTP 409) is reported and skipped; any other API error is reported and ends
  the program through xpk_exit(1).

  Args:
    k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
    name: Name assigned to the VolumeBundle.
    manifest: Kubernetes objects as dicts; only entries whose "kind" is
      "PersistentVolumeClaim" are used.
    readonly: Value of the readOnly flag set on every claim.
    mount_point: Mount path used for every container volume mount.
  """
  data = templates.load(VOLUME_BUNDLE_TEMPLATE_PATH)
  data["metadata"]["name"] = name
  spec = data["spec"]

  claim_names = [
      obj["metadata"]["name"]
      for obj in manifest
      if obj["kind"] == "PersistentVolumeClaim"
  ]
  # Volume and mount share the claim's name, which ties them together.
  spec["volumes"] = [
      {
          "name": claim,
          "persistentVolumeClaim": {
              "claimName": claim,
              "readOnly": readonly,
          },
      }
      for claim in claim_names
  ]
  spec["containerVolumeMounts"] = [
      {"name": claim, "mountPath": mount_point} for claim in claim_names
  ]
  data["spec"] = spec

  custom_objects = k8s_client.CustomObjectsApi(k8s_api_client)
  try:
    custom_objects.create_namespaced_custom_object(
        namespace=DEFAULT_NAMESPACE,
        group=KJOB_API_GROUP_NAME,
        version=KJOB_API_GROUP_VERSION,
        plural=KJOB_API_VOLUME_BUNDLE_PLURAL,
        body=data,
    )
    xpk_print(
        f"Created {KJOB_API_VOLUME_BUNDLE_PLURAL}.{KJOB_API_GROUP_NAME} object:"
        f" {data['metadata']['name']}"
    )
  except ApiException as e:
    if e.status != 409:
      xpk_print(f"Encountered error during VolumeBundle creation: {e}")
      xpk_exit(1)
    else:
      xpk_print(f"VolumeBundle: {name} already exists. Skipping its creation")
437
+
438
+
439
def get_gcsfuse_annotation(args: Namespace) -> str | None:
  """Return the gcsfuse annotation if any auto-mount gcsfuse storage exists.

  Args:
    args - user provided arguments
  Returns:
    "gke-gcsfuse/volumes=true" when at least one auto-mount gcsfuse storage is
    found, otherwise None
  """
  storages = get_auto_mount_gcsfuse_storages(setup_k8s_env(args))
  return "gke-gcsfuse/volumes=true" if len(storages) > 0 else None