xpk 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/commands/__init__.py +15 -0
  3. xpk/commands/batch.py +109 -0
  4. xpk/commands/cluster.py +784 -0
  5. xpk/commands/cluster_gcluster.py +185 -0
  6. xpk/commands/info.py +245 -0
  7. xpk/commands/inspector.py +363 -0
  8. xpk/commands/job.py +197 -0
  9. xpk/commands/kind.py +253 -0
  10. xpk/commands/shell.py +120 -0
  11. xpk/commands/version.py +39 -0
  12. xpk/commands/workload.py +692 -0
  13. xpk/core/__init__.py +15 -0
  14. xpk/core/blueprint/__init__.py +15 -0
  15. xpk/core/blueprint/blueprint_definitions.py +61 -0
  16. xpk/core/blueprint/blueprint_generator.py +652 -0
  17. xpk/core/cluster_private.py +197 -0
  18. xpk/core/commands.py +352 -0
  19. xpk/core/core.py +2824 -0
  20. xpk/core/docker_manager.py +308 -0
  21. xpk/core/gcluster_manager.py +158 -0
  22. xpk/core/kjob.py +205 -0
  23. xpk/core/kueue.py +352 -0
  24. xpk/core/nap.py +349 -0
  25. xpk/core/pathways.py +298 -0
  26. xpk/core/ray.py +222 -0
  27. xpk/core/system_characteristics.py +1395 -0
  28. xpk/core/workload.py +133 -0
  29. xpk/core/workload_decorators/__init__.py +15 -0
  30. xpk/core/workload_decorators/rdma_decorator.py +109 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
  32. xpk/main.py +73 -0
  33. xpk/parser/__init__.py +15 -0
  34. xpk/parser/batch.py +184 -0
  35. xpk/parser/cluster.py +621 -0
  36. xpk/parser/common.py +71 -0
  37. xpk/parser/core.py +109 -0
  38. xpk/parser/info.py +63 -0
  39. xpk/parser/inspector.py +65 -0
  40. xpk/parser/job.py +126 -0
  41. xpk/parser/kind.py +94 -0
  42. xpk/parser/shell.py +50 -0
  43. xpk/parser/validators.py +39 -0
  44. xpk/parser/version.py +23 -0
  45. xpk/parser/workload.py +684 -0
  46. xpk/utils/__init__.py +15 -0
  47. xpk/utils/console.py +55 -0
  48. xpk/utils/file.py +82 -0
  49. xpk/utils/network.py +168 -0
  50. xpk/utils/objects.py +85 -0
  51. xpk/utils/yaml.py +30 -0
  52. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/METADATA +301 -28
  53. xpk-0.6.0.dist-info/RECORD +57 -0
  54. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
  55. xpk-0.6.0.dist-info/entry_points.txt +2 -0
  56. xpk-0.5.0.dist-info/RECORD +0 -7
  57. xpk-0.5.0.dist-info/entry_points.txt +0 -2
  58. xpk.py +0 -7282
  59. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
  60. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,308 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from abc import ABC, abstractmethod
18
+ import docker
19
+ from docker.errors import ContainerError, APIError, ImageNotFound, BuildError
20
+ from ..utils.console import xpk_print, xpk_exit
21
+ from ..utils.file import ensure_directory_exists
22
+ from ..utils.objects import hash_string
23
+ from shutil import copytree, copy
24
+ import requests
25
+ import os
26
+ import tempfile
27
+ import time
28
+
29
+
30
# Exit codes handed to xpk_exit when a docker operation fails.
DockerRunCommandExitCode = 135
dockerBuildErrorCode = 134

# Cluster Toolkit image: Dockerfile name, toolkit git ref (also used as the
# image tag), image name and base container name.
ctk_dockerfile_path = "Dockerfile"
ctk_build_ref = "v1.45.1"
ctk_docker_image = "xpk-ctk"
ctk_container_name = "xpk-ctk-container"

# Paths inside the container where the host directories are mounted.
gcloud_cfg_mount_path = "/root/.config/gcloud"
working_dir_mount_path = "/out"

# Dockerfile of the Cluster Toolkit image at the pinned ref.
dockerfile_gh_path = (
    "https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/"
    f"refs/tags/{ctk_build_ref}/"
    "tools/cloud-build/images/cluster-toolkit-dockerfile/Dockerfile"
)

# Sub-directory of the working directory that receives uploaded files.
upload_dir_name = "uploads"
40
+
41
+
42
class CommandRunner(ABC):
    """Interface every runner of cluster toolkit commands has to implement."""

    @abstractmethod
    def initialize(self) -> None:
        """Perform all steps necessary before commands can be run.

        Returns:
          None
        """

    @abstractmethod
    def run_command(self, cmd: str) -> None:
        """Execute a command. Implementations should raise if execution fails.

        Args:
          cmd (str): command to run

        Returns:
          None
        """

    @abstractmethod
    def upload_file_to_working_dir(self, path: str, prefix: str = "") -> str:
        """Upload a single file to the working directory.

        Args:
          path (str): path to the file to upload
          prefix (str): optional prefix for the upload location

        Returns:
          str: path to the destination file
        """
        empty_path = ""
        return empty_path

    @abstractmethod
    def upload_directory_to_working_dir(self, path: str, prefix: str = "") -> str:
        """Upload a directory and its content to the working directory.

        Args:
          path (str): path pointing to the directory that will be uploaded
          prefix (str): optional prefix for the upload location

        Returns:
          str: path to the target directory
        """
        empty_path = ""
        return empty_path
77
+
78
+ @abstractmethod
79
+ def upload_directory_to_working_dir(self, path: str, prefix: str = "") -> str:
80
+ """upload directory and its content to working directory.
81
+
82
+ Args:
83
+ path (str): path pointing to directory that will be uploaded.
84
+
85
+ Returns:
86
+ str: path to a target directory.
87
+ """
88
+ return ""
89
+
90
+
91
class DockerManager(CommandRunner):
    """DockerManager is a class for managing gcluster execution in docker container.

    Attributes:
      - dockerfile_path (str) : path to the downloaded dockerfile defining the gcluster execution image (empty until a build is attempted)
      - gcloud_cfg_path (str) : path to directory containing gcloud configuration
      - working_dir (str) : path to directory in which gcluster deployment directory will be saved
      - client (DockerClient) : docker client
      - nocache (bool) : whether to bypass the docker cache when building the image
      - img_name (str) : name of docker image to create, tagged with ctk_build_ref
      - container_name (str) : base name of the containers created from img_name
      - remove_container (bool) : if set to True, docker container in which command is executed will be removed after each execution.
    """

    def __init__(
        self,
        gcloud_cfg_path: str,
        working_dir: str,
        nocache: bool = False,
        img_name: str = ctk_docker_image,
        container_name: str = ctk_container_name,
        remove_container: bool = True,
    ) -> None:
        self.dockerfile_path = ""
        self.client = docker.from_env()
        self.gcloud_cfg_path = gcloud_cfg_path
        self.working_dir = working_dir
        self.nocache = nocache
        self.img_name = f"{img_name}:{ctk_build_ref}"
        self.container_name = container_name
        self.remove_container = remove_container

    def initialize(self) -> None:
        """Make sure docker is reachable and the image img_name exists.

        The image is built with the python docker client when it is not
        already present locally.

        Returns:
          - None
        """
        self._is_docker_installed()
        xpk_print("Docker found!")

        if not self._docker_image_exists():
            xpk_print(f"Docker image {self.img_name} not found.")
            self._build_image()
        else:
            xpk_print(f"Docker image {self.img_name} found!")

    def run_command(
        self,
        cmd: str,
    ) -> None:
        """Run a container from img_name with cmd as entrypoint and mount:
          - gcloud config
          - deployment directory

        Container output is printed as it is produced. A non-zero container
        status code, as well as docker ContainerError / ImageNotFound /
        APIError, is reported and passed to xpk_exit.

        Arguments:
          - cmd (str) : command used as the container entrypoint
        Returns:
          - None
        """
        xpk_print(f"Running command: {cmd} ...")
        xpk_print(
            f"volumes: {self.gcloud_cfg_path}:{gcloud_cfg_mount_path},"
            f" {self.working_dir}:{working_dir_mount_path}"
        )
        try:
            container = self.client.containers.run(
                image=self.img_name,
                entrypoint=cmd,
                remove=self.remove_container,
                name=self._get_container_unique_name(
                    cmd
                ),  # To allow multiple xpk commands run in one machine.
                detach=True,
                volumes=[
                    f"{self.gcloud_cfg_path}:{gcloud_cfg_mount_path}",
                    f"{self.working_dir}:{working_dir_mount_path}",
                ],
                environment={
                    # Credentials live in the mounted gcloud config directory.
                    "GOOGLE_APPLICATION_CREDENTIALS": (
                        f"{gcloud_cfg_mount_path}/application_default_credentials.json"
                    )
                },
            )
            self._print_logs_from_container(container)
            result = container.wait()
            if result["StatusCode"] != 0:
                xpk_print(f"Running gcluster command: {cmd} failed.")
                xpk_exit(result["StatusCode"])
        except ContainerError as e:
            xpk_print(
                "Running command failed due to ContainerError with exit status:"
                f" {e.exit_status} and stderr: {e.stderr}"
            )
            xpk_exit(DockerRunCommandExitCode)
        except ImageNotFound as _:
            # Report the image that was actually requested (name and tag).
            xpk_print(f"Image {self.img_name} not found. Deploying cluster failed")
            xpk_exit(DockerRunCommandExitCode)
        except APIError as e:
            xpk_print(f"Deploying cluster toolkit failed due to {e.explanation}")
            xpk_exit(DockerRunCommandExitCode)

    def _print_logs_from_container(self, container) -> None:
        """Stream the container output and print it line by line."""
        output = container.attach(stdout=True, stream=True, logs=True)
        for line in output:
            # errors="replace": never abort log streaming on undecodable bytes.
            text = line.decode("utf-8", errors="replace").strip()
            xpk_print(f"[gcluster] {text}")

    @staticmethod
    def _get_upload_name(path: str) -> str:
        """Return the last component of path, ignoring trailing separators."""
        return os.path.basename(os.path.normpath(path))

    def upload_directory_to_working_dir(self, path: str, prefix: str = "") -> str:
        """Copy a directory into the upload directory of the working directory.

        Args:
          path (str): path of the directory that will be copied
          prefix (str): optional sub-directory of the upload directory

        Returns:
          str: path of the copied directory as seen inside the container
        """
        name = self._get_upload_name(path)
        target_path = os.path.join(self._get_upload_directory(prefix), name)
        uploaded_path = os.path.join(
            self._get_upload_directory_mounted(prefix), name
        )
        xpk_print(
            f"Copying directory from {path} to {target_path}. Path in docker:"
            f" {uploaded_path}"
        )
        copytree(path, target_path, dirs_exist_ok=True)
        return uploaded_path

    def upload_file_to_working_dir(self, path: str, prefix: str = "") -> str:
        """Copy a file into the upload directory of the working directory.

        Args:
          path (str): path of the file that will be copied
          prefix (str): optional sub-directory of the upload directory

        Returns:
          str: path of the copied file as seen inside the container
        """
        name = self._get_upload_name(path)
        target_path = os.path.join(self._get_upload_directory(prefix), name)
        uploaded_path = os.path.join(
            self._get_upload_directory_mounted(prefix), name
        )
        xpk_print(
            f"Copying a file from {path} to {target_path}. Path in docker:"
            f" {uploaded_path}"
        )
        copy(path, target_path)
        return uploaded_path

    def _get_upload_directory(self, prefix: str = "") -> str:
        """Return the host upload directory for prefix, creating it if needed."""
        upload_dir = os.path.join(self.working_dir, upload_dir_name, prefix)
        ensure_directory_exists(upload_dir)
        return upload_dir

    def _get_upload_directory_mounted(self, prefix: str = "") -> str:
        """Return the upload directory for prefix as seen inside the container."""
        return os.path.join(working_dir_mount_path, upload_dir_name, prefix)

    def _create_tmp_for_dockerfile(self) -> str:
        """Ensure the temporary build directory exists and return the Dockerfile path in it."""
        tmp_dir = os.path.join(tempfile.gettempdir(), "xpkutils")
        ensure_directory_exists(tmp_dir)
        tmp_path = os.path.join(tmp_dir, "Dockerfile")
        return tmp_path

    def _is_docker_installed(self) -> None:
        """Query the docker daemon; the client raises if it is not reachable."""
        self.client.info()

    def _docker_image_exists(self) -> bool:
        """Return True when the image img_name is available locally."""
        try:
            self.client.images.get(f"{self.img_name}")
        except ImageNotFound as _:
            return False
        return True

    def _download_ctk_dockerfile(self) -> None:
        """Downloads cluster toolkit dockerfile to dockerfile_path

        Raises:
          requests.RequestException: if the download fails or the server
            answers with an error status.

        Returns:
          None
        """
        xpk_print(f"Downloading Dockerfile from {dockerfile_gh_path} ...")
        self.dockerfile_path = self._create_tmp_for_dockerfile()
        r = requests.get(dockerfile_gh_path, timeout=100)
        # Do not write an HTTP error page to disk and build an image from it.
        r.raise_for_status()
        with open(self.dockerfile_path, "w+", encoding="utf8") as dockerfile:
            dockerfile.write(r.text)
        xpk_print("Downloading Dockerfile completed!")

    def _remove_tmp_dockerfile(self) -> None:
        """Best-effort removal of the downloaded Dockerfile and its directory."""
        if not self.dockerfile_path:
            return
        try:
            os.remove(self.dockerfile_path)
            os.rmdir(os.path.dirname(self.dockerfile_path))
        except OSError:
            # Cleanup must never turn a finished build into a failure
            # (missing file, non-empty or already removed directory).
            pass

    def _build_image(self) -> None:
        """Download the Dockerfile and build the image img_name from it.

        Download and build errors are reported and passed to xpk_exit with
        dockerBuildErrorCode. The temporary Dockerfile is removed afterwards.
        """
        try:
            self._download_ctk_dockerfile()
            dir_path = os.path.dirname(self.dockerfile_path)
            xpk_print(
                f"Building {self.img_name} docker image from dockerfile:"
                f" {self.dockerfile_path}. It may take a while..."
            )
            self.client.images.build(
                nocache=self.nocache,
                path=dir_path,
                tag=f"{self.img_name}",
                rm=True,
                buildargs={"CLUSTER_TOOLKIT_REF": ctk_build_ref},
            )
        except requests.RequestException as e:
            xpk_print(
                f"error while downloading Dockerfile from {dockerfile_gh_path}: {e}"
            )
            xpk_exit(dockerBuildErrorCode)
        except BuildError as e:
            xpk_print(f"error while building image {self.img_name}: {e.msg}")
            xpk_exit(dockerBuildErrorCode)
        except APIError as e:
            xpk_print(f"erro while building image {self.img_name}: {e.explanation}")
            xpk_exit(dockerBuildErrorCode)
        except TypeError as e:
            xpk_print(f"TypeError while building image {self.img_name}: {e.args}")
            xpk_exit(dockerBuildErrorCode)
        finally:
            self._remove_tmp_dockerfile()
        xpk_print("Docker image build succesfully.")

    def _get_container_unique_name(self, cmd: str) -> str:
        """Return a container name derived from the command and the current time."""
        return f"{self.container_name}_{hash_string(cmd + str(time.time_ns()))}"
@@ -0,0 +1,158 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from .docker_manager import CommandRunner
18
+ from ..utils.console import xpk_print
19
+
20
+
21
# Default locations.
xpk_gcloud_cfg_path = '~/gcloud/cfg'
xpk_deployment_dir = '/deployment'

# gcluster sub-commands.
gcluster_create_command = 'gcluster create'
gcluster_deploy_command = 'gcluster deploy'
gcluster_destroy_command = 'gcluster destroy'

# Blueprint and deployment names.
blueprint_file_name = 'xpk_blueprint.yaml'
deployment_module = '/out/xpk-deployment'
a3_utils_dir_name = 'a3-mega-xpk'

# Template files, given as paths relative to the repository root.
config_map_repo_path = 'src/xpk/blueprints/a3-mega-xpk/config-map.yaml.tftpl'
kueue_config_repo_path = 'src/xpk/blueprints/a3-mega-xpk/kueue-xpk-configuration.yaml.tftpl'
33
+
34
+
35
class GclusterManager:
    """Manager is a class responsible for running cluster toolkit commands.

    Attributes:
      - gcluster_command_runner (CommandRunner) : instance of class implementing CommandRunner abstract methods.
    Methods:
      - deploy : run a deployment process of cluster toolkit. This method will invoke gcluster create and then gcluster deploy commands.
      - destroy_deployment : run gcluster command to destroy existing deployment.
      - stage_files : upload the blueprint and its dependencies to the runner's working directory.
    """

    def __init__(
        self,
        gcluster_command_runner: 'CommandRunner',
    ) -> None:
        self.gcluster_command_runner = gcluster_command_runner

    def _run_create_deployment_cmd(
        self, blueprint_container_path: str, prefix: str = ''
    ) -> None:
        """Run `gcluster create` for the blueprint, writing into the deployment path."""
        xpk_print('Creating deployment resources...')
        cluster_create_cmd = (
            f'{gcluster_create_command} -o {self._get_deployment_path(prefix)}'
            f' {blueprint_container_path} -w --force'
        )
        self.gcluster_command_runner.run_command(cluster_create_cmd)
        xpk_print('Creating deployment resources completed.')

    def _run_deploy_cmd(
        self,
        deployment_name: str,
        auto_approve: bool,
        dry_run: bool,
        prefix: str = '',
    ) -> None:
        """Run `gcluster deploy` for the deployment, or only print it on dry run."""
        xpk_print('Deploying resources...')
        deploy_cmd = (
            f'{gcluster_deploy_command} {self._get_deployment_path(prefix)}/{deployment_name}'
        )
        if auto_approve is True:
            deploy_cmd += ' --auto-approve'
        if dry_run is True:
            # Same dry-run reporting as _run_destroy_command.
            xpk_print(f'executing command {deploy_cmd}')
            return
        self.gcluster_command_runner.run_command(deploy_cmd)
        xpk_print('Deployment completed.')

    def deploy(
        self,
        blueprint_path: str,
        deployment_name: str,
        prefix: str = '',
        auto_approve: bool = True,
        dry_run: bool = False,
    ) -> None:
        """Provision a new cluster using Cluster Toolkit.

        It will invoke gcluster create and then gcluster deploy commands.
        The files staged or created during running gcluster command will be
        managed by gcluster_command_runner in its working directory.

        Args:
          blueprint_path (str): path pointing to blueprint which will be deployed.
          deployment_name (str): name of the deployment.
          prefix (str, optional): sub-directory of the deployments directory to use. Defaults to ''.
          auto_approve (bool, optional): If set to true deployment command will be auto approved. Currently only True is supported. Defaults to True.
          dry_run (bool, optional): If set to True gcluster will not deploy. Defaults to False.
        Returns:
          None
        """
        xpk_print(f'Deploying blueprint from path {blueprint_path} ...')
        self._run_create_deployment_cmd(
            blueprint_container_path=blueprint_path, prefix=prefix
        )
        self._run_deploy_cmd(
            deployment_name=deployment_name,
            prefix=prefix,
            auto_approve=auto_approve,
            dry_run=dry_run,
        )
        xpk_print('Deploying blueprint completed!')

    def _run_destroy_command(
        self,
        deployment_name: str,
        prefix: str = '',
        auto_approve: bool = True,
        dry_run: bool = False,
    ) -> None:
        """Run `gcluster destroy` for the deployment, or only print it on dry run."""
        destroy_cmd = (
            f'{gcluster_destroy_command} {self._get_deployment_path(prefix)}/{deployment_name}'
        )
        if auto_approve is True:
            destroy_cmd += ' --auto-approve'
        if dry_run is True:
            xpk_print(f'executing command {destroy_cmd}')
            return
        self.gcluster_command_runner.run_command(destroy_cmd)

    def _get_deployment_path(self, prefix: str = '') -> str:
        """Return 'deployments', or 'deployments/<prefix>' when prefix is given."""
        prefix = f'/{prefix}' if prefix != '' else ''
        return f'deployments{prefix}'

    def destroy_deployment(self, deployment_name: str, prefix: str = '') -> None:
        """Destroy deployment.

        Args:
          deployment_name (str): name of deployment to destroy.
          prefix (str, optional): sub-directory of the deployments directory to use. Defaults to ''.
        """
        xpk_print(f'Destroying {deployment_name} started...')
        self._run_destroy_command(deployment_name, prefix=prefix)
        xpk_print(f'Destroying {deployment_name} completed!')

    def stage_files(
        self, blueprint_file: str, blueprint_dependencies: str, prefix: str = ''
    ) -> str:
        """Uploads blueprint file and directory to gcluster working directory.

        Args:
          blueprint_file (str): path of the blueprint file to upload.
          blueprint_dependencies (str): path of a directory to upload with the blueprint; skipped when empty or None.
          prefix (str, optional): prefix passed to the runner's upload methods. Defaults to ''.
        Returns:
          str: path of the staged blueprint as returned by the command runner.
        """
        xpk_print(
            "Staging (sending) blueprint file to gcluster's working directory..."
        )
        staged_blueprint = self.gcluster_command_runner.upload_file_to_working_dir(
            blueprint_file, prefix
        )
        # Truthiness check: tolerates None as well as the empty string.
        if blueprint_dependencies:
            self.gcluster_command_runner.upload_directory_to_working_dir(
                blueprint_dependencies, prefix
            )
        xpk_print('Staging blueprint completed!')
        xpk_print(f"File path in gcluster's working directory: {staged_blueprint}")
        return staged_blueprint
xpk/core/kjob.py ADDED
@@ -0,0 +1,205 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from argparse import Namespace
18
+ from ..utils.console import xpk_print
19
+ from .commands import run_command_for_value, run_kubectl_apply, run_command_with_updates
20
+ from enum import Enum
21
+
22
+
23
class AppProfileDefaults(Enum):
    """Default settings of the ApplicationProfile created by xpk."""

    # Resource name put into the ApplicationProfile manifest.
    NAME = "xpk-def-app-profile"
25
+
26
+
27
class JobTemplateDefaults(Enum):
    """Default settings of the JobTemplate created by xpk.

    Note: PARALLELISM and COMPLETIONS share the value 1, so per Enum rules
    COMPLETIONS is an alias of PARALLELISM and does not show up when
    iterating over the members; reading `.value` works for both names.
    """

    NAME = "xpk-def-batch"
    PARALLELISM = 1
    COMPLETIONS = 1  # alias of PARALLELISM while both values are equal
    CONTAINER_NAME = "xpk-batch-container"
    IMAGE = "ubuntu:22.04"
33
+
34
+
35
class PodTemplateDefaults(Enum):
    """Default settings of the PodTemplate created by xpk."""

    NAME = "xpk-def-pod"
    CONTAINER_NAME = "xpk-interactive-container"
    IMAGE = "busybox:1.28"
    # Command placed in the container's `command` list of the manifest.
    INTERACTIVE_COMMAND = "/bin/sh"
40
+
41
+
42
+ job_template_yaml = """
43
+ apiVersion: kjobctl.x-k8s.io/v1alpha1
44
+ kind: JobTemplate
45
+ metadata:
46
+ name: {name}
47
+ namespace: default
48
+ template:
49
+ spec:
50
+ parallelism: {parallelism}
51
+ completions: {completions}
52
+ completionMode: Indexed
53
+ template:
54
+ spec:
55
+ containers:
56
+ - name: {container_name}
57
+ image: {image}
58
+ restartPolicy: OnFailure"""
59
+
60
+ app_profile_yaml = """
61
+ apiVersion: kjobctl.x-k8s.io/v1alpha1
62
+ kind: ApplicationProfile
63
+ metadata:
64
+ name: {name}
65
+ namespace: default
66
+ spec:
67
+ supportedModes:
68
+ - name: Slurm
69
+ template: {batch_template}
70
+ requiredFlags: []
71
+ - name: Interactive
72
+ template: {interactive_template}
73
+ """
74
+
75
+ pod_template_yaml = """
76
+ apiVersion: v1
77
+ kind: PodTemplate
78
+ metadata:
79
+ name: {name}
80
+ namespace: default
81
+ template:
82
+ spec:
83
+ containers:
84
+ - name: {container_name}
85
+ image: {image}
86
+ command: [{interactive_command}]
87
+ """
88
+
89
+
90
def verify_kjob_installed(args: Namespace) -> int:
    """Check if kjob is installed. If not, tell the user how to install it.

    Args:
      args - user provided arguments.
    Returns:
      error code != 0 if kjob is not installed, otherwise 0
    """
    command = "kubectl-kjob help"
    task = "Verify kjob installation "
    verify_kjob_installed_code, _ = run_command_for_value(command, task, args)

    # Guard clause for the failure case; success is the fall-through path.
    if verify_kjob_installed_code != 0:
        xpk_print(
            " kjob not found. Please follow"
            " https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md"
            " to install kjob."
        )
        return verify_kjob_installed_code

    xpk_print("kjob found")
    return 0
113
+
114
+
115
def create_app_profile_instance(args: Namespace) -> int:
    """Apply an ApplicationProfile with default settings on the cluster.

    The manifest is rendered from app_profile_yaml using the default names of
    the application profile, the job template and the pod template.

    Args:
      args - user provided arguments
    Returns:
      the return code of run_kubectl_apply (0 on success)
    """
    manifest = app_profile_yaml.format(
        name=AppProfileDefaults.NAME.value,
        batch_template=JobTemplateDefaults.NAME.value,
        interactive_template=PodTemplateDefaults.NAME.value,
    )
    return run_kubectl_apply(
        yml_string=manifest, task="Creating AppProfile", args=args
    )
132
+
133
+
134
def create_job_template_instance(args: Namespace) -> int:
    """Apply a JobTemplate with default settings on the cluster.

    The manifest is rendered from job_template_yaml using the values of
    JobTemplateDefaults.

    Args:
      args - user provided arguments
    Returns:
      the return code of run_kubectl_apply (0 on success)
    """
    defaults = JobTemplateDefaults
    manifest = job_template_yaml.format(
        name=defaults.NAME.value,
        parallelism=defaults.PARALLELISM.value,
        completions=defaults.COMPLETIONS.value,
        container_name=defaults.CONTAINER_NAME.value,
        image=defaults.IMAGE.value,
    )
    return run_kubectl_apply(
        yml_string=manifest, task="Creating JobTemplate", args=args
    )
153
+
154
+
155
def create_pod_template_instance(args: Namespace) -> int:
    """Apply a PodTemplate with default settings on the cluster.

    The manifest is rendered from pod_template_yaml using the values of
    PodTemplateDefaults.

    Args:
      args - user provided arguments
    Returns:
      the return code of run_kubectl_apply (0 on success)
    """
    defaults = PodTemplateDefaults
    manifest = pod_template_yaml.format(
        name=defaults.NAME.value,
        container_name=defaults.CONTAINER_NAME.value,
        image=defaults.IMAGE.value,
        interactive_command=defaults.INTERACTIVE_COMMAND.value,
    )
    return run_kubectl_apply(
        yml_string=manifest, task="Creating PodTemplate", args=args
    )
173
+
174
+
175
def prepare_kjob(args) -> int:
    """Create the default JobTemplate, PodTemplate and AppProfile on the cluster.

    The resources are created in that order; the first failing step stops the
    sequence.

    Args:
      args - user provided arguments
    Returns:
      the first non-zero return code, or the return code of the AppProfile
      creation when both templates were created successfully
    """
    # Any non-zero code is a failure, matching the `!= 0` checks used by the
    # other functions of this module (a `> 0` test lets negative codes pass).
    job_err_code = create_job_template_instance(args)
    if job_err_code != 0:
        return job_err_code

    pod_err_code = create_pod_template_instance(args)
    if pod_err_code != 0:
        return pod_err_code

    return create_app_profile_instance(args)
185
+
186
+
187
def apply_kjob_crds(args: Namespace) -> int:
    """Apply kjob CRDs on cluster.

    Pipes the output of `kubectl kjob printcrds` into a server-side
    `kubectl apply`.

    Args:
      args - user provided arguments
    Returns:
      0 on success, otherwise the non-zero return code of the command
    """
    task = "Create kjob CRDs on cluster"
    return_code = run_command_with_updates(
        "kubectl kjob printcrds | kubectl apply --server-side -f -", task, args
    )
    if return_code == 0:
        xpk_print("Creating kjob CRDs succeeded")
        return 0
    xpk_print(f"{task} returned ERROR {return_code}")
    return return_code