xpk 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +109 -0
- xpk/commands/cluster.py +784 -0
- xpk/commands/cluster_gcluster.py +185 -0
- xpk/commands/info.py +245 -0
- xpk/commands/inspector.py +363 -0
- xpk/commands/job.py +197 -0
- xpk/commands/kind.py +253 -0
- xpk/commands/shell.py +120 -0
- xpk/commands/version.py +39 -0
- xpk/commands/workload.py +692 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +61 -0
- xpk/core/blueprint/blueprint_generator.py +652 -0
- xpk/core/cluster_private.py +197 -0
- xpk/core/commands.py +352 -0
- xpk/core/core.py +2824 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/gcluster_manager.py +158 -0
- xpk/core/kjob.py +205 -0
- xpk/core/kueue.py +352 -0
- xpk/core/nap.py +349 -0
- xpk/core/pathways.py +298 -0
- xpk/core/ray.py +222 -0
- xpk/core/system_characteristics.py +1395 -0
- xpk/core/workload.py +133 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +109 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
- xpk/main.py +73 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +184 -0
- xpk/parser/cluster.py +621 -0
- xpk/parser/common.py +71 -0
- xpk/parser/core.py +109 -0
- xpk/parser/info.py +63 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +126 -0
- xpk/parser/kind.py +94 -0
- xpk/parser/shell.py +50 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +684 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +85 -0
- xpk/utils/yaml.py +30 -0
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/METADATA +301 -28
- xpk-0.6.0.dist-info/RECORD +57 -0
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
- xpk-0.6.0.dist-info/entry_points.txt +2 -0
- xpk-0.5.0.dist-info/RECORD +0 -7
- xpk-0.5.0.dist-info/entry_points.txt +0 -2
- xpk.py +0 -7282
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
- {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
import docker
|
|
19
|
+
from docker.errors import ContainerError, APIError, ImageNotFound, BuildError
|
|
20
|
+
from ..utils.console import xpk_print, xpk_exit
|
|
21
|
+
from ..utils.file import ensure_directory_exists
|
|
22
|
+
from ..utils.objects import hash_string
|
|
23
|
+
from shutil import copytree, copy
|
|
24
|
+
import requests
|
|
25
|
+
import os
|
|
26
|
+
import tempfile
|
|
27
|
+
import time
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Exit code handed to xpk_exit when running a command in a container fails
# with a docker error (ContainerError, ImageNotFound or APIError).
DockerRunCommandExitCode = 135
# Exit code handed to xpk_exit when building the docker image fails.
dockerBuildErrorCode = 134
# NOTE(review): not referenced by the code visible in this module.
ctk_dockerfile_path = "Dockerfile"
# Cluster Toolkit ref: used in the Dockerfile URL below, as the docker image
# tag, and as the CLUSTER_TOOLKIT_REF build argument.
ctk_build_ref = "v1.45.1"
# Default image name (the ref above is appended as its tag).
ctk_docker_image = "xpk-ctk"
# Default prefix of the per-command container name.
ctk_container_name = "xpk-ctk-container"
# Locations inside the container where the gcloud configuration directory and
# the working directory are mounted.
gcloud_cfg_mount_path = "/root/.config/gcloud"
working_dir_mount_path = "/out"
# URL the Dockerfile is downloaded from before the image is built.
dockerfile_gh_path = f"https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/{ctk_build_ref}/tools/cloud-build/images/cluster-toolkit-dockerfile/Dockerfile"
# Sub-directory of the working directory that receives uploaded files.
upload_dir_name = "uploads"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class CommandRunner(ABC):
  """Interface every cluster toolkit command runner has to implement."""

  @abstractmethod
  def initialize(self) -> None:
    """Perform all the steps necessary before a command can be run.

    Returns:
      None
    """

  @abstractmethod
  def run_command(self, cmd: str) -> None:
    """Execute a command; implementations should raise if execution fails.

    Args:
      cmd (str): command to run

    Returns:
      None
    """

  @abstractmethod
  def upload_file_to_working_dir(self, path: str, prefix: str = "") -> str:
    """Upload a single file to the working directory.

    Args:
      path (str): path to the file to upload
      prefix (str): optional prefix forwarded by callers together with the path

    Returns:
      str: path to the destination file
    """
    return ""

  @abstractmethod
  def upload_directory_to_working_dir(self, path: str, prefix: str = "") -> str:
    """Upload a directory and its content to the working directory.

    Args:
      path (str): path pointing to the directory that will be uploaded
      prefix (str): optional prefix forwarded by callers together with the path

    Returns:
      str: path to the target directory
    """
    return ""
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class DockerManager(CommandRunner):
  """DockerManager is a class for managing gcluster execution in docker container.

  Attributes:
    - dockerfile_path (str) : path to the downloaded dockerfile defining the gcluster execution image (empty until the image is built)
    - gcloud_cfg_path (str) : path to directory containing gcloud configuration
    - working_dir (str) : path to directory in which gcluster deployment directory will be saved
    - client (DockerClient) : docker client
    - nocache (bool) : whether to skip the docker cache when building the image
    - img_name (str) : name of the docker image, tagged with ctk_build_ref
    - container_name (str) : prefix of the name of containers created from img_name
    - remove_container (bool) : if set to True, the docker container in which a command is executed will be removed after each execution.
  """

  def __init__(
      self,
      gcloud_cfg_path: str,
      working_dir: str,
      nocache: bool = False,
      img_name: str = ctk_docker_image,
      container_name: str = ctk_container_name,
      remove_container: bool = True,
  ) -> None:
    self.dockerfile_path = ""
    self.client = docker.from_env()
    self.gcloud_cfg_path = gcloud_cfg_path
    self.working_dir = working_dir
    self.nocache = nocache
    self.img_name = f"{img_name}:{ctk_build_ref}"
    self.container_name = container_name
    self.remove_container = remove_container

  def initialize(self):
    """Make sure the image named img_name exists, building it when missing.

    The docker client is queried first; the image is built from the
    downloaded Cluster Toolkit Dockerfile only if it is not already present.

    Returns:
      - None
    """
    self._is_docker_installed()
    xpk_print("Docker found!")

    if not self._docker_image_exists():
      xpk_print(f"Docker image {self.img_name} not found.")
      self._build_image()
    else:
      xpk_print(f"Docker image {self.img_name} found!")

  def run_command(
      self,
      cmd: str,
  ) -> None:
    """Run a container from img_name with cmd as entrypoint and mount directories:
      - gcloud config
      - deployment directory

    Container output is streamed to the console. A non-zero container status
    code, or a docker error (ContainerError, ImageNotFound, APIError), is
    reported and passed to xpk_exit.

    Arguments:
      - cmd (str) : command used as the container entrypoint
    Returns:
      - None
    """
    xpk_print(f"Running command: {cmd} ...")
    xpk_print(
        f"volumes: {self.gcloud_cfg_path}:{gcloud_cfg_mount_path},"
        f" {self.working_dir}:{working_dir_mount_path}"
    )
    try:
      # NOTE(review): with remove=True the container may already be gone when
      # wait() is called after the log stream ends - confirm wait() cannot
      # fail with a "not found" APIError here.
      container = self.client.containers.run(
          image=self.img_name,
          entrypoint=cmd,
          remove=self.remove_container,
          name=self._get_container_unique_name(
              cmd
          ),  # To allow multiple xpk commands run in one machine.
          detach=True,
          volumes=[
              f"{self.gcloud_cfg_path}:{gcloud_cfg_mount_path}",
              f"{self.working_dir}:{working_dir_mount_path}",
          ],
          environment={
              "GOOGLE_APPLICATION_CREDENTIALS": (
                  f"{gcloud_cfg_mount_path}/application_default_credentials.json"
              )
          },
      )
      self._print_logs_from_container(container)
      result = container.wait()
      if result["StatusCode"] != 0:
        xpk_print(f"Running gcluster command: {cmd} failed.")
        xpk_exit(result["StatusCode"])
    except ContainerError as e:
      xpk_print(
          "Running command failed due to ContainerError with exit status:"
          f" {e.exit_status} and stderr: {e.stderr}"
      )
      xpk_exit(DockerRunCommandExitCode)
    except ImageNotFound:
      # Report the image that was actually requested (name and tag).
      xpk_print(f"Image {self.img_name} not found. Deploying cluster failed")
      xpk_exit(DockerRunCommandExitCode)
    except APIError as e:
      xpk_print(f"Deploying cluster toolkit failed due to {e.explanation}")
      xpk_exit(DockerRunCommandExitCode)

  def _print_logs_from_container(self, container):
    """Stream the container output, printing each chunk with a [gcluster] prefix."""
    output = container.attach(stdout=True, stream=True, logs=True)
    for line in output:
      xpk_print(f"[gcluster] {line.decode('utf-8').strip()}")

  def _resolve_upload_paths(self, path: str, prefix: str) -> tuple:
    """Return (destination path on the host, same path as seen in the container)."""
    name = os.path.basename(path)
    target_path = os.path.join(self._get_upload_directory(prefix), name)
    uploaded_path = os.path.join(
        self._get_upload_directory_mounted(prefix), name
    )
    return target_path, uploaded_path

  def upload_directory_to_working_dir(self, path: str, prefix: str = "") -> str:
    """Copy a directory and its content into the upload directory.

    Args:
      path (str): path of the directory that will be copied
      prefix (str): optional sub-directory of the upload directory

    Returns:
      str: path of the copied directory as seen inside the container
    """
    target_path, uploaded_path = self._resolve_upload_paths(path, prefix)
    xpk_print(
        f"Copying directory from {path} to {target_path}. Path in docker:"
        f" {uploaded_path}"
    )
    copytree(path, target_path, dirs_exist_ok=True)
    return uploaded_path

  def upload_file_to_working_dir(self, path: str, prefix: str = "") -> str:
    """Copy a single file into the upload directory.

    Args:
      path (str): path of the file that will be copied
      prefix (str): optional sub-directory of the upload directory

    Returns:
      str: path of the copied file as seen inside the container
    """
    target_path, uploaded_path = self._resolve_upload_paths(path, prefix)
    xpk_print(
        f"Copying a file from {path} to {target_path}. Path in docker:"
        f" {uploaded_path}"
    )
    copy(path, target_path)
    return uploaded_path

  def _get_upload_directory(self, prefix: str = "") -> str:
    """Return the host upload directory for prefix, creating it via ensure_directory_exists."""
    upload_dir = os.path.join(self.working_dir, upload_dir_name, prefix)
    ensure_directory_exists(upload_dir)
    return upload_dir

  def _get_upload_directory_mounted(self, prefix: str = "") -> str:
    """Return the upload directory for prefix as a path under the container mount point."""
    return os.path.join(working_dir_mount_path, upload_dir_name, prefix)

  def _create_tmp_for_dockerfile(self) -> str:
    """Return the path the Dockerfile is downloaded to, inside the system temp directory."""
    tmp_dir = os.path.join(tempfile.gettempdir(), "xpkutils")
    ensure_directory_exists(tmp_dir)
    tmp_path = os.path.join(tmp_dir, "Dockerfile")
    return tmp_path

  def _is_docker_installed(self) -> None:
    """Query the docker client for its info.

    Nothing is returned; presumably the call raises when docker cannot be
    reached - confirm against the docker client documentation.
    """
    self.client.info()

  def _docker_image_exists(self) -> bool:
    """Return True if the docker client can find the image named img_name."""
    try:
      self.client.images.get(f"{self.img_name}")
    except ImageNotFound:
      return False
    return True

  def _download_ctk_dockerfile(self) -> None:
    """Downloads cluster toolkit dockerfile to dockerfile_path

    Returns:
      None
    Raises:
      requests.RequestException: if the request fails or the server answers
        with an error status.
    """
    xpk_print(f"Downloading Dockerfile from {dockerfile_gh_path} ...")
    self.dockerfile_path = self._create_tmp_for_dockerfile()
    r = requests.get(dockerfile_gh_path, timeout=100)
    # Without this check an error page (e.g. 404) would be saved and later
    # fed to the image build as if it were the Dockerfile.
    r.raise_for_status()
    with open(self.dockerfile_path, "w+", encoding="utf8") as dockerfile:
      dockerfile.write(r.text)
    xpk_print("Downloading Dockerfile completed!")

  def _remove_tmp_dockerfile(self) -> None:
    """Best-effort removal of the downloaded Dockerfile and its directory."""
    try:
      os.remove(self.dockerfile_path)
      os.rmdir(os.path.dirname(self.dockerfile_path))
    except OSError as e:
      # The temporary directory has a fixed name, so it may be in use by
      # something else; the image is already built, so only report it.
      xpk_print(f"Could not clean up temporary Dockerfile: {e}")

  def _build_image(self):
    """Download the Dockerfile and build the image named img_name from it.

    Download and build failures are reported and passed to xpk_exit with
    dockerBuildErrorCode. The downloaded Dockerfile is removed after a
    successful build.
    """
    try:
      self._download_ctk_dockerfile()
      dir_path = os.path.dirname(self.dockerfile_path)
      xpk_print(
          f"Building {self.img_name} docker image from dockerfile:"
          f" {self.dockerfile_path}. It may take a while..."
      )
      self.client.images.build(
          nocache=self.nocache,
          path=dir_path,
          tag=f"{self.img_name}",
          rm=True,
          buildargs={"CLUSTER_TOOLKIT_REF": ctk_build_ref},
      )
    except requests.RequestException as e:
      xpk_print(
          f"error while downloading Dockerfile from {dockerfile_gh_path}: {e}"
      )
      xpk_exit(dockerBuildErrorCode)
    except BuildError as e:
      xpk_print(f"error while building image {self.img_name}: {e.msg}")
      xpk_exit(dockerBuildErrorCode)
    except APIError as e:
      xpk_print(f"erro while building image {self.img_name}: {e.explanation}")
      xpk_exit(dockerBuildErrorCode)
    except TypeError as e:
      xpk_print(f"TypeError while building image {self.img_name}: {e.args}")
      xpk_exit(dockerBuildErrorCode)
    xpk_print("Docker image build succesfully.")
    self._remove_tmp_dockerfile()

  def _get_container_unique_name(self, cmd):
    """Return container_name suffixed with a hash of cmd and the current time in ns."""
    return f"{self.container_name}_{hash_string(cmd + str(time.time_ns()))}"
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .docker_manager import CommandRunner
|
|
18
|
+
from ..utils.console import xpk_print
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# NOTE(review): the two path constants below are not referenced by the code
# visible in this module - confirm where they are consumed.
xpk_gcloud_cfg_path = '~/gcloud/cfg'
xpk_deployment_dir = '/deployment'
# Command prefixes GclusterManager extends with paths and flags.
gcluster_deploy_command = 'gcluster deploy'
gcluster_create_command = 'gcluster create'
gcluster_destroy_command = 'gcluster destroy'
# NOTE(review): the remaining constants are not referenced by the code visible
# in this module - confirm where they are consumed.
blueprint_file_name = 'xpk_blueprint.yaml'
deployment_module = '/out/xpk-deployment'
a3_utils_dir_name = 'a3-mega-xpk'
config_map_repo_path = 'src/xpk/blueprints/a3-mega-xpk/config-map.yaml.tftpl'
kueue_config_repo_path = (
    'src/xpk/blueprints/a3-mega-xpk/kueue-xpk-configuration.yaml.tftpl'
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class GclusterManager:
  """Manager is a class responsible for running cluster toolkit commands.

  Attributes:
    - gcluster_command_runner (CommandRunner) : instance of class implementing CommandRunner abstract methods.
  Methods:
    - deploy : run a deployment process of cluster toolkit. This method will invoke gcluster create and then gcluster deploy commands.
    - destroy_deployment : run gcluster command to destroy existing deployment.
    - stage_files : upload a blueprint file and its dependencies to the runner's working directory.
  """

  def __init__(
      self,
      gcluster_command_runner: CommandRunner,
  ) -> None:
    self.gcluster_command_runner = gcluster_command_runner

  def _run_create_deployment_cmd(
      self, blueprint_container_path: str, prefix: str = ''
  ):
    """Run `gcluster create` for the blueprint, writing into the deployment path."""
    xpk_print('Creating deployment resources...')
    cluster_create_cmd = (
        f'{gcluster_create_command} -o {self._get_deployment_path(prefix)}'
        f' {blueprint_container_path} -w --force'
    )
    self.gcluster_command_runner.run_command(cluster_create_cmd)
    xpk_print('Creating deployment resources completed.')

  def _run_deploy_cmd(
      self,
      deployment_name: str,
      auto_approve: bool,
      dry_run: bool,
      prefix: str = '',
  ):
    """Run `gcluster deploy` for the named deployment.

    In dry-run mode the command is only printed, mirroring what
    _run_destroy_command does.
    """
    xpk_print('Deploying resources...')
    deploy_cmd = (
        f'{gcluster_deploy_command} {self._get_deployment_path(prefix)}/{deployment_name}'
    )
    if auto_approve:
      deploy_cmd += ' --auto-approve'
    # Truthiness rather than `is True`: a truthy non-bool dry_run must never
    # fall through to a real deployment.
    if dry_run:
      xpk_print(f'executing command {deploy_cmd}')
      return
    self.gcluster_command_runner.run_command(deploy_cmd)
    xpk_print('Deployment completed.')

  def deploy(
      self,
      blueprint_path: str,
      deployment_name: str,
      prefix: str = '',
      auto_approve: bool = True,
      dry_run: bool = False,
  ) -> None:
    """Provision a new cluster using Cluster Toolkit.

    It will invoke gcluster create and then gcluster deploy commands.
    The files staged or created during running gcluster command will be
    managed by gcluster_command_runner in its working directory.

    Args:
      blueprint_path (str): path pointing to blueprint which will be deployed.
      deployment_name (str): name of the deployment.
      prefix (str, optional): sub-directory of the deployments directory. Defaults to ''.
      auto_approve (bool, optional): If set to true deployment command will be auto approved. Currently only True is supported. Defaults to True.
      dry_run (bool, optional): If set to True the gcluster deploy command is not run (gcluster create still is). Defaults to False.
    Returns:
      None
    """
    xpk_print(f'Deploying blueprint from path {blueprint_path} ...')
    self._run_create_deployment_cmd(
        blueprint_container_path=blueprint_path, prefix=prefix
    )
    self._run_deploy_cmd(
        deployment_name=deployment_name,
        prefix=prefix,
        auto_approve=auto_approve,
        dry_run=dry_run,
    )
    xpk_print('Deploying blueprint completed!')

  def _run_destroy_command(
      self,
      deployment_name: str,
      prefix: str = '',
      auto_approve: bool = True,
      dry_run: bool = False,
  ):
    """Run `gcluster destroy` for the named deployment (only printed in dry-run mode)."""
    destroy_cmd = (
        f'{gcluster_destroy_command} {self._get_deployment_path(prefix)}/{deployment_name}'
    )
    if auto_approve:
      destroy_cmd += ' --auto-approve'
    # Truthiness rather than `is True`: a truthy non-bool dry_run must never
    # fall through to a real destroy.
    if dry_run:
      xpk_print(f'executing command {destroy_cmd}')
      return
    self.gcluster_command_runner.run_command(destroy_cmd)

  def _get_deployment_path(self, prefix: str = '') -> str:
    """Return 'deployments' or 'deployments/<prefix>' when a prefix is given."""
    prefix = f'/{prefix}' if prefix != '' else ''
    return f'deployments{prefix}'

  def destroy_deployment(self, deployment_name: str, prefix: str = '') -> None:
    """Destroy deployment.

    Args:
      deployment_name (str): name of deployment to destroy.
      prefix (str, optional): sub-directory of the deployments directory. Defaults to ''.
    """
    xpk_print(f'Destroying {deployment_name} started...')
    self._run_destroy_command(deployment_name, prefix=prefix)
    xpk_print(f'Destroying {deployment_name} completed!')

  def stage_files(
      self, blueprint_file: str, blueprint_dependencies: str, prefix: str = ''
  ) -> str:
    """Uploads blueprint file and directory to gcluster working directory.

    Args:
      blueprint_file (str): path of the blueprint file to upload.
      blueprint_dependencies (str): path of a directory to upload with the blueprint; skipped when empty.
      prefix (str, optional): prefix forwarded to the command runner. Defaults to ''.
    Returns:
      str: path of the blueprint file as returned by the command runner.
    """
    xpk_print(
        "Staging (sending) blueprint file to gcluster's working directory..."
    )
    staged_blueprint = self.gcluster_command_runner.upload_file_to_working_dir(
        blueprint_file, prefix
    )
    if blueprint_dependencies:
      self.gcluster_command_runner.upload_directory_to_working_dir(
          blueprint_dependencies, prefix
      )
    xpk_print('Staging blueprint completed!')
    xpk_print(f"File path in gcluster's working directory: {staged_blueprint}")
    return staged_blueprint
|
xpk/core/kjob.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from argparse import Namespace
|
|
18
|
+
from ..utils.console import xpk_print
|
|
19
|
+
from .commands import run_command_for_value, run_kubectl_apply, run_command_with_updates
|
|
20
|
+
from enum import Enum
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AppProfileDefaults(Enum):
  """Default settings of the ApplicationProfile created by this module."""

  # Value of metadata.name in the ApplicationProfile manifest.
  NAME = "xpk-def-app-profile"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class JobTemplateDefaults(Enum):
  """Default settings of the JobTemplate created by this module."""

  NAME = "xpk-def-batch"
  # PARALLELISM and COMPLETIONS share the value 1, so Enum treats COMPLETIONS
  # as an alias of PARALLELISM; reading `.value` still yields 1 for both.
  PARALLELISM = 1
  COMPLETIONS = 1
  CONTAINER_NAME = "xpk-batch-container"
  IMAGE = "ubuntu:22.04"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class PodTemplateDefaults(Enum):
  """Default settings of the PodTemplate created by this module."""

  NAME = "xpk-def-pod"
  CONTAINER_NAME = "xpk-interactive-container"
  IMAGE = "busybox:1.28"
  # Substituted into the `command: [...]` list of the PodTemplate manifest.
  INTERACTIVE_COMMAND = "/bin/sh"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
job_template_yaml = """
|
|
43
|
+
apiVersion: kjobctl.x-k8s.io/v1alpha1
|
|
44
|
+
kind: JobTemplate
|
|
45
|
+
metadata:
|
|
46
|
+
name: {name}
|
|
47
|
+
namespace: default
|
|
48
|
+
template:
|
|
49
|
+
spec:
|
|
50
|
+
parallelism: {parallelism}
|
|
51
|
+
completions: {completions}
|
|
52
|
+
completionMode: Indexed
|
|
53
|
+
template:
|
|
54
|
+
spec:
|
|
55
|
+
containers:
|
|
56
|
+
- name: {container_name}
|
|
57
|
+
image: {image}
|
|
58
|
+
restartPolicy: OnFailure"""
|
|
59
|
+
|
|
60
|
+
app_profile_yaml = """
|
|
61
|
+
apiVersion: kjobctl.x-k8s.io/v1alpha1
|
|
62
|
+
kind: ApplicationProfile
|
|
63
|
+
metadata:
|
|
64
|
+
name: {name}
|
|
65
|
+
namespace: default
|
|
66
|
+
spec:
|
|
67
|
+
supportedModes:
|
|
68
|
+
- name: Slurm
|
|
69
|
+
template: {batch_template}
|
|
70
|
+
requiredFlags: []
|
|
71
|
+
- name: Interactive
|
|
72
|
+
template: {interactive_template}
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
pod_template_yaml = """
|
|
76
|
+
apiVersion: v1
|
|
77
|
+
kind: PodTemplate
|
|
78
|
+
metadata:
|
|
79
|
+
name: {name}
|
|
80
|
+
namespace: default
|
|
81
|
+
template:
|
|
82
|
+
spec:
|
|
83
|
+
containers:
|
|
84
|
+
- name: {container_name}
|
|
85
|
+
image: {image}
|
|
86
|
+
command: [{interactive_command}]
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def verify_kjob_installed(args: Namespace) -> int:
  """Check if kjob is installed; if not, tell the user how to install it.

  The check runs `kubectl-kjob help` and inspects its return code.

  Args:
    args - user provided arguments.
  Returns:
    0 if the check command succeeded, otherwise its non-zero return code.
  """
  command = "kubectl-kjob help"
  task = "Verify kjob installation "
  verify_kjob_installed_code, _ = run_command_for_value(command, task, args)

  if verify_kjob_installed_code == 0:
    xpk_print("kjob found")
    return 0

  # Any non-zero code means the plugin could not be run.
  xpk_print(
      " kjob not found. Please follow"
      " https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md"
      " to install kjob."
  )
  return verify_kjob_installed_code
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def create_app_profile_instance(args: Namespace) -> int:
  """Apply the default ApplicationProfile manifest on the cluster.

  Args:
    args - user provided arguments
  Returns:
    the value returned by run_kubectl_apply (0 when the apply succeeded,
    an exit code > 0 otherwise)
  """
  placeholders = {
      "name": AppProfileDefaults.NAME.value,
      "batch_template": JobTemplateDefaults.NAME.value,
      "interactive_template": PodTemplateDefaults.NAME.value,
  }
  rendered_manifest = app_profile_yaml.format(**placeholders)
  return run_kubectl_apply(
      yml_string=rendered_manifest, task="Creating AppProfile", args=args
  )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def create_job_template_instance(args: Namespace) -> int:
  """Apply the default JobTemplate manifest on the cluster.

  Args:
    args - user provided arguments
  Returns:
    the value returned by run_kubectl_apply (0 when the apply succeeded,
    an exit code > 0 otherwise)
  """
  placeholders = {
      "name": JobTemplateDefaults.NAME.value,
      "parallelism": JobTemplateDefaults.PARALLELISM.value,
      "completions": JobTemplateDefaults.COMPLETIONS.value,
      "container_name": JobTemplateDefaults.CONTAINER_NAME.value,
      "image": JobTemplateDefaults.IMAGE.value,
  }
  rendered_manifest = job_template_yaml.format(**placeholders)
  return run_kubectl_apply(
      yml_string=rendered_manifest, task="Creating JobTemplate", args=args
  )
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def create_pod_template_instance(args: Namespace) -> int:
  """Apply the default PodTemplate manifest on the cluster.

  Args:
    args - user provided arguments
  Returns:
    the value returned by run_kubectl_apply (0 when the apply succeeded,
    an exit code > 0 otherwise)
  """
  placeholders = {
      "name": PodTemplateDefaults.NAME.value,
      "container_name": PodTemplateDefaults.CONTAINER_NAME.value,
      "image": PodTemplateDefaults.IMAGE.value,
      "interactive_command": PodTemplateDefaults.INTERACTIVE_COMMAND.value,
  }
  rendered_manifest = pod_template_yaml.format(**placeholders)
  return run_kubectl_apply(
      yml_string=rendered_manifest, task="Creating PodTemplate", args=args
  )
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def prepare_kjob(args) -> int:
  """Create the default JobTemplate, PodTemplate and AppProfile, in that order.

  Args:
    args - user provided arguments
  Returns:
    the first exit code > 0 reported while creating the JobTemplate or the
    PodTemplate; otherwise the result of creating the AppProfile
  """
  for create_template in (
      create_job_template_instance,
      create_pod_template_instance,
  ):
    err_code = create_template(args)
    if err_code > 0:
      # Stop at the first failing step.
      return err_code

  return create_app_profile_instance(args)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def apply_kjob_crds(args: Namespace) -> int:
  """Apply kjob CRDs on cluster.

  The output of `kubectl kjob printcrds` is piped into a server-side
  `kubectl apply`.

  Args:
    args - user provided arguments
  Returns:
    0 if the command succeeded, otherwise its non-zero return code
  """
  task = "Create kjob CRDs on cluster"
  return_code = run_command_with_updates(
      "kubectl kjob printcrds | kubectl apply --server-side -f -", task, args
  )
  if return_code == 0:
    xpk_print("Creating kjob CRDs succeeded")
    return 0

  xpk_print(f"{task} returned ERROR {return_code}")
  return return_code
|