xpk 0.17.1__py3-none-any.whl → 0.17.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +0 -22
- xpk/commands/cluster_gcluster.py +1 -13
- xpk/commands/cluster_gcluster_test.py +0 -10
- xpk/commands/cluster_test.py +0 -4
- xpk/commands/kind.py +0 -21
- xpk/commands/storage.py +0 -25
- xpk/core/cluster.py +1 -3
- xpk/core/config.py +0 -15
- xpk/core/system_characteristics.py +1 -16
- xpk/core/workload_decorators/rdma_decorator.py +0 -15
- xpk/core/workload_decorators/tcpx_decorator.py +0 -8
- xpk/core/workload_decorators/tcpx_decorator_test.py +0 -78
- xpk/core/workload_decorators/tcpxo_decorator.py +0 -16
- xpk/parser/common.py +0 -151
- xpk/parser/core.py +0 -31
- xpk/utils/validation.py +0 -8
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/METADATA +1 -1
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/RECORD +22 -33
- xpk/commands/batch.py +0 -144
- xpk/commands/job.py +0 -244
- xpk/commands/kjob_common.py +0 -60
- xpk/commands/run.py +0 -140
- xpk/commands/shell.py +0 -142
- xpk/core/kjob.py +0 -473
- xpk/parser/batch.py +0 -43
- xpk/parser/job.py +0 -147
- xpk/parser/run.py +0 -47
- xpk/parser/shell.py +0 -59
- xpk/templates/volume_bundle.yaml +0 -7
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/WHEEL +0 -0
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/entry_points.txt +0 -0
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.17.1.dist-info → xpk-0.17.3.dist-info}/top_level.txt +0 -0
xpk/core/kjob.py
DELETED
|
@@ -1,473 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Copyright 2024 Google LLC
|
|
3
|
-
|
|
4
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
you may not use this file except in compliance with the License.
|
|
6
|
-
You may obtain a copy of the License at
|
|
7
|
-
|
|
8
|
-
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
|
|
10
|
-
Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
See the License for the specific language governing permissions and
|
|
14
|
-
limitations under the License.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
from argparse import Namespace
|
|
18
|
-
from enum import Enum
|
|
19
|
-
|
|
20
|
-
import yaml
|
|
21
|
-
from kubernetes import client as k8s_client
|
|
22
|
-
from kubernetes.client import ApiClient
|
|
23
|
-
from kubernetes.client.rest import ApiException
|
|
24
|
-
|
|
25
|
-
from ..utils import templates
|
|
26
|
-
from ..utils.execution_context import is_dry_run
|
|
27
|
-
from ..utils.console import xpk_exit, xpk_print
|
|
28
|
-
from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
|
|
29
|
-
from .commands import (
|
|
30
|
-
run_command_for_value,
|
|
31
|
-
run_command_with_updates,
|
|
32
|
-
run_kubectl_apply,
|
|
33
|
-
)
|
|
34
|
-
from .config import (
|
|
35
|
-
KJOB_BATCH_IMAGE,
|
|
36
|
-
KJOB_BATCH_WORKING_DIRECTORY,
|
|
37
|
-
KJOB_SHELL_IMAGE,
|
|
38
|
-
KJOB_SHELL_INTERACTIVE_COMMAND,
|
|
39
|
-
KJOB_SHELL_WORKING_DIRECTORY,
|
|
40
|
-
get_config,
|
|
41
|
-
)
|
|
42
|
-
from .network import get_cluster_subnetworks
|
|
43
|
-
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
44
|
-
from .resources import get_cluster_system_characteristics
|
|
45
|
-
from .storage import (
|
|
46
|
-
GCS_FUSE_ANNOTATIONS,
|
|
47
|
-
PARALLELSTORE_ANNOTATIONS,
|
|
48
|
-
get_auto_mount_gcsfuse_storages,
|
|
49
|
-
get_auto_mount_parallelstore_storages,
|
|
50
|
-
get_auto_mount_storages,
|
|
51
|
-
)
|
|
52
|
-
from .workload_decorators import (
|
|
53
|
-
rdma_decorator,
|
|
54
|
-
tcpxo_decorator,
|
|
55
|
-
)
|
|
56
|
-
from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
|
|
57
|
-
|
|
58
|
-
KJOB_API_GROUP_NAME = "kjobctl.x-k8s.io"
|
|
59
|
-
KJOB_API_GROUP_VERSION = "v1alpha1"
|
|
60
|
-
KJOB_API_VOLUME_BUNDLE_PLURAL = "volumebundles"
|
|
61
|
-
VOLUME_BUNDLE_TEMPLATE_PATH = "/../templates/volume_bundle.yaml"
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
class AppProfileDefaults(Enum):
|
|
65
|
-
NAME = "xpk-def-app-profile"
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class JobTemplateDefaults(Enum):
|
|
69
|
-
NAME = "xpk-def-batch"
|
|
70
|
-
PARALLELISM = 1
|
|
71
|
-
COMPLETIONS = 1
|
|
72
|
-
CONTAINER_NAME = "xpk-batch-container"
|
|
73
|
-
IMAGE = "ubuntu:22.04"
|
|
74
|
-
WORKING_DIRECTORY = "/"
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
class PodTemplateDefaults(Enum):
|
|
78
|
-
NAME = "xpk-def-pod"
|
|
79
|
-
CONTAINER_NAME = "xpk-interactive-container"
|
|
80
|
-
IMAGE = "busybox:1.28"
|
|
81
|
-
WORKING_DIRECTORY = "/"
|
|
82
|
-
INTERACTIVE_COMMAND = "/bin/sh"
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
job_template_yaml = """
|
|
86
|
-
apiVersion: kjobctl.x-k8s.io/v1alpha1
|
|
87
|
-
kind: JobTemplate
|
|
88
|
-
metadata:
|
|
89
|
-
name: {name}
|
|
90
|
-
namespace: default
|
|
91
|
-
template:
|
|
92
|
-
spec:
|
|
93
|
-
parallelism: {parallelism}
|
|
94
|
-
completions: {completions}
|
|
95
|
-
completionMode: Indexed
|
|
96
|
-
template:
|
|
97
|
-
spec:
|
|
98
|
-
dnsPolicy: ClusterFirstWithHostNet
|
|
99
|
-
tolerations:
|
|
100
|
-
- operator: "Exists"
|
|
101
|
-
key: nvidia.com/gpu
|
|
102
|
-
containers:
|
|
103
|
-
- name: {container_name}
|
|
104
|
-
image: {image}
|
|
105
|
-
workingDir: {working_directory}
|
|
106
|
-
{resources}
|
|
107
|
-
{node_selector}
|
|
108
|
-
priorityClassName: {priority}
|
|
109
|
-
restartPolicy: OnFailure
|
|
110
|
-
serviceAccountName: {service_account}
|
|
111
|
-
"""
|
|
112
|
-
job_node_selector_template = """
|
|
113
|
-
nodeSelector:
|
|
114
|
-
cloud.google.com/gke-accelerator: {gpu_name}
|
|
115
|
-
"""
|
|
116
|
-
job_resources_template = """
|
|
117
|
-
resources:
|
|
118
|
-
limits:
|
|
119
|
-
nvidia.com/gpu: {gpu_per_node}
|
|
120
|
-
"""
|
|
121
|
-
|
|
122
|
-
app_profile_yaml = """
|
|
123
|
-
apiVersion: kjobctl.x-k8s.io/v1alpha1
|
|
124
|
-
kind: ApplicationProfile
|
|
125
|
-
metadata:
|
|
126
|
-
name: {name}
|
|
127
|
-
namespace: default
|
|
128
|
-
spec:
|
|
129
|
-
supportedModes:
|
|
130
|
-
- name: Slurm
|
|
131
|
-
template: {batch_template}
|
|
132
|
-
requiredFlags: []
|
|
133
|
-
- name: Interactive
|
|
134
|
-
template: {interactive_template}
|
|
135
|
-
volumeBundles: {volume_bundles}
|
|
136
|
-
"""
|
|
137
|
-
|
|
138
|
-
pod_template_yaml = """
|
|
139
|
-
apiVersion: v1
|
|
140
|
-
kind: PodTemplate
|
|
141
|
-
metadata:
|
|
142
|
-
name: {name}
|
|
143
|
-
namespace: default
|
|
144
|
-
template:
|
|
145
|
-
spec:
|
|
146
|
-
tolerations:
|
|
147
|
-
- effect: NoSchedule
|
|
148
|
-
key: components.gke.io/gke-managed-components
|
|
149
|
-
operator: Equal
|
|
150
|
-
value: "true"
|
|
151
|
-
containers:
|
|
152
|
-
- name: {container_name}
|
|
153
|
-
image: {image}
|
|
154
|
-
command: [{interactive_command}]
|
|
155
|
-
workingDir: {working_directory}
|
|
156
|
-
initContainers:
|
|
157
|
-
- name: init
|
|
158
|
-
image: {image}
|
|
159
|
-
command: ['/bin/mkdir', '-p', '{working_directory}']
|
|
160
|
-
serviceAccountName: {service_account}
|
|
161
|
-
"""
|
|
162
|
-
|
|
163
|
-
Kueue_TAS_annotation = "kueue.x-k8s.io/podset-preferred-topology=cloud.google.com/gce-topology-host"
|
|
164
|
-
|
|
165
|
-
default_interface_annotation = "networking.gke.io/default-interface=eth0"
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
def get_a4_pod_template_annotations() -> tuple[str, str]:
|
|
169
|
-
sub_networks = get_cluster_subnetworks()
|
|
170
|
-
interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
|
|
171
|
-
sub_networks
|
|
172
|
-
)
|
|
173
|
-
|
|
174
|
-
return (
|
|
175
|
-
default_interface_annotation,
|
|
176
|
-
f"{interfaces_key}=$'{interfaces_value}'",
|
|
177
|
-
)
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
def get_a3ultra_pod_template_annotations() -> tuple[str, str]:
|
|
181
|
-
sub_networks = get_cluster_subnetworks()
|
|
182
|
-
interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
|
|
183
|
-
sub_networks
|
|
184
|
-
)
|
|
185
|
-
|
|
186
|
-
return (
|
|
187
|
-
default_interface_annotation,
|
|
188
|
-
f"{interfaces_key}=$'{interfaces_value}'",
|
|
189
|
-
)
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
def get_a3mega_pod_template_annotations() -> tuple[str, str, str]:
|
|
193
|
-
"""Adds or updates annotations in the Pod template."""
|
|
194
|
-
sub_networks = get_cluster_subnetworks()
|
|
195
|
-
tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
|
|
196
|
-
interfaces_key, interfaces_value = tcpxo_decorator.get_interfaces_entry(
|
|
197
|
-
sub_networks
|
|
198
|
-
)
|
|
199
|
-
tcpxo = f"{tcpxo_deamon_key}=$'{tcpxo_deamon_paths}'"
|
|
200
|
-
interfaces = f"{interfaces_key}=$'{interfaces_value}'"
|
|
201
|
-
return tcpxo, interfaces, default_interface_annotation
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def verify_kjob_installed() -> int:
|
|
205
|
-
"""Check if kjob is installed. If not provide user with proper communicate and exit.
|
|
206
|
-
Returns:
|
|
207
|
-
error code > if kjob not installed, otherwise 0
|
|
208
|
-
"""
|
|
209
|
-
command = "kubectl-kjob help"
|
|
210
|
-
task = "Verify kjob installation "
|
|
211
|
-
verify_kjob_installed_code, _ = run_command_for_value(command, task)
|
|
212
|
-
|
|
213
|
-
if verify_kjob_installed_code == 0:
|
|
214
|
-
xpk_print("kjob found")
|
|
215
|
-
return 0
|
|
216
|
-
|
|
217
|
-
if verify_kjob_installed_code != 0:
|
|
218
|
-
xpk_print(
|
|
219
|
-
" kjob not found. Please follow"
|
|
220
|
-
" https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md"
|
|
221
|
-
" to install kjob."
|
|
222
|
-
)
|
|
223
|
-
return verify_kjob_installed_code
|
|
224
|
-
return 0
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
def get_pod_template_interactive_command() -> str:
|
|
228
|
-
"""Gets the interactive command for PodTemplate from config otherwise the default value.
|
|
229
|
-
|
|
230
|
-
Args:
|
|
231
|
-
args - user provided arguments
|
|
232
|
-
Returns:
|
|
233
|
-
str - PodTemplate's interactive command
|
|
234
|
-
"""
|
|
235
|
-
pod_command = get_config().get(KJOB_SHELL_INTERACTIVE_COMMAND)
|
|
236
|
-
if pod_command is None or len(pod_command) == 0:
|
|
237
|
-
pod_command = PodTemplateDefaults.INTERACTIVE_COMMAND.value
|
|
238
|
-
|
|
239
|
-
return pod_command
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
def create_app_profile_instance(volume_bundles: list[str]) -> int:
|
|
243
|
-
"""Create new AppProfile instance on cluster with default settings.
|
|
244
|
-
|
|
245
|
-
Args:
|
|
246
|
-
args - user provided arguments
|
|
247
|
-
Returns:
|
|
248
|
-
exit_code > 0 if creating AppProfile fails, 0 otherwise
|
|
249
|
-
"""
|
|
250
|
-
return run_kubectl_apply(
|
|
251
|
-
yml_string=app_profile_yaml.format(
|
|
252
|
-
name=AppProfileDefaults.NAME.value,
|
|
253
|
-
batch_template=JobTemplateDefaults.NAME.value,
|
|
254
|
-
interactive_template=PodTemplateDefaults.NAME.value,
|
|
255
|
-
volume_bundles=volume_bundles,
|
|
256
|
-
),
|
|
257
|
-
task="Creating AppProfile",
|
|
258
|
-
)
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
def decorate_job_template_with_gpu(
|
|
262
|
-
yml_string: str, system: SystemCharacteristics
|
|
263
|
-
) -> str:
|
|
264
|
-
job_spec = yaml.safe_load(yml_string)["template"]
|
|
265
|
-
kjob_decorator = (
|
|
266
|
-
system.gpu_config.kjob_decorator_fn
|
|
267
|
-
if system.gpu_config and system.gpu_config.kjob_decorator_fn
|
|
268
|
-
else None
|
|
269
|
-
)
|
|
270
|
-
if kjob_decorator:
|
|
271
|
-
job_spec = kjob_decorator(job_spec)
|
|
272
|
-
job_template_dict = yaml.safe_load(yml_string)
|
|
273
|
-
job_template_dict["template"] = job_spec
|
|
274
|
-
yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
|
|
275
|
-
return yaml_result
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
def create_job_template_instance(
|
|
279
|
-
args: Namespace,
|
|
280
|
-
system: SystemCharacteristics | None,
|
|
281
|
-
service_account: str,
|
|
282
|
-
) -> int:
|
|
283
|
-
"""Create new JobTemplate instance on cluster with default settings.
|
|
284
|
-
|
|
285
|
-
Args:
|
|
286
|
-
args - user provided arguments
|
|
287
|
-
Returns:
|
|
288
|
-
exit_code > 0 if creating JobTemplate fails, 0 otherwise
|
|
289
|
-
"""
|
|
290
|
-
job_image = get_config().get(KJOB_BATCH_IMAGE)
|
|
291
|
-
if job_image is None or len(job_image) == 0:
|
|
292
|
-
job_image = JobTemplateDefaults.IMAGE.value
|
|
293
|
-
working_directory = get_config().get(KJOB_BATCH_WORKING_DIRECTORY)
|
|
294
|
-
if working_directory is None or len(working_directory) == 0:
|
|
295
|
-
working_directory = JobTemplateDefaults.WORKING_DIRECTORY.value
|
|
296
|
-
resources = (
|
|
297
|
-
job_resources_template.format(gpu_per_node=system.chips_per_vm)
|
|
298
|
-
if system is not None and system.accelerator_type == AcceleratorType.GPU
|
|
299
|
-
else ""
|
|
300
|
-
)
|
|
301
|
-
|
|
302
|
-
node_selector = (
|
|
303
|
-
job_node_selector_template.format(gpu_name=system.gke_accelerator)
|
|
304
|
-
if system is not None and system.accelerator_type == AcceleratorType.GPU
|
|
305
|
-
else ""
|
|
306
|
-
)
|
|
307
|
-
yml_string = job_template_yaml.format(
|
|
308
|
-
name=JobTemplateDefaults.NAME.value,
|
|
309
|
-
parallelism=JobTemplateDefaults.PARALLELISM.value,
|
|
310
|
-
completions=JobTemplateDefaults.COMPLETIONS.value,
|
|
311
|
-
container_name=JobTemplateDefaults.CONTAINER_NAME.value,
|
|
312
|
-
image=job_image,
|
|
313
|
-
working_directory=working_directory,
|
|
314
|
-
resources=resources,
|
|
315
|
-
node_selector=node_selector,
|
|
316
|
-
priority=args.priority if hasattr(args, "priority") else "medium",
|
|
317
|
-
service_account=service_account,
|
|
318
|
-
)
|
|
319
|
-
if system is not None and system.accelerator_type == AcceleratorType.GPU:
|
|
320
|
-
yml_string = decorate_job_template_with_gpu(yml_string, system)
|
|
321
|
-
|
|
322
|
-
return run_kubectl_apply(
|
|
323
|
-
yml_string,
|
|
324
|
-
task="Creating JobTemplate",
|
|
325
|
-
)
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
def create_pod_template_instance(service_account: str) -> int:
|
|
329
|
-
"""Create new PodTemplate instance on cluster with default settings.
|
|
330
|
-
|
|
331
|
-
Returns:
|
|
332
|
-
exit_code > 0 if creating PodTemplate fails, 0 otherwise
|
|
333
|
-
"""
|
|
334
|
-
pod_image = get_config().get(KJOB_SHELL_IMAGE)
|
|
335
|
-
if pod_image is None or len(pod_image) == 0:
|
|
336
|
-
pod_image = PodTemplateDefaults.IMAGE.value
|
|
337
|
-
working_directory = get_config().get(KJOB_SHELL_WORKING_DIRECTORY)
|
|
338
|
-
if working_directory is None or len(working_directory) == 0:
|
|
339
|
-
working_directory = PodTemplateDefaults.WORKING_DIRECTORY.value
|
|
340
|
-
|
|
341
|
-
return run_kubectl_apply(
|
|
342
|
-
yml_string=pod_template_yaml.format(
|
|
343
|
-
name=PodTemplateDefaults.NAME.value,
|
|
344
|
-
container_name=PodTemplateDefaults.CONTAINER_NAME.value,
|
|
345
|
-
image=pod_image,
|
|
346
|
-
working_directory=working_directory,
|
|
347
|
-
interactive_command=get_pod_template_interactive_command(),
|
|
348
|
-
service_account=service_account,
|
|
349
|
-
),
|
|
350
|
-
task="Creating PodTemplate",
|
|
351
|
-
)
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
def prepare_kjob(args: Namespace) -> int:
|
|
355
|
-
system = get_cluster_system_characteristics(args)
|
|
356
|
-
|
|
357
|
-
storages = []
|
|
358
|
-
if not is_dry_run():
|
|
359
|
-
k8s_api_client = setup_k8s_env(args)
|
|
360
|
-
storages = get_auto_mount_storages(k8s_api_client)
|
|
361
|
-
|
|
362
|
-
service_account = ""
|
|
363
|
-
if len(storages) > 0:
|
|
364
|
-
service_account = XPK_SA
|
|
365
|
-
|
|
366
|
-
job_err_code = create_job_template_instance(args, system, service_account)
|
|
367
|
-
if job_err_code > 0:
|
|
368
|
-
return job_err_code
|
|
369
|
-
pod_err_code = create_pod_template_instance(service_account)
|
|
370
|
-
if pod_err_code > 0:
|
|
371
|
-
return pod_err_code
|
|
372
|
-
|
|
373
|
-
volume_bundles = [item.name for item in storages]
|
|
374
|
-
|
|
375
|
-
return create_app_profile_instance(volume_bundles)
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
def apply_kjob_crds() -> int:
|
|
379
|
-
"""Apply kjob CRDs on cluster.
|
|
380
|
-
|
|
381
|
-
This function install kjob CRDs files from kjobctl printcrds.
|
|
382
|
-
It creates all neccessary kjob CRDs.
|
|
383
|
-
|
|
384
|
-
Returns:
|
|
385
|
-
None
|
|
386
|
-
"""
|
|
387
|
-
command = "kubectl kjob printcrds | kubectl apply --server-side -f -"
|
|
388
|
-
task = "Create kjob CRDs on cluster"
|
|
389
|
-
return_code = run_command_with_updates(command, task)
|
|
390
|
-
if return_code != 0:
|
|
391
|
-
xpk_print(f"{task} returned ERROR {return_code}")
|
|
392
|
-
return return_code
|
|
393
|
-
xpk_print("Creating kjob CRDs succeeded")
|
|
394
|
-
return 0
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
def create_volume_bundle_instance(
|
|
398
|
-
k8s_api_client: ApiClient,
|
|
399
|
-
name: str,
|
|
400
|
-
manifest: list[dict],
|
|
401
|
-
readonly: bool,
|
|
402
|
-
mount_point: str,
|
|
403
|
-
) -> None:
|
|
404
|
-
"""
|
|
405
|
-
Creates a new VolumeBundle resource in the Kubernetes cluster.
|
|
406
|
-
|
|
407
|
-
This function reads a VolumeBundle template from a YAML file, populates it with
|
|
408
|
-
values from the provided arguments, and then creates the VolumeBundle object
|
|
409
|
-
in the cluster.
|
|
410
|
-
|
|
411
|
-
Args:
|
|
412
|
-
k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
|
|
413
|
-
args: An argparse Namespace object containing the arguments for creating
|
|
414
|
-
the Storage resource.
|
|
415
|
-
"""
|
|
416
|
-
data = templates.load(VOLUME_BUNDLE_TEMPLATE_PATH)
|
|
417
|
-
data["metadata"]["name"] = name
|
|
418
|
-
spec = data["spec"]
|
|
419
|
-
spec["volumes"] = []
|
|
420
|
-
spec["containerVolumeMounts"] = []
|
|
421
|
-
|
|
422
|
-
for obj in manifest:
|
|
423
|
-
if obj["kind"] == "PersistentVolumeClaim":
|
|
424
|
-
spec["volumes"].append({
|
|
425
|
-
"name": obj["metadata"]["name"],
|
|
426
|
-
"persistentVolumeClaim": {
|
|
427
|
-
"claimName": obj["metadata"]["name"],
|
|
428
|
-
"readOnly": readonly,
|
|
429
|
-
},
|
|
430
|
-
})
|
|
431
|
-
spec["containerVolumeMounts"].append({
|
|
432
|
-
"name": obj["metadata"]["name"],
|
|
433
|
-
"mountPath": mount_point,
|
|
434
|
-
})
|
|
435
|
-
|
|
436
|
-
data["spec"] = spec
|
|
437
|
-
|
|
438
|
-
api_instance = k8s_client.CustomObjectsApi(k8s_api_client)
|
|
439
|
-
try:
|
|
440
|
-
api_instance.create_namespaced_custom_object(
|
|
441
|
-
namespace=DEFAULT_NAMESPACE,
|
|
442
|
-
group=KJOB_API_GROUP_NAME,
|
|
443
|
-
version=KJOB_API_GROUP_VERSION,
|
|
444
|
-
plural=KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
445
|
-
body=data,
|
|
446
|
-
)
|
|
447
|
-
xpk_print(
|
|
448
|
-
f"Created {KJOB_API_VOLUME_BUNDLE_PLURAL}.{KJOB_API_GROUP_NAME} object:"
|
|
449
|
-
f" {data['metadata']['name']}"
|
|
450
|
-
)
|
|
451
|
-
except ApiException as e:
|
|
452
|
-
if e.status == 409:
|
|
453
|
-
xpk_print(f"VolumeBundle: {name} already exists. Skipping its creation")
|
|
454
|
-
else:
|
|
455
|
-
xpk_print(f"Encountered error during VolumeBundle creation: {e}")
|
|
456
|
-
xpk_exit(1)
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
def get_storage_annotations(args: Namespace) -> list[str]:
|
|
460
|
-
annotations = []
|
|
461
|
-
k8s_api_client = setup_k8s_env(args)
|
|
462
|
-
|
|
463
|
-
gcsfuse_storages = get_auto_mount_gcsfuse_storages(k8s_api_client)
|
|
464
|
-
if len(gcsfuse_storages) > 0:
|
|
465
|
-
for key, value in GCS_FUSE_ANNOTATIONS.items():
|
|
466
|
-
annotations.append(f"{key}={value}")
|
|
467
|
-
|
|
468
|
-
parallelstore_storages = get_auto_mount_parallelstore_storages(k8s_api_client)
|
|
469
|
-
if len(parallelstore_storages) > 0:
|
|
470
|
-
for key, value in PARALLELSTORE_ANNOTATIONS.items():
|
|
471
|
-
annotations.append(f"{key}={value}")
|
|
472
|
-
|
|
473
|
-
return annotations
|
xpk/parser/batch.py
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Copyright 2024 Google LLC
|
|
3
|
-
|
|
4
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
you may not use this file except in compliance with the License.
|
|
6
|
-
You may obtain a copy of the License at
|
|
7
|
-
|
|
8
|
-
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
|
|
10
|
-
Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
See the License for the specific language governing permissions and
|
|
14
|
-
limitations under the License.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
from .common import (
|
|
18
|
-
add_shared_arguments,
|
|
19
|
-
add_slurm_arguments,
|
|
20
|
-
add_cluster_arguments,
|
|
21
|
-
add_kind_cluster_arguments,
|
|
22
|
-
)
|
|
23
|
-
from ..commands.batch import batch
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def set_batch_parser(batch_parser):
|
|
27
|
-
batch_required_arguments = batch_parser.add_argument_group(
|
|
28
|
-
'batch Built-in Arguments', 'Arguments required for `batch`.'
|
|
29
|
-
)
|
|
30
|
-
batch_optional_arguments = batch_parser.add_argument_group(
|
|
31
|
-
'Optional Arguments', 'Arguments optional for `batch`.'
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
### "batch" Required arguments
|
|
35
|
-
batch_required_arguments.add_argument(
|
|
36
|
-
'script', help='script with batch task to run'
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
add_cluster_arguments(batch_optional_arguments)
|
|
40
|
-
add_kind_cluster_arguments(batch_optional_arguments)
|
|
41
|
-
add_shared_arguments(batch_optional_arguments)
|
|
42
|
-
add_slurm_arguments(batch_optional_arguments)
|
|
43
|
-
batch_parser.set_defaults(func=batch)
|
xpk/parser/job.py
DELETED
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Copyright 2024 Google LLC
|
|
3
|
-
|
|
4
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
you may not use this file except in compliance with the License.
|
|
6
|
-
You may obtain a copy of the License at
|
|
7
|
-
|
|
8
|
-
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
|
|
10
|
-
Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
See the License for the specific language governing permissions and
|
|
14
|
-
limitations under the License.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
import argparse
|
|
18
|
-
from ..commands.job import job_info, job_list, job_cancel
|
|
19
|
-
|
|
20
|
-
from .common import add_shared_arguments
|
|
21
|
-
from .validators import name_type
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def set_job_parser(job_parser: argparse.ArgumentParser):
|
|
25
|
-
job_subcommands = job_parser.add_subparsers(
|
|
26
|
-
title='job subcommands',
|
|
27
|
-
dest='xpk_job_subcommands',
|
|
28
|
-
help=(
|
|
29
|
-
'These are commands related to job management. Look at help for'
|
|
30
|
-
' specific subcommands for more details.'
|
|
31
|
-
),
|
|
32
|
-
)
|
|
33
|
-
set_job_info_parser(
|
|
34
|
-
job_info_parser=job_subcommands.add_parser(
|
|
35
|
-
'info', help='Show information about specified job.'
|
|
36
|
-
)
|
|
37
|
-
)
|
|
38
|
-
set_job_list_parser(
|
|
39
|
-
job_list_parser=job_subcommands.add_parser('ls', help='List jobs.')
|
|
40
|
-
)
|
|
41
|
-
set_job_cancel_parser(
|
|
42
|
-
job_cancel_parser=job_subcommands.add_parser(
|
|
43
|
-
'cancel', help='Cancel job execution.'
|
|
44
|
-
)
|
|
45
|
-
)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def set_job_info_parser(job_info_parser: argparse.ArgumentParser):
|
|
49
|
-
job_info_required_arguments = job_info_parser.add_argument_group(
|
|
50
|
-
'Required arguments',
|
|
51
|
-
'The basic information required to identify the job.',
|
|
52
|
-
)
|
|
53
|
-
job_info_optional_arguments = job_info_parser.add_argument_group(
|
|
54
|
-
'Optional Arguments', 'Arguments optional for job info.'
|
|
55
|
-
)
|
|
56
|
-
|
|
57
|
-
### Required arguments
|
|
58
|
-
job_info_required_arguments.add_argument(
|
|
59
|
-
'--cluster',
|
|
60
|
-
type=name_type,
|
|
61
|
-
default=None,
|
|
62
|
-
help='The name of the cluster to info jobs on.',
|
|
63
|
-
required=True,
|
|
64
|
-
)
|
|
65
|
-
|
|
66
|
-
job_info_optional_arguments.add_argument(
|
|
67
|
-
'--kind-cluster',
|
|
68
|
-
type=bool,
|
|
69
|
-
action=argparse.BooleanOptionalAction,
|
|
70
|
-
default=False,
|
|
71
|
-
help='Apply command to a local test cluster.',
|
|
72
|
-
)
|
|
73
|
-
job_info_required_arguments.add_argument(
|
|
74
|
-
'name',
|
|
75
|
-
type=str,
|
|
76
|
-
default=None,
|
|
77
|
-
help='Name of the job.',
|
|
78
|
-
)
|
|
79
|
-
job_info_parser.set_defaults(func=job_info)
|
|
80
|
-
add_shared_arguments(job_info_parser)
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def set_job_list_parser(job_list_parser: argparse.ArgumentParser):
|
|
84
|
-
job_list_required_arguments = job_list_parser.add_argument_group(
|
|
85
|
-
'Required Arguments',
|
|
86
|
-
'Arguments required for job list.',
|
|
87
|
-
)
|
|
88
|
-
job_list_optional_arguments = job_list_parser.add_argument_group(
|
|
89
|
-
'Optional Arguments', 'Arguments optional for job list.'
|
|
90
|
-
)
|
|
91
|
-
|
|
92
|
-
### Required arguments
|
|
93
|
-
job_list_required_arguments.add_argument(
|
|
94
|
-
'--cluster',
|
|
95
|
-
type=name_type,
|
|
96
|
-
default=None,
|
|
97
|
-
help='The name of the cluster to list jobs on.',
|
|
98
|
-
required=True,
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
job_list_optional_arguments.add_argument(
|
|
102
|
-
'--kind-cluster',
|
|
103
|
-
type=bool,
|
|
104
|
-
action=argparse.BooleanOptionalAction,
|
|
105
|
-
default=False,
|
|
106
|
-
help='Apply command to a local test cluster.',
|
|
107
|
-
)
|
|
108
|
-
|
|
109
|
-
job_list_parser.set_defaults(func=job_list)
|
|
110
|
-
add_shared_arguments(job_list_optional_arguments)
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def set_job_cancel_parser(job_cancel_parser: argparse.ArgumentParser):
|
|
114
|
-
job_cancel_required_arguments = job_cancel_parser.add_argument_group(
|
|
115
|
-
'Required Arguments',
|
|
116
|
-
'Arguments required for job cancel.',
|
|
117
|
-
)
|
|
118
|
-
job_cancel_optional_arguments = job_cancel_parser.add_argument_group(
|
|
119
|
-
'Optional Arguments', 'Arguments optional for job cancel.'
|
|
120
|
-
)
|
|
121
|
-
|
|
122
|
-
job_cancel_required_arguments.add_argument(
|
|
123
|
-
'name',
|
|
124
|
-
type=str,
|
|
125
|
-
default=None,
|
|
126
|
-
help='The name of the job to be cancelled.',
|
|
127
|
-
nargs='+',
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
job_cancel_required_arguments.add_argument(
|
|
131
|
-
'--cluster',
|
|
132
|
-
type=name_type,
|
|
133
|
-
default=None,
|
|
134
|
-
help='The name of the cluster to delete the job on.',
|
|
135
|
-
required=True,
|
|
136
|
-
)
|
|
137
|
-
|
|
138
|
-
job_cancel_optional_arguments.add_argument(
|
|
139
|
-
'--kind-cluster',
|
|
140
|
-
type=bool,
|
|
141
|
-
action=argparse.BooleanOptionalAction,
|
|
142
|
-
default=False,
|
|
143
|
-
help='Apply command to a local test cluster.',
|
|
144
|
-
)
|
|
145
|
-
|
|
146
|
-
job_cancel_parser.set_defaults(func=job_cancel)
|
|
147
|
-
add_shared_arguments(job_cancel_optional_arguments)
|
xpk/parser/run.py
DELETED
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Copyright 2025 Google LLC
|
|
3
|
-
|
|
4
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
you may not use this file except in compliance with the License.
|
|
6
|
-
You may obtain a copy of the License at
|
|
7
|
-
|
|
8
|
-
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
|
|
10
|
-
Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
See the License for the specific language governing permissions and
|
|
14
|
-
limitations under the License.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
from ..commands.run import run
|
|
18
|
-
from .common import (
|
|
19
|
-
add_shared_arguments,
|
|
20
|
-
add_slurm_arguments,
|
|
21
|
-
add_cluster_arguments,
|
|
22
|
-
add_kind_cluster_arguments,
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def set_run_parser(run_parser):
|
|
27
|
-
run_required_arguments = run_parser.add_argument_group(
|
|
28
|
-
'Required Arguments', 'Arguments required for `run`.'
|
|
29
|
-
)
|
|
30
|
-
run_optional_arguments = run_parser.add_argument_group(
|
|
31
|
-
'Optional Arguments', 'Arguments optional for `run`.'
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
run_required_arguments.add_argument('script', help='script with task to run')
|
|
35
|
-
run_optional_arguments.add_argument(
|
|
36
|
-
'--timeout',
|
|
37
|
-
type=int,
|
|
38
|
-
default=None,
|
|
39
|
-
help='Amount of time to wait for job in seconds',
|
|
40
|
-
required=False,
|
|
41
|
-
)
|
|
42
|
-
|
|
43
|
-
add_cluster_arguments(run_optional_arguments)
|
|
44
|
-
add_kind_cluster_arguments(run_optional_arguments)
|
|
45
|
-
add_slurm_arguments(run_optional_arguments)
|
|
46
|
-
add_shared_arguments(run_parser)
|
|
47
|
-
run_parser.set_defaults(func=run)
|