xpk 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/commands/storage.py CHANGED
@@ -23,7 +23,6 @@ from kubernetes.client.rest import ApiException
23
23
 
24
24
  from ..core import gcsfuse
25
25
  from ..core.cluster import (
26
- DEFAULT_NAMESPACE,
27
26
  add_zone_and_project,
28
27
  get_cluster_network,
29
28
  setup_k8s_env,
@@ -35,12 +34,6 @@ from ..core.cluster import (
35
34
  update_cluster_with_workload_identity_if_necessary,
36
35
  )
37
36
  from ..core.filestore import FilestoreClient, get_storage_class_name
38
- from ..core.kjob import (
39
- KJOB_API_GROUP_NAME,
40
- KJOB_API_GROUP_VERSION,
41
- KJOB_API_VOLUME_BUNDLE_PLURAL,
42
- create_volume_bundle_instance,
43
- )
44
37
  from ..core.storage import (
45
38
  GCP_FILESTORE_TYPE,
46
39
  GCS_FUSE_TYPE,
@@ -98,9 +91,6 @@ def storage_create(args: Namespace) -> None:
98
91
 
99
92
  k8s_api_client = setup_k8s_env(args)
100
93
  create_storage_crds(k8s_api_client, args, manifest)
101
- create_volume_bundle_instance(
102
- k8s_api_client, args.name, manifest, args.readonly, args.mount_point
103
- )
104
94
  # Not required for Filestore. Will be uncommented when adding GCSFuse create
105
95
  # return_code = update_cluster_with_workload_identity_if_necessary(args)
106
96
  # if return_code > 0:
@@ -214,9 +204,6 @@ def storage_attach(args: Namespace) -> None:
214
204
 
215
205
  k8s_api_client = setup_k8s_env(args)
216
206
  create_storage_crds(k8s_api_client, args, manifest)
217
- create_volume_bundle_instance(
218
- k8s_api_client, args.name, manifest, args.readonly, args.mount_point
219
- )
220
207
 
221
208
  enable_csi_drivers_if_necessary(args)
222
209
 
@@ -332,18 +319,6 @@ def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
332
319
  "Storage Class",
333
320
  )
334
321
 
335
- delete_resource(
336
- lambda name: api_instance.delete_namespaced_custom_object(
337
- namespace=DEFAULT_NAMESPACE,
338
- name=name,
339
- group=KJOB_API_GROUP_NAME,
340
- version=KJOB_API_GROUP_VERSION,
341
- plural=KJOB_API_VOLUME_BUNDLE_PLURAL,
342
- ),
343
- storage.name,
344
- "VolumeBundle",
345
- )
346
-
347
322
  delete_resource(
348
323
  lambda name: api_instance.delete_cluster_custom_object(
349
324
  name=name,
xpk/commands/workload.py CHANGED
@@ -493,6 +493,7 @@ def workload_create(args) -> None:
493
493
  podFailurePolicy:
494
494
  rules:
495
495
  - action: FailJob
496
+ onPodConditions: []
496
497
  onExitCodes:
497
498
  containerName: {get_main_container_docker_image(args, workload_system)}
498
499
  operator: NotIn
xpk/core/cluster.py CHANGED
@@ -717,10 +717,8 @@ def get_cluster_credentials(args) -> int:
717
717
  location=location,
718
718
  dns_endpoint=True,
719
719
  )
720
- if return_code != 0:
721
- return return_code
722
720
 
723
- if not _are_credentials_valid():
721
+ if return_code != 0 or not _are_credentials_valid():
724
722
  xpk_print('Detected error. Retrying without --dns-endpoint flag...')
725
723
  return_code = _get_credentials(
726
724
  project=args.project,
xpk/parser/common.py CHANGED
@@ -180,157 +180,6 @@ def add_global_arguments(custom_parser_or_group: ParserOrArgumentGroup):
180
180
  )
181
181
 
182
182
 
183
- def add_slurm_arguments(custom_parser_or_group: ParserOrArgumentGroup):
184
- """Add Slurm job arguments to the parser.
185
-
186
- Args:
187
- custom_parser_or_group: parser or argument group to add global arguments to.
188
- """
189
- custom_parser_or_group.add_argument(
190
- '--ignore-unknown-flags',
191
- type=bool,
192
- action=argparse.BooleanOptionalAction,
193
- default=False,
194
- help='Ignore all the unsupported flags in the bash script.',
195
- )
196
- custom_parser_or_group.add_argument(
197
- '-a',
198
- '--array',
199
- type=str,
200
- default=None,
201
- help=(
202
- 'Submit a job array, multiple jobs to be executed with identical'
203
- ' parameters. The indexes specification identifies what array index'
204
- ' values should be used. For example, "--array=0-15" or'
205
- ' "--array=0,6,16-32". Multiple values may be specified using a comma'
206
- ' separated list and/or a range of values with a "-" separator. For'
207
- ' example "--array=0-15%%4" will limit the number of simultaneously'
208
- ' running tasks from this job array to 4. The minimum index value is'
209
- ' 0. The maximum index value is 2147483647.'
210
- ),
211
- )
212
- custom_parser_or_group.add_argument(
213
- '-c',
214
- '--cpus-per-task',
215
- type=str,
216
- default=None,
217
- help='How much cpus a container inside a pod requires.',
218
- )
219
- custom_parser_or_group.add_argument(
220
- '--gpus-per-task',
221
- type=str,
222
- default=None,
223
- help='How much gpus a container inside a pod requires.',
224
- )
225
- custom_parser_or_group.add_argument(
226
- '--mem',
227
- type=str,
228
- default=None,
229
- help='How much memory a pod requires.',
230
- )
231
- custom_parser_or_group.add_argument(
232
- '--mem-per-task',
233
- type=str,
234
- default=None,
235
- help='How much memory a container requires.',
236
- )
237
- custom_parser_or_group.add_argument(
238
- '--mem-per-cpu',
239
- type=str,
240
- default=None,
241
- help=(
242
- 'How much memory a container requires, it multiplies the number '
243
- 'of requested cpus per task by mem-per-cpu.'
244
- ),
245
- )
246
- custom_parser_or_group.add_argument(
247
- '--mem-per-gpu',
248
- type=str,
249
- default=None,
250
- help=(
251
- 'How much memory a container requires, it multiplies the number '
252
- 'of requested gpus per task by mem-per-gpu.'
253
- ),
254
- )
255
- custom_parser_or_group.add_argument(
256
- '-N',
257
- '--nodes',
258
- type=int,
259
- default=None,
260
- help='Number of pods to be used at a time.',
261
- )
262
- custom_parser_or_group.add_argument(
263
- '-n',
264
- '--ntasks',
265
- type=int,
266
- default=None,
267
- help='Number of identical containers inside of a pod, usually 1.',
268
- )
269
- custom_parser_or_group.add_argument(
270
- '-o',
271
- '--output',
272
- type=str,
273
- default=None,
274
- help=(
275
- 'Where to redirect the standard output stream of a task. If not'
276
- ' passed it proceeds to stdout, and is available via kubectl logs.'
277
- ),
278
- )
279
- custom_parser_or_group.add_argument(
280
- '-e',
281
- '--error',
282
- type=str,
283
- default=None,
284
- help=(
285
- 'Where to redirect std error stream of a task. If not passed it'
286
- ' proceeds to stdout, and is available via kubectl logs.'
287
- ),
288
- )
289
- custom_parser_or_group.add_argument(
290
- '--input',
291
- type=str,
292
- default=None,
293
- help='What to pipe into the script.',
294
- )
295
- custom_parser_or_group.add_argument(
296
- '-J',
297
- '--job-name',
298
- type=str,
299
- default=None,
300
- help='What is the job name.',
301
- )
302
- custom_parser_or_group.add_argument(
303
- '-D',
304
- '--chdir',
305
- type=str,
306
- default=None,
307
- help='Change directory before executing the script.',
308
- )
309
- custom_parser_or_group.add_argument(
310
- '-t',
311
- '--time',
312
- type=str,
313
- default=None,
314
- help=(
315
- 'Set a limit on the total run time of the job. '
316
- 'A time limit of zero requests that no time limit be imposed. '
317
- 'Acceptable time formats include "minutes", "minutes:seconds", '
318
- '"hours:minutes:seconds", "days-hours", "days-hours:minutes" '
319
- 'and "days-hours:minutes:seconds".'
320
- ),
321
- )
322
- custom_parser_or_group.add_argument(
323
- '--priority',
324
- type=str,
325
- default='medium',
326
- choices=['very-low', 'low', 'medium', 'high', 'very-high'],
327
- help=(
328
- 'A priority, one of `very-low`, `low`, `medium`, `high` or'
329
- ' `very-high`. Defaults to `medium`.'
330
- ),
331
- )
332
-
333
-
334
183
  def add_tpu_type_argument(
335
184
  custom_parser_or_group: ParserOrArgumentGroup,
336
185
  required: bool = False,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 0.17.0
3
+ Version: 0.17.2
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -36,14 +36,14 @@ xpk/commands/managed_ml_diagnostics.py,sha256=87wmFbnYQY-kEpJfPo1Up53xM5P_P5wOlX
36
36
  xpk/commands/managed_ml_diagnostics_test.py,sha256=pQ1YUGMGRQFJYTS_1o9YyGUzYdLaBdA84LjbnncaeEo,3828
37
37
  xpk/commands/run.py,sha256=D0zgmnGeBLATphYhzQj29EScxrMmAKqPRhP6nfWuYcY,4085
38
38
  xpk/commands/shell.py,sha256=mRHMwm3Izzsue4bocekm82Rg_cPUaGMClSlvNzNXQ-o,4467
39
- xpk/commands/storage.py,sha256=kPViq6mrfGeAJwScdMs_kUJg-QxEO6SrEvyBbXhCzEI,11439
39
+ xpk/commands/storage.py,sha256=cSTJN9Mjvdsvk_Nk43kVdQFhp89nxWbanDsTOGZCkpQ,10708
40
40
  xpk/commands/version.py,sha256=k30rdLP9clUM8eeSwRFhpfzSb1qwcQImTfuC59Ed6CA,771
41
- xpk/commands/workload.py,sha256=346bmRKX5q8R_ZkJAxXbp6ofIZp8NyrwP4OtoWsEGXw,31479
41
+ xpk/commands/workload.py,sha256=l99NRFLs7pXuaLdn5d-Pid-cZulKpB3FNus-HdNDtZw,31513
42
42
  xpk/commands/workload_test.py,sha256=iXTY7VR1KrlPZZyh1Zm0N946kIP1iV2Fnqx1NtOYDJU,7274
43
43
  xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
44
44
  xpk/core/capacity.py,sha256=_TyWayBkNU8fBpz1LTbCddEFZiZW5Qz-xmJnQMsXh0c,10534
45
45
  xpk/core/capacity_test.py,sha256=jZjMHTYlFLdAmBN1t9k29iABCSE5hlW0--q7QLDQpfQ,4330
46
- xpk/core/cluster.py,sha256=zAv46s-UB2r-I6cSkH7YzwAkGgD8Vxn7dJsXb_CMEQI,24062
46
+ xpk/core/cluster.py,sha256=3nl77I_MgQpBZsZSzsiQ_7IyFRzfLrYNRUL1gsSNhKU,24036
47
47
  xpk/core/cluster_private.py,sha256=RLi0C7bV0NEUXl6QKQzvUT0weN9EdqPvjuuOQsNO0DY,6868
48
48
  xpk/core/cluster_test.py,sha256=J4Wk7E--ik_IsWWzL_iWGWbx99Ih03m-0bs-uU7gGDg,5853
49
49
  xpk/core/commands.py,sha256=at73VJHdZ4rVA8uvW997tNrvnCjP9v6zaw96bU0kd74,10841
@@ -59,7 +59,6 @@ xpk/core/gcloud_context_test.py,sha256=M8rp6S1zaEcAI7u4Bt8ukWKzv82HH5h9oYVojBcKg
59
59
  xpk/core/gcluster_manager.py,sha256=lyv_MvdnkByy9_PEBj_ugAEBwnCbFNiWTSrEFjrMlPc,6236
60
60
  xpk/core/gcsfuse.py,sha256=kg5pgxdTjgiqquuGjev9fXzJPb8oiWPTK6wzCddzheQ,2125
61
61
  xpk/core/jobset.py,sha256=PJ4Fd8TNNLuYKNOMehoMYRIUEXyc5jsbHctJGqfW_8Y,4037
62
- xpk/core/kjob.py,sha256=Ustta_ygXaacmgb1Av6QW4Epw0S_r-b-tjrMA6uNVj0,14240
63
62
  xpk/core/kueue_manager.py,sha256=JB8DcD-RFvBdC9Mk_DDCAkI2Km8W5-KMTRMVec06LlM,20010
64
63
  xpk/core/kueue_manager_test.py,sha256=FfBd1vninU_fcJ9wZev45-vpEsH12a9-XKysk_h4auo,22008
65
64
  xpk/core/monitoring.py,sha256=__bzTq_DIDAK8yIaN4F3MJh-yjYw5X1OlxmRgYOpf1g,4332
@@ -109,7 +108,7 @@ xpk/parser/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
109
108
  xpk/parser/batch.py,sha256=mJU-Cp1yTLje59vD-B1IiBcUeD-ZmEsoeB4xhj9cflc,1406
110
109
  xpk/parser/cluster.py,sha256=U2T-Q4yS86PWeFLNfknYWDDzZfubCKqIhqasxKLmErI,31342
111
110
  xpk/parser/cluster_test.py,sha256=xzQEC3IeAMpwsbNbHLuaNKxR3iaZcm3z4m3i61G62d4,6581
112
- xpk/parser/common.py,sha256=w6u6rqCOO23572C99PV1N8Fsp-vTP0C7Kv5tdWGEQO8,11691
111
+ xpk/parser/common.py,sha256=sJYGjrn2YgFxelDCYB18s1R8Md8GpDcMQNoAezxDDIs,7257
113
112
  xpk/parser/common_test.py,sha256=_6Fm2pUF7h4K0G5qxGabXSYr4ng9ihOzlViE6oLQwQs,1557
114
113
  xpk/parser/config.py,sha256=-XnWx9aFsBW4Uzo_hpOMD2ZQ0bdZLvq1ksv83_5jqSM,1633
115
114
  xpk/parser/core.py,sha256=VRJerlS92ufoQbG1mZv7B04DAP4qGkBHa4pRXgcbAs0,4761
@@ -139,7 +138,6 @@ xpk/templates/kueue_sub_slicing_topology.yaml.j2,sha256=UXjpRFqCIcoebwcMeD9Lo4fe
139
138
  xpk/templates/kueue_super_slicing_topology.yaml.j2,sha256=4WkSfQ2A5-jnKWiHWj2WXlv4sQmAcfxzbJCW-cWUE8E,264
140
139
  xpk/templates/mtc-cpc.yaml,sha256=MPx75tog09kjRAvHoNOPCEobigQ17d7pYCUnZCevSDQ,340
141
140
  xpk/templates/storage.yaml,sha256=AykdyMtDnKZF8Y_0BYxoYP03hEIzEk6iNalXAQHgAls,163
142
- xpk/templates/volume_bundle.yaml,sha256=sqeag7GPWqGNQ5doZtO9IVAX_vKYRO73-aBE7waEtSY,129
143
141
  xpk/utils/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
144
142
  xpk/utils/console.py,sha256=AJWSyjuWyLjb7SYt8kPb0gw9N84EN9LbLxYCXjC-6Ds,2464
145
143
  xpk/utils/console_test.py,sha256=x1v7v9VrIZwAKH-eOzj1lAY4EsHxJ6ruhfEOzpssO6o,2944
@@ -162,9 +160,9 @@ xpk/utils/validation.py,sha256=irL9579RbvwxiGn1t3zhhPo-0oHgdUPOSYsUuFqsDSM,3039
162
160
  xpk/utils/validation_test.py,sha256=PEDSMUqZdt_Lx1FSR-LOTXKKtsJ47JH1fxugM0Gfz6Y,1168
163
161
  xpk/utils/versions.py,sha256=_Ep68W70a9605XjiaOOpBa9Is9jXlsoOiwL8v5Xt-WA,897
164
162
  xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
165
- xpk-0.17.0.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
166
- xpk-0.17.0.dist-info/METADATA,sha256=a7flD2BsWV8tLk6cxCMvnYvLeJMMChu4TpyGfP1QbH8,7930
167
- xpk-0.17.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
168
- xpk-0.17.0.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
169
- xpk-0.17.0.dist-info/top_level.txt,sha256=TQKZWgV7LSElvmunYT9V_627qOMoxq3qYzWAFzKudB8,16
170
- xpk-0.17.0.dist-info/RECORD,,
163
+ xpk-0.17.2.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
164
+ xpk-0.17.2.dist-info/METADATA,sha256=_G5EPL08DVbtGWPXVmHAg_HxH_-op5be3Fx1rWRJiwI,7930
165
+ xpk-0.17.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
+ xpk-0.17.2.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
167
+ xpk-0.17.2.dist-info/top_level.txt,sha256=TQKZWgV7LSElvmunYT9V_627qOMoxq3qYzWAFzKudB8,16
168
+ xpk-0.17.2.dist-info/RECORD,,
xpk/core/kjob.py DELETED
@@ -1,473 +0,0 @@
1
- """
2
- Copyright 2024 Google LLC
3
-
4
- Licensed under the Apache License, Version 2.0 (the "License");
5
- you may not use this file except in compliance with the License.
6
- You may obtain a copy of the License at
7
-
8
- https://www.apache.org/licenses/LICENSE-2.0
9
-
10
- Unless required by applicable law or agreed to in writing, software
11
- distributed under the License is distributed on an "AS IS" BASIS,
12
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- See the License for the specific language governing permissions and
14
- limitations under the License.
15
- """
16
-
17
- from argparse import Namespace
18
- from enum import Enum
19
-
20
- import yaml
21
- from kubernetes import client as k8s_client
22
- from kubernetes.client import ApiClient
23
- from kubernetes.client.rest import ApiException
24
-
25
- from ..utils import templates
26
- from ..utils.execution_context import is_dry_run
27
- from ..utils.console import xpk_exit, xpk_print
28
- from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
29
- from .commands import (
30
- run_command_for_value,
31
- run_command_with_updates,
32
- run_kubectl_apply,
33
- )
34
- from .config import (
35
- KJOB_BATCH_IMAGE,
36
- KJOB_BATCH_WORKING_DIRECTORY,
37
- KJOB_SHELL_IMAGE,
38
- KJOB_SHELL_INTERACTIVE_COMMAND,
39
- KJOB_SHELL_WORKING_DIRECTORY,
40
- get_config,
41
- )
42
- from .network import get_cluster_subnetworks
43
- from .system_characteristics import AcceleratorType, SystemCharacteristics
44
- from .resources import get_cluster_system_characteristics
45
- from .storage import (
46
- GCS_FUSE_ANNOTATIONS,
47
- PARALLELSTORE_ANNOTATIONS,
48
- get_auto_mount_gcsfuse_storages,
49
- get_auto_mount_parallelstore_storages,
50
- get_auto_mount_storages,
51
- )
52
- from .workload_decorators import (
53
- rdma_decorator,
54
- tcpxo_decorator,
55
- )
56
- from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
57
-
58
- KJOB_API_GROUP_NAME = "kjobctl.x-k8s.io"
59
- KJOB_API_GROUP_VERSION = "v1alpha1"
60
- KJOB_API_VOLUME_BUNDLE_PLURAL = "volumebundles"
61
- VOLUME_BUNDLE_TEMPLATE_PATH = "/../templates/volume_bundle.yaml"
62
-
63
-
64
- class AppProfileDefaults(Enum):
65
- NAME = "xpk-def-app-profile"
66
-
67
-
68
- class JobTemplateDefaults(Enum):
69
- NAME = "xpk-def-batch"
70
- PARALLELISM = 1
71
- COMPLETIONS = 1
72
- CONTAINER_NAME = "xpk-batch-container"
73
- IMAGE = "ubuntu:22.04"
74
- WORKING_DIRECTORY = "/"
75
-
76
-
77
- class PodTemplateDefaults(Enum):
78
- NAME = "xpk-def-pod"
79
- CONTAINER_NAME = "xpk-interactive-container"
80
- IMAGE = "busybox:1.28"
81
- WORKING_DIRECTORY = "/"
82
- INTERACTIVE_COMMAND = "/bin/sh"
83
-
84
-
85
- job_template_yaml = """
86
- apiVersion: kjobctl.x-k8s.io/v1alpha1
87
- kind: JobTemplate
88
- metadata:
89
- name: {name}
90
- namespace: default
91
- template:
92
- spec:
93
- parallelism: {parallelism}
94
- completions: {completions}
95
- completionMode: Indexed
96
- template:
97
- spec:
98
- dnsPolicy: ClusterFirstWithHostNet
99
- tolerations:
100
- - operator: "Exists"
101
- key: nvidia.com/gpu
102
- containers:
103
- - name: {container_name}
104
- image: {image}
105
- workingDir: {working_directory}
106
- {resources}
107
- {node_selector}
108
- priorityClassName: {priority}
109
- restartPolicy: OnFailure
110
- serviceAccountName: {service_account}
111
- """
112
- job_node_selector_template = """
113
- nodeSelector:
114
- cloud.google.com/gke-accelerator: {gpu_name}
115
- """
116
- job_resources_template = """
117
- resources:
118
- limits:
119
- nvidia.com/gpu: {gpu_per_node}
120
- """
121
-
122
- app_profile_yaml = """
123
- apiVersion: kjobctl.x-k8s.io/v1alpha1
124
- kind: ApplicationProfile
125
- metadata:
126
- name: {name}
127
- namespace: default
128
- spec:
129
- supportedModes:
130
- - name: Slurm
131
- template: {batch_template}
132
- requiredFlags: []
133
- - name: Interactive
134
- template: {interactive_template}
135
- volumeBundles: {volume_bundles}
136
- """
137
-
138
- pod_template_yaml = """
139
- apiVersion: v1
140
- kind: PodTemplate
141
- metadata:
142
- name: {name}
143
- namespace: default
144
- template:
145
- spec:
146
- tolerations:
147
- - effect: NoSchedule
148
- key: components.gke.io/gke-managed-components
149
- operator: Equal
150
- value: "true"
151
- containers:
152
- - name: {container_name}
153
- image: {image}
154
- command: [{interactive_command}]
155
- workingDir: {working_directory}
156
- initContainers:
157
- - name: init
158
- image: {image}
159
- command: ['/bin/mkdir', '-p', '{working_directory}']
160
- serviceAccountName: {service_account}
161
- """
162
-
163
- Kueue_TAS_annotation = "kueue.x-k8s.io/podset-preferred-topology=cloud.google.com/gce-topology-host"
164
-
165
- default_interface_annotation = "networking.gke.io/default-interface=eth0"
166
-
167
-
168
- def get_a4_pod_template_annotations() -> tuple[str, str]:
169
- sub_networks = get_cluster_subnetworks()
170
- interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
171
- sub_networks
172
- )
173
-
174
- return (
175
- default_interface_annotation,
176
- f"{interfaces_key}=$'{interfaces_value}'",
177
- )
178
-
179
-
180
- def get_a3ultra_pod_template_annotations() -> tuple[str, str]:
181
- sub_networks = get_cluster_subnetworks()
182
- interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
183
- sub_networks
184
- )
185
-
186
- return (
187
- default_interface_annotation,
188
- f"{interfaces_key}=$'{interfaces_value}'",
189
- )
190
-
191
-
192
- def get_a3mega_pod_template_annotations() -> tuple[str, str, str]:
193
- """Adds or updates annotations in the Pod template."""
194
- sub_networks = get_cluster_subnetworks()
195
- tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
196
- interfaces_key, interfaces_value = tcpxo_decorator.get_interfaces_entry(
197
- sub_networks
198
- )
199
- tcpxo = f"{tcpxo_deamon_key}=$'{tcpxo_deamon_paths}'"
200
- interfaces = f"{interfaces_key}=$'{interfaces_value}'"
201
- return tcpxo, interfaces, default_interface_annotation
202
-
203
-
204
- def verify_kjob_installed() -> int:
205
- """Check if kjob is installed. If not provide user with proper communicate and exit.
206
- Returns:
207
- error code > if kjob not installed, otherwise 0
208
- """
209
- command = "kubectl-kjob help"
210
- task = "Verify kjob installation "
211
- verify_kjob_installed_code, _ = run_command_for_value(command, task)
212
-
213
- if verify_kjob_installed_code == 0:
214
- xpk_print("kjob found")
215
- return 0
216
-
217
- if verify_kjob_installed_code != 0:
218
- xpk_print(
219
- " kjob not found. Please follow"
220
- " https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md"
221
- " to install kjob."
222
- )
223
- return verify_kjob_installed_code
224
- return 0
225
-
226
-
227
- def get_pod_template_interactive_command() -> str:
228
- """Gets the interactive command for PodTemplate from config otherwise the default value.
229
-
230
- Args:
231
- args - user provided arguments
232
- Returns:
233
- str - PodTemplate's interactive command
234
- """
235
- pod_command = get_config().get(KJOB_SHELL_INTERACTIVE_COMMAND)
236
- if pod_command is None or len(pod_command) == 0:
237
- pod_command = PodTemplateDefaults.INTERACTIVE_COMMAND.value
238
-
239
- return pod_command
240
-
241
-
242
- def create_app_profile_instance(volume_bundles: list[str]) -> int:
243
- """Create new AppProfile instance on cluster with default settings.
244
-
245
- Args:
246
- args - user provided arguments
247
- Returns:
248
- exit_code > 0 if creating AppProfile fails, 0 otherwise
249
- """
250
- return run_kubectl_apply(
251
- yml_string=app_profile_yaml.format(
252
- name=AppProfileDefaults.NAME.value,
253
- batch_template=JobTemplateDefaults.NAME.value,
254
- interactive_template=PodTemplateDefaults.NAME.value,
255
- volume_bundles=volume_bundles,
256
- ),
257
- task="Creating AppProfile",
258
- )
259
-
260
-
261
- def decorate_job_template_with_gpu(
262
- yml_string: str, system: SystemCharacteristics
263
- ) -> str:
264
- job_spec = yaml.safe_load(yml_string)["template"]
265
- kjob_decorator = (
266
- system.gpu_config.kjob_decorator_fn
267
- if system.gpu_config and system.gpu_config.kjob_decorator_fn
268
- else None
269
- )
270
- if kjob_decorator:
271
- job_spec = kjob_decorator(job_spec)
272
- job_template_dict = yaml.safe_load(yml_string)
273
- job_template_dict["template"] = job_spec
274
- yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
275
- return yaml_result
276
-
277
-
278
- def create_job_template_instance(
279
- args: Namespace,
280
- system: SystemCharacteristics | None,
281
- service_account: str,
282
- ) -> int:
283
- """Create new JobTemplate instance on cluster with default settings.
284
-
285
- Args:
286
- args - user provided arguments
287
- Returns:
288
- exit_code > 0 if creating JobTemplate fails, 0 otherwise
289
- """
290
- job_image = get_config().get(KJOB_BATCH_IMAGE)
291
- if job_image is None or len(job_image) == 0:
292
- job_image = JobTemplateDefaults.IMAGE.value
293
- working_directory = get_config().get(KJOB_BATCH_WORKING_DIRECTORY)
294
- if working_directory is None or len(working_directory) == 0:
295
- working_directory = JobTemplateDefaults.WORKING_DIRECTORY.value
296
- resources = (
297
- job_resources_template.format(gpu_per_node=system.chips_per_vm)
298
- if system is not None and system.accelerator_type == AcceleratorType.GPU
299
- else ""
300
- )
301
-
302
- node_selector = (
303
- job_node_selector_template.format(gpu_name=system.gke_accelerator)
304
- if system is not None and system.accelerator_type == AcceleratorType.GPU
305
- else ""
306
- )
307
- yml_string = job_template_yaml.format(
308
- name=JobTemplateDefaults.NAME.value,
309
- parallelism=JobTemplateDefaults.PARALLELISM.value,
310
- completions=JobTemplateDefaults.COMPLETIONS.value,
311
- container_name=JobTemplateDefaults.CONTAINER_NAME.value,
312
- image=job_image,
313
- working_directory=working_directory,
314
- resources=resources,
315
- node_selector=node_selector,
316
- priority=args.priority if hasattr(args, "priority") else "medium",
317
- service_account=service_account,
318
- )
319
- if system is not None and system.accelerator_type == AcceleratorType.GPU:
320
- yml_string = decorate_job_template_with_gpu(yml_string, system)
321
-
322
- return run_kubectl_apply(
323
- yml_string,
324
- task="Creating JobTemplate",
325
- )
326
-
327
-
328
- def create_pod_template_instance(service_account: str) -> int:
329
- """Create new PodTemplate instance on cluster with default settings.
330
-
331
- Returns:
332
- exit_code > 0 if creating PodTemplate fails, 0 otherwise
333
- """
334
- pod_image = get_config().get(KJOB_SHELL_IMAGE)
335
- if pod_image is None or len(pod_image) == 0:
336
- pod_image = PodTemplateDefaults.IMAGE.value
337
- working_directory = get_config().get(KJOB_SHELL_WORKING_DIRECTORY)
338
- if working_directory is None or len(working_directory) == 0:
339
- working_directory = PodTemplateDefaults.WORKING_DIRECTORY.value
340
-
341
- return run_kubectl_apply(
342
- yml_string=pod_template_yaml.format(
343
- name=PodTemplateDefaults.NAME.value,
344
- container_name=PodTemplateDefaults.CONTAINER_NAME.value,
345
- image=pod_image,
346
- working_directory=working_directory,
347
- interactive_command=get_pod_template_interactive_command(),
348
- service_account=service_account,
349
- ),
350
- task="Creating PodTemplate",
351
- )
352
-
353
-
354
- def prepare_kjob(args: Namespace) -> int:
355
- system = get_cluster_system_characteristics(args)
356
-
357
- storages = []
358
- if not is_dry_run():
359
- k8s_api_client = setup_k8s_env(args)
360
- storages = get_auto_mount_storages(k8s_api_client)
361
-
362
- service_account = ""
363
- if len(storages) > 0:
364
- service_account = XPK_SA
365
-
366
- job_err_code = create_job_template_instance(args, system, service_account)
367
- if job_err_code > 0:
368
- return job_err_code
369
- pod_err_code = create_pod_template_instance(service_account)
370
- if pod_err_code > 0:
371
- return pod_err_code
372
-
373
- volume_bundles = [item.name for item in storages]
374
-
375
- return create_app_profile_instance(volume_bundles)
376
-
377
-
378
- def apply_kjob_crds() -> int:
379
- """Apply kjob CRDs on cluster.
380
-
381
- This function install kjob CRDs files from kjobctl printcrds.
382
- It creates all neccessary kjob CRDs.
383
-
384
- Returns:
385
- None
386
- """
387
- command = "kubectl kjob printcrds | kubectl apply --server-side -f -"
388
- task = "Create kjob CRDs on cluster"
389
- return_code = run_command_with_updates(command, task)
390
- if return_code != 0:
391
- xpk_print(f"{task} returned ERROR {return_code}")
392
- return return_code
393
- xpk_print("Creating kjob CRDs succeeded")
394
- return 0
395
-
396
-
397
- def create_volume_bundle_instance(
398
- k8s_api_client: ApiClient,
399
- name: str,
400
- manifest: list[dict],
401
- readonly: bool,
402
- mount_point: str,
403
- ) -> None:
404
- """
405
- Creates a new VolumeBundle resource in the Kubernetes cluster.
406
-
407
- This function reads a VolumeBundle template from a YAML file, populates it with
408
- values from the provided arguments, and then creates the VolumeBundle object
409
- in the cluster.
410
-
411
- Args:
412
- k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
413
- args: An argparse Namespace object containing the arguments for creating
414
- the Storage resource.
415
- """
416
- data = templates.load(VOLUME_BUNDLE_TEMPLATE_PATH)
417
- data["metadata"]["name"] = name
418
- spec = data["spec"]
419
- spec["volumes"] = []
420
- spec["containerVolumeMounts"] = []
421
-
422
- for obj in manifest:
423
- if obj["kind"] == "PersistentVolumeClaim":
424
- spec["volumes"].append({
425
- "name": obj["metadata"]["name"],
426
- "persistentVolumeClaim": {
427
- "claimName": obj["metadata"]["name"],
428
- "readOnly": readonly,
429
- },
430
- })
431
- spec["containerVolumeMounts"].append({
432
- "name": obj["metadata"]["name"],
433
- "mountPath": mount_point,
434
- })
435
-
436
- data["spec"] = spec
437
-
438
- api_instance = k8s_client.CustomObjectsApi(k8s_api_client)
439
- try:
440
- api_instance.create_namespaced_custom_object(
441
- namespace=DEFAULT_NAMESPACE,
442
- group=KJOB_API_GROUP_NAME,
443
- version=KJOB_API_GROUP_VERSION,
444
- plural=KJOB_API_VOLUME_BUNDLE_PLURAL,
445
- body=data,
446
- )
447
- xpk_print(
448
- f"Created {KJOB_API_VOLUME_BUNDLE_PLURAL}.{KJOB_API_GROUP_NAME} object:"
449
- f" {data['metadata']['name']}"
450
- )
451
- except ApiException as e:
452
- if e.status == 409:
453
- xpk_print(f"VolumeBundle: {name} already exists. Skipping its creation")
454
- else:
455
- xpk_print(f"Encountered error during VolumeBundle creation: {e}")
456
- xpk_exit(1)
457
-
458
-
459
- def get_storage_annotations(args: Namespace) -> list[str]:
460
- annotations = []
461
- k8s_api_client = setup_k8s_env(args)
462
-
463
- gcsfuse_storages = get_auto_mount_gcsfuse_storages(k8s_api_client)
464
- if len(gcsfuse_storages) > 0:
465
- for key, value in GCS_FUSE_ANNOTATIONS.items():
466
- annotations.append(f"{key}={value}")
467
-
468
- parallelstore_storages = get_auto_mount_parallelstore_storages(k8s_api_client)
469
- if len(parallelstore_storages) > 0:
470
- for key, value in PARALLELSTORE_ANNOTATIONS.items():
471
- annotations.append(f"{key}={value}")
472
-
473
- return annotations
@@ -1,7 +0,0 @@
1
- apiVersion: kjobctl.x-k8s.io/v1alpha1
2
- kind: VolumeBundle
3
- metadata:
4
- name: $NAME
5
- spec:
6
- volumes: []
7
- containerVolumeMounts: []
File without changes