xpk 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/storage.py +0 -25
- xpk/commands/workload.py +1 -0
- xpk/core/cluster.py +1 -3
- xpk/parser/common.py +0 -151
- {xpk-0.17.0.dist-info → xpk-0.17.2.dist-info}/METADATA +1 -1
- {xpk-0.17.0.dist-info → xpk-0.17.2.dist-info}/RECORD +10 -12
- xpk/core/kjob.py +0 -473
- xpk/templates/volume_bundle.yaml +0 -7
- {xpk-0.17.0.dist-info → xpk-0.17.2.dist-info}/WHEEL +0 -0
- {xpk-0.17.0.dist-info → xpk-0.17.2.dist-info}/entry_points.txt +0 -0
- {xpk-0.17.0.dist-info → xpk-0.17.2.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.17.0.dist-info → xpk-0.17.2.dist-info}/top_level.txt +0 -0
xpk/commands/storage.py
CHANGED
|
@@ -23,7 +23,6 @@ from kubernetes.client.rest import ApiException
|
|
|
23
23
|
|
|
24
24
|
from ..core import gcsfuse
|
|
25
25
|
from ..core.cluster import (
|
|
26
|
-
DEFAULT_NAMESPACE,
|
|
27
26
|
add_zone_and_project,
|
|
28
27
|
get_cluster_network,
|
|
29
28
|
setup_k8s_env,
|
|
@@ -35,12 +34,6 @@ from ..core.cluster import (
|
|
|
35
34
|
update_cluster_with_workload_identity_if_necessary,
|
|
36
35
|
)
|
|
37
36
|
from ..core.filestore import FilestoreClient, get_storage_class_name
|
|
38
|
-
from ..core.kjob import (
|
|
39
|
-
KJOB_API_GROUP_NAME,
|
|
40
|
-
KJOB_API_GROUP_VERSION,
|
|
41
|
-
KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
42
|
-
create_volume_bundle_instance,
|
|
43
|
-
)
|
|
44
37
|
from ..core.storage import (
|
|
45
38
|
GCP_FILESTORE_TYPE,
|
|
46
39
|
GCS_FUSE_TYPE,
|
|
@@ -98,9 +91,6 @@ def storage_create(args: Namespace) -> None:
|
|
|
98
91
|
|
|
99
92
|
k8s_api_client = setup_k8s_env(args)
|
|
100
93
|
create_storage_crds(k8s_api_client, args, manifest)
|
|
101
|
-
create_volume_bundle_instance(
|
|
102
|
-
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
103
|
-
)
|
|
104
94
|
# Not required for Filestore. Will be uncommented when adding GCSFuse create
|
|
105
95
|
# return_code = update_cluster_with_workload_identity_if_necessary(args)
|
|
106
96
|
# if return_code > 0:
|
|
@@ -214,9 +204,6 @@ def storage_attach(args: Namespace) -> None:
|
|
|
214
204
|
|
|
215
205
|
k8s_api_client = setup_k8s_env(args)
|
|
216
206
|
create_storage_crds(k8s_api_client, args, manifest)
|
|
217
|
-
create_volume_bundle_instance(
|
|
218
|
-
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
219
|
-
)
|
|
220
207
|
|
|
221
208
|
enable_csi_drivers_if_necessary(args)
|
|
222
209
|
|
|
@@ -332,18 +319,6 @@ def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
|
|
|
332
319
|
"Storage Class",
|
|
333
320
|
)
|
|
334
321
|
|
|
335
|
-
delete_resource(
|
|
336
|
-
lambda name: api_instance.delete_namespaced_custom_object(
|
|
337
|
-
namespace=DEFAULT_NAMESPACE,
|
|
338
|
-
name=name,
|
|
339
|
-
group=KJOB_API_GROUP_NAME,
|
|
340
|
-
version=KJOB_API_GROUP_VERSION,
|
|
341
|
-
plural=KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
342
|
-
),
|
|
343
|
-
storage.name,
|
|
344
|
-
"VolumeBundle",
|
|
345
|
-
)
|
|
346
|
-
|
|
347
322
|
delete_resource(
|
|
348
323
|
lambda name: api_instance.delete_cluster_custom_object(
|
|
349
324
|
name=name,
|
xpk/commands/workload.py
CHANGED
xpk/core/cluster.py
CHANGED
|
@@ -717,10 +717,8 @@ def get_cluster_credentials(args) -> int:
|
|
|
717
717
|
location=location,
|
|
718
718
|
dns_endpoint=True,
|
|
719
719
|
)
|
|
720
|
-
if return_code != 0:
|
|
721
|
-
return return_code
|
|
722
720
|
|
|
723
|
-
if not _are_credentials_valid():
|
|
721
|
+
if return_code != 0 or not _are_credentials_valid():
|
|
724
722
|
xpk_print('Detected error. Retrying without --dns-endpoint flag...')
|
|
725
723
|
return_code = _get_credentials(
|
|
726
724
|
project=args.project,
|
xpk/parser/common.py
CHANGED
|
@@ -180,157 +180,6 @@ def add_global_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
|
180
180
|
)
|
|
181
181
|
|
|
182
182
|
|
|
183
|
-
def add_slurm_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
184
|
-
"""Add Slurm job arguments to the parser.
|
|
185
|
-
|
|
186
|
-
Args:
|
|
187
|
-
custom_parser_or_group: parser or argument group to add global arguments to.
|
|
188
|
-
"""
|
|
189
|
-
custom_parser_or_group.add_argument(
|
|
190
|
-
'--ignore-unknown-flags',
|
|
191
|
-
type=bool,
|
|
192
|
-
action=argparse.BooleanOptionalAction,
|
|
193
|
-
default=False,
|
|
194
|
-
help='Ignore all the unsupported flags in the bash script.',
|
|
195
|
-
)
|
|
196
|
-
custom_parser_or_group.add_argument(
|
|
197
|
-
'-a',
|
|
198
|
-
'--array',
|
|
199
|
-
type=str,
|
|
200
|
-
default=None,
|
|
201
|
-
help=(
|
|
202
|
-
'Submit a job array, multiple jobs to be executed with identical'
|
|
203
|
-
' parameters. The indexes specification identifies what array index'
|
|
204
|
-
' values should be used. For example, "--array=0-15" or'
|
|
205
|
-
' "--array=0,6,16-32". Multiple values may be specified using a comma'
|
|
206
|
-
' separated list and/or a range of values with a "-" separator. For'
|
|
207
|
-
' example "--array=0-15%%4" will limit the number of simultaneously'
|
|
208
|
-
' running tasks from this job array to 4. The minimum index value is'
|
|
209
|
-
' 0. The maximum index value is 2147483647.'
|
|
210
|
-
),
|
|
211
|
-
)
|
|
212
|
-
custom_parser_or_group.add_argument(
|
|
213
|
-
'-c',
|
|
214
|
-
'--cpus-per-task',
|
|
215
|
-
type=str,
|
|
216
|
-
default=None,
|
|
217
|
-
help='How much cpus a container inside a pod requires.',
|
|
218
|
-
)
|
|
219
|
-
custom_parser_or_group.add_argument(
|
|
220
|
-
'--gpus-per-task',
|
|
221
|
-
type=str,
|
|
222
|
-
default=None,
|
|
223
|
-
help='How much gpus a container inside a pod requires.',
|
|
224
|
-
)
|
|
225
|
-
custom_parser_or_group.add_argument(
|
|
226
|
-
'--mem',
|
|
227
|
-
type=str,
|
|
228
|
-
default=None,
|
|
229
|
-
help='How much memory a pod requires.',
|
|
230
|
-
)
|
|
231
|
-
custom_parser_or_group.add_argument(
|
|
232
|
-
'--mem-per-task',
|
|
233
|
-
type=str,
|
|
234
|
-
default=None,
|
|
235
|
-
help='How much memory a container requires.',
|
|
236
|
-
)
|
|
237
|
-
custom_parser_or_group.add_argument(
|
|
238
|
-
'--mem-per-cpu',
|
|
239
|
-
type=str,
|
|
240
|
-
default=None,
|
|
241
|
-
help=(
|
|
242
|
-
'How much memory a container requires, it multiplies the number '
|
|
243
|
-
'of requested cpus per task by mem-per-cpu.'
|
|
244
|
-
),
|
|
245
|
-
)
|
|
246
|
-
custom_parser_or_group.add_argument(
|
|
247
|
-
'--mem-per-gpu',
|
|
248
|
-
type=str,
|
|
249
|
-
default=None,
|
|
250
|
-
help=(
|
|
251
|
-
'How much memory a container requires, it multiplies the number '
|
|
252
|
-
'of requested gpus per task by mem-per-gpu.'
|
|
253
|
-
),
|
|
254
|
-
)
|
|
255
|
-
custom_parser_or_group.add_argument(
|
|
256
|
-
'-N',
|
|
257
|
-
'--nodes',
|
|
258
|
-
type=int,
|
|
259
|
-
default=None,
|
|
260
|
-
help='Number of pods to be used at a time.',
|
|
261
|
-
)
|
|
262
|
-
custom_parser_or_group.add_argument(
|
|
263
|
-
'-n',
|
|
264
|
-
'--ntasks',
|
|
265
|
-
type=int,
|
|
266
|
-
default=None,
|
|
267
|
-
help='Number of identical containers inside of a pod, usually 1.',
|
|
268
|
-
)
|
|
269
|
-
custom_parser_or_group.add_argument(
|
|
270
|
-
'-o',
|
|
271
|
-
'--output',
|
|
272
|
-
type=str,
|
|
273
|
-
default=None,
|
|
274
|
-
help=(
|
|
275
|
-
'Where to redirect the standard output stream of a task. If not'
|
|
276
|
-
' passed it proceeds to stdout, and is available via kubectl logs.'
|
|
277
|
-
),
|
|
278
|
-
)
|
|
279
|
-
custom_parser_or_group.add_argument(
|
|
280
|
-
'-e',
|
|
281
|
-
'--error',
|
|
282
|
-
type=str,
|
|
283
|
-
default=None,
|
|
284
|
-
help=(
|
|
285
|
-
'Where to redirect std error stream of a task. If not passed it'
|
|
286
|
-
' proceeds to stdout, and is available via kubectl logs.'
|
|
287
|
-
),
|
|
288
|
-
)
|
|
289
|
-
custom_parser_or_group.add_argument(
|
|
290
|
-
'--input',
|
|
291
|
-
type=str,
|
|
292
|
-
default=None,
|
|
293
|
-
help='What to pipe into the script.',
|
|
294
|
-
)
|
|
295
|
-
custom_parser_or_group.add_argument(
|
|
296
|
-
'-J',
|
|
297
|
-
'--job-name',
|
|
298
|
-
type=str,
|
|
299
|
-
default=None,
|
|
300
|
-
help='What is the job name.',
|
|
301
|
-
)
|
|
302
|
-
custom_parser_or_group.add_argument(
|
|
303
|
-
'-D',
|
|
304
|
-
'--chdir',
|
|
305
|
-
type=str,
|
|
306
|
-
default=None,
|
|
307
|
-
help='Change directory before executing the script.',
|
|
308
|
-
)
|
|
309
|
-
custom_parser_or_group.add_argument(
|
|
310
|
-
'-t',
|
|
311
|
-
'--time',
|
|
312
|
-
type=str,
|
|
313
|
-
default=None,
|
|
314
|
-
help=(
|
|
315
|
-
'Set a limit on the total run time of the job. '
|
|
316
|
-
'A time limit of zero requests that no time limit be imposed. '
|
|
317
|
-
'Acceptable time formats include "minutes", "minutes:seconds", '
|
|
318
|
-
'"hours:minutes:seconds", "days-hours", "days-hours:minutes" '
|
|
319
|
-
'and "days-hours:minutes:seconds".'
|
|
320
|
-
),
|
|
321
|
-
)
|
|
322
|
-
custom_parser_or_group.add_argument(
|
|
323
|
-
'--priority',
|
|
324
|
-
type=str,
|
|
325
|
-
default='medium',
|
|
326
|
-
choices=['very-low', 'low', 'medium', 'high', 'very-high'],
|
|
327
|
-
help=(
|
|
328
|
-
'A priority, one of `very-low`, `low`, `medium`, `high` or'
|
|
329
|
-
' `very-high`. Defaults to `medium`.'
|
|
330
|
-
),
|
|
331
|
-
)
|
|
332
|
-
|
|
333
|
-
|
|
334
183
|
def add_tpu_type_argument(
|
|
335
184
|
custom_parser_or_group: ParserOrArgumentGroup,
|
|
336
185
|
required: bool = False,
|
|
@@ -36,14 +36,14 @@ xpk/commands/managed_ml_diagnostics.py,sha256=87wmFbnYQY-kEpJfPo1Up53xM5P_P5wOlX
|
|
|
36
36
|
xpk/commands/managed_ml_diagnostics_test.py,sha256=pQ1YUGMGRQFJYTS_1o9YyGUzYdLaBdA84LjbnncaeEo,3828
|
|
37
37
|
xpk/commands/run.py,sha256=D0zgmnGeBLATphYhzQj29EScxrMmAKqPRhP6nfWuYcY,4085
|
|
38
38
|
xpk/commands/shell.py,sha256=mRHMwm3Izzsue4bocekm82Rg_cPUaGMClSlvNzNXQ-o,4467
|
|
39
|
-
xpk/commands/storage.py,sha256=
|
|
39
|
+
xpk/commands/storage.py,sha256=cSTJN9Mjvdsvk_Nk43kVdQFhp89nxWbanDsTOGZCkpQ,10708
|
|
40
40
|
xpk/commands/version.py,sha256=k30rdLP9clUM8eeSwRFhpfzSb1qwcQImTfuC59Ed6CA,771
|
|
41
|
-
xpk/commands/workload.py,sha256=
|
|
41
|
+
xpk/commands/workload.py,sha256=l99NRFLs7pXuaLdn5d-Pid-cZulKpB3FNus-HdNDtZw,31513
|
|
42
42
|
xpk/commands/workload_test.py,sha256=iXTY7VR1KrlPZZyh1Zm0N946kIP1iV2Fnqx1NtOYDJU,7274
|
|
43
43
|
xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
44
44
|
xpk/core/capacity.py,sha256=_TyWayBkNU8fBpz1LTbCddEFZiZW5Qz-xmJnQMsXh0c,10534
|
|
45
45
|
xpk/core/capacity_test.py,sha256=jZjMHTYlFLdAmBN1t9k29iABCSE5hlW0--q7QLDQpfQ,4330
|
|
46
|
-
xpk/core/cluster.py,sha256=
|
|
46
|
+
xpk/core/cluster.py,sha256=3nl77I_MgQpBZsZSzsiQ_7IyFRzfLrYNRUL1gsSNhKU,24036
|
|
47
47
|
xpk/core/cluster_private.py,sha256=RLi0C7bV0NEUXl6QKQzvUT0weN9EdqPvjuuOQsNO0DY,6868
|
|
48
48
|
xpk/core/cluster_test.py,sha256=J4Wk7E--ik_IsWWzL_iWGWbx99Ih03m-0bs-uU7gGDg,5853
|
|
49
49
|
xpk/core/commands.py,sha256=at73VJHdZ4rVA8uvW997tNrvnCjP9v6zaw96bU0kd74,10841
|
|
@@ -59,7 +59,6 @@ xpk/core/gcloud_context_test.py,sha256=M8rp6S1zaEcAI7u4Bt8ukWKzv82HH5h9oYVojBcKg
|
|
|
59
59
|
xpk/core/gcluster_manager.py,sha256=lyv_MvdnkByy9_PEBj_ugAEBwnCbFNiWTSrEFjrMlPc,6236
|
|
60
60
|
xpk/core/gcsfuse.py,sha256=kg5pgxdTjgiqquuGjev9fXzJPb8oiWPTK6wzCddzheQ,2125
|
|
61
61
|
xpk/core/jobset.py,sha256=PJ4Fd8TNNLuYKNOMehoMYRIUEXyc5jsbHctJGqfW_8Y,4037
|
|
62
|
-
xpk/core/kjob.py,sha256=Ustta_ygXaacmgb1Av6QW4Epw0S_r-b-tjrMA6uNVj0,14240
|
|
63
62
|
xpk/core/kueue_manager.py,sha256=JB8DcD-RFvBdC9Mk_DDCAkI2Km8W5-KMTRMVec06LlM,20010
|
|
64
63
|
xpk/core/kueue_manager_test.py,sha256=FfBd1vninU_fcJ9wZev45-vpEsH12a9-XKysk_h4auo,22008
|
|
65
64
|
xpk/core/monitoring.py,sha256=__bzTq_DIDAK8yIaN4F3MJh-yjYw5X1OlxmRgYOpf1g,4332
|
|
@@ -109,7 +108,7 @@ xpk/parser/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
|
109
108
|
xpk/parser/batch.py,sha256=mJU-Cp1yTLje59vD-B1IiBcUeD-ZmEsoeB4xhj9cflc,1406
|
|
110
109
|
xpk/parser/cluster.py,sha256=U2T-Q4yS86PWeFLNfknYWDDzZfubCKqIhqasxKLmErI,31342
|
|
111
110
|
xpk/parser/cluster_test.py,sha256=xzQEC3IeAMpwsbNbHLuaNKxR3iaZcm3z4m3i61G62d4,6581
|
|
112
|
-
xpk/parser/common.py,sha256=
|
|
111
|
+
xpk/parser/common.py,sha256=sJYGjrn2YgFxelDCYB18s1R8Md8GpDcMQNoAezxDDIs,7257
|
|
113
112
|
xpk/parser/common_test.py,sha256=_6Fm2pUF7h4K0G5qxGabXSYr4ng9ihOzlViE6oLQwQs,1557
|
|
114
113
|
xpk/parser/config.py,sha256=-XnWx9aFsBW4Uzo_hpOMD2ZQ0bdZLvq1ksv83_5jqSM,1633
|
|
115
114
|
xpk/parser/core.py,sha256=VRJerlS92ufoQbG1mZv7B04DAP4qGkBHa4pRXgcbAs0,4761
|
|
@@ -139,7 +138,6 @@ xpk/templates/kueue_sub_slicing_topology.yaml.j2,sha256=UXjpRFqCIcoebwcMeD9Lo4fe
|
|
|
139
138
|
xpk/templates/kueue_super_slicing_topology.yaml.j2,sha256=4WkSfQ2A5-jnKWiHWj2WXlv4sQmAcfxzbJCW-cWUE8E,264
|
|
140
139
|
xpk/templates/mtc-cpc.yaml,sha256=MPx75tog09kjRAvHoNOPCEobigQ17d7pYCUnZCevSDQ,340
|
|
141
140
|
xpk/templates/storage.yaml,sha256=AykdyMtDnKZF8Y_0BYxoYP03hEIzEk6iNalXAQHgAls,163
|
|
142
|
-
xpk/templates/volume_bundle.yaml,sha256=sqeag7GPWqGNQ5doZtO9IVAX_vKYRO73-aBE7waEtSY,129
|
|
143
141
|
xpk/utils/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
|
|
144
142
|
xpk/utils/console.py,sha256=AJWSyjuWyLjb7SYt8kPb0gw9N84EN9LbLxYCXjC-6Ds,2464
|
|
145
143
|
xpk/utils/console_test.py,sha256=x1v7v9VrIZwAKH-eOzj1lAY4EsHxJ6ruhfEOzpssO6o,2944
|
|
@@ -162,9 +160,9 @@ xpk/utils/validation.py,sha256=irL9579RbvwxiGn1t3zhhPo-0oHgdUPOSYsUuFqsDSM,3039
|
|
|
162
160
|
xpk/utils/validation_test.py,sha256=PEDSMUqZdt_Lx1FSR-LOTXKKtsJ47JH1fxugM0Gfz6Y,1168
|
|
163
161
|
xpk/utils/versions.py,sha256=_Ep68W70a9605XjiaOOpBa9Is9jXlsoOiwL8v5Xt-WA,897
|
|
164
162
|
xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
|
|
165
|
-
xpk-0.17.
|
|
166
|
-
xpk-0.17.
|
|
167
|
-
xpk-0.17.
|
|
168
|
-
xpk-0.17.
|
|
169
|
-
xpk-0.17.
|
|
170
|
-
xpk-0.17.
|
|
163
|
+
xpk-0.17.2.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
164
|
+
xpk-0.17.2.dist-info/METADATA,sha256=_G5EPL08DVbtGWPXVmHAg_HxH_-op5be3Fx1rWRJiwI,7930
|
|
165
|
+
xpk-0.17.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
166
|
+
xpk-0.17.2.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
|
|
167
|
+
xpk-0.17.2.dist-info/top_level.txt,sha256=TQKZWgV7LSElvmunYT9V_627qOMoxq3qYzWAFzKudB8,16
|
|
168
|
+
xpk-0.17.2.dist-info/RECORD,,
|
xpk/core/kjob.py
DELETED
|
@@ -1,473 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Copyright 2024 Google LLC
|
|
3
|
-
|
|
4
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
you may not use this file except in compliance with the License.
|
|
6
|
-
You may obtain a copy of the License at
|
|
7
|
-
|
|
8
|
-
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
|
|
10
|
-
Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
See the License for the specific language governing permissions and
|
|
14
|
-
limitations under the License.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
from argparse import Namespace
|
|
18
|
-
from enum import Enum
|
|
19
|
-
|
|
20
|
-
import yaml
|
|
21
|
-
from kubernetes import client as k8s_client
|
|
22
|
-
from kubernetes.client import ApiClient
|
|
23
|
-
from kubernetes.client.rest import ApiException
|
|
24
|
-
|
|
25
|
-
from ..utils import templates
|
|
26
|
-
from ..utils.execution_context import is_dry_run
|
|
27
|
-
from ..utils.console import xpk_exit, xpk_print
|
|
28
|
-
from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
|
|
29
|
-
from .commands import (
|
|
30
|
-
run_command_for_value,
|
|
31
|
-
run_command_with_updates,
|
|
32
|
-
run_kubectl_apply,
|
|
33
|
-
)
|
|
34
|
-
from .config import (
|
|
35
|
-
KJOB_BATCH_IMAGE,
|
|
36
|
-
KJOB_BATCH_WORKING_DIRECTORY,
|
|
37
|
-
KJOB_SHELL_IMAGE,
|
|
38
|
-
KJOB_SHELL_INTERACTIVE_COMMAND,
|
|
39
|
-
KJOB_SHELL_WORKING_DIRECTORY,
|
|
40
|
-
get_config,
|
|
41
|
-
)
|
|
42
|
-
from .network import get_cluster_subnetworks
|
|
43
|
-
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
44
|
-
from .resources import get_cluster_system_characteristics
|
|
45
|
-
from .storage import (
|
|
46
|
-
GCS_FUSE_ANNOTATIONS,
|
|
47
|
-
PARALLELSTORE_ANNOTATIONS,
|
|
48
|
-
get_auto_mount_gcsfuse_storages,
|
|
49
|
-
get_auto_mount_parallelstore_storages,
|
|
50
|
-
get_auto_mount_storages,
|
|
51
|
-
)
|
|
52
|
-
from .workload_decorators import (
|
|
53
|
-
rdma_decorator,
|
|
54
|
-
tcpxo_decorator,
|
|
55
|
-
)
|
|
56
|
-
from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
|
|
57
|
-
|
|
58
|
-
KJOB_API_GROUP_NAME = "kjobctl.x-k8s.io"
|
|
59
|
-
KJOB_API_GROUP_VERSION = "v1alpha1"
|
|
60
|
-
KJOB_API_VOLUME_BUNDLE_PLURAL = "volumebundles"
|
|
61
|
-
VOLUME_BUNDLE_TEMPLATE_PATH = "/../templates/volume_bundle.yaml"
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
class AppProfileDefaults(Enum):
|
|
65
|
-
NAME = "xpk-def-app-profile"
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class JobTemplateDefaults(Enum):
|
|
69
|
-
NAME = "xpk-def-batch"
|
|
70
|
-
PARALLELISM = 1
|
|
71
|
-
COMPLETIONS = 1
|
|
72
|
-
CONTAINER_NAME = "xpk-batch-container"
|
|
73
|
-
IMAGE = "ubuntu:22.04"
|
|
74
|
-
WORKING_DIRECTORY = "/"
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
class PodTemplateDefaults(Enum):
|
|
78
|
-
NAME = "xpk-def-pod"
|
|
79
|
-
CONTAINER_NAME = "xpk-interactive-container"
|
|
80
|
-
IMAGE = "busybox:1.28"
|
|
81
|
-
WORKING_DIRECTORY = "/"
|
|
82
|
-
INTERACTIVE_COMMAND = "/bin/sh"
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
job_template_yaml = """
|
|
86
|
-
apiVersion: kjobctl.x-k8s.io/v1alpha1
|
|
87
|
-
kind: JobTemplate
|
|
88
|
-
metadata:
|
|
89
|
-
name: {name}
|
|
90
|
-
namespace: default
|
|
91
|
-
template:
|
|
92
|
-
spec:
|
|
93
|
-
parallelism: {parallelism}
|
|
94
|
-
completions: {completions}
|
|
95
|
-
completionMode: Indexed
|
|
96
|
-
template:
|
|
97
|
-
spec:
|
|
98
|
-
dnsPolicy: ClusterFirstWithHostNet
|
|
99
|
-
tolerations:
|
|
100
|
-
- operator: "Exists"
|
|
101
|
-
key: nvidia.com/gpu
|
|
102
|
-
containers:
|
|
103
|
-
- name: {container_name}
|
|
104
|
-
image: {image}
|
|
105
|
-
workingDir: {working_directory}
|
|
106
|
-
{resources}
|
|
107
|
-
{node_selector}
|
|
108
|
-
priorityClassName: {priority}
|
|
109
|
-
restartPolicy: OnFailure
|
|
110
|
-
serviceAccountName: {service_account}
|
|
111
|
-
"""
|
|
112
|
-
job_node_selector_template = """
|
|
113
|
-
nodeSelector:
|
|
114
|
-
cloud.google.com/gke-accelerator: {gpu_name}
|
|
115
|
-
"""
|
|
116
|
-
job_resources_template = """
|
|
117
|
-
resources:
|
|
118
|
-
limits:
|
|
119
|
-
nvidia.com/gpu: {gpu_per_node}
|
|
120
|
-
"""
|
|
121
|
-
|
|
122
|
-
app_profile_yaml = """
|
|
123
|
-
apiVersion: kjobctl.x-k8s.io/v1alpha1
|
|
124
|
-
kind: ApplicationProfile
|
|
125
|
-
metadata:
|
|
126
|
-
name: {name}
|
|
127
|
-
namespace: default
|
|
128
|
-
spec:
|
|
129
|
-
supportedModes:
|
|
130
|
-
- name: Slurm
|
|
131
|
-
template: {batch_template}
|
|
132
|
-
requiredFlags: []
|
|
133
|
-
- name: Interactive
|
|
134
|
-
template: {interactive_template}
|
|
135
|
-
volumeBundles: {volume_bundles}
|
|
136
|
-
"""
|
|
137
|
-
|
|
138
|
-
pod_template_yaml = """
|
|
139
|
-
apiVersion: v1
|
|
140
|
-
kind: PodTemplate
|
|
141
|
-
metadata:
|
|
142
|
-
name: {name}
|
|
143
|
-
namespace: default
|
|
144
|
-
template:
|
|
145
|
-
spec:
|
|
146
|
-
tolerations:
|
|
147
|
-
- effect: NoSchedule
|
|
148
|
-
key: components.gke.io/gke-managed-components
|
|
149
|
-
operator: Equal
|
|
150
|
-
value: "true"
|
|
151
|
-
containers:
|
|
152
|
-
- name: {container_name}
|
|
153
|
-
image: {image}
|
|
154
|
-
command: [{interactive_command}]
|
|
155
|
-
workingDir: {working_directory}
|
|
156
|
-
initContainers:
|
|
157
|
-
- name: init
|
|
158
|
-
image: {image}
|
|
159
|
-
command: ['/bin/mkdir', '-p', '{working_directory}']
|
|
160
|
-
serviceAccountName: {service_account}
|
|
161
|
-
"""
|
|
162
|
-
|
|
163
|
-
Kueue_TAS_annotation = "kueue.x-k8s.io/podset-preferred-topology=cloud.google.com/gce-topology-host"
|
|
164
|
-
|
|
165
|
-
default_interface_annotation = "networking.gke.io/default-interface=eth0"
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
def get_a4_pod_template_annotations() -> tuple[str, str]:
|
|
169
|
-
sub_networks = get_cluster_subnetworks()
|
|
170
|
-
interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
|
|
171
|
-
sub_networks
|
|
172
|
-
)
|
|
173
|
-
|
|
174
|
-
return (
|
|
175
|
-
default_interface_annotation,
|
|
176
|
-
f"{interfaces_key}=$'{interfaces_value}'",
|
|
177
|
-
)
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
def get_a3ultra_pod_template_annotations() -> tuple[str, str]:
|
|
181
|
-
sub_networks = get_cluster_subnetworks()
|
|
182
|
-
interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
|
|
183
|
-
sub_networks
|
|
184
|
-
)
|
|
185
|
-
|
|
186
|
-
return (
|
|
187
|
-
default_interface_annotation,
|
|
188
|
-
f"{interfaces_key}=$'{interfaces_value}'",
|
|
189
|
-
)
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
def get_a3mega_pod_template_annotations() -> tuple[str, str, str]:
|
|
193
|
-
"""Adds or updates annotations in the Pod template."""
|
|
194
|
-
sub_networks = get_cluster_subnetworks()
|
|
195
|
-
tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
|
|
196
|
-
interfaces_key, interfaces_value = tcpxo_decorator.get_interfaces_entry(
|
|
197
|
-
sub_networks
|
|
198
|
-
)
|
|
199
|
-
tcpxo = f"{tcpxo_deamon_key}=$'{tcpxo_deamon_paths}'"
|
|
200
|
-
interfaces = f"{interfaces_key}=$'{interfaces_value}'"
|
|
201
|
-
return tcpxo, interfaces, default_interface_annotation
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def verify_kjob_installed() -> int:
|
|
205
|
-
"""Check if kjob is installed. If not provide user with proper communicate and exit.
|
|
206
|
-
Returns:
|
|
207
|
-
error code > if kjob not installed, otherwise 0
|
|
208
|
-
"""
|
|
209
|
-
command = "kubectl-kjob help"
|
|
210
|
-
task = "Verify kjob installation "
|
|
211
|
-
verify_kjob_installed_code, _ = run_command_for_value(command, task)
|
|
212
|
-
|
|
213
|
-
if verify_kjob_installed_code == 0:
|
|
214
|
-
xpk_print("kjob found")
|
|
215
|
-
return 0
|
|
216
|
-
|
|
217
|
-
if verify_kjob_installed_code != 0:
|
|
218
|
-
xpk_print(
|
|
219
|
-
" kjob not found. Please follow"
|
|
220
|
-
" https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md"
|
|
221
|
-
" to install kjob."
|
|
222
|
-
)
|
|
223
|
-
return verify_kjob_installed_code
|
|
224
|
-
return 0
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
def get_pod_template_interactive_command() -> str:
|
|
228
|
-
"""Gets the interactive command for PodTemplate from config otherwise the default value.
|
|
229
|
-
|
|
230
|
-
Args:
|
|
231
|
-
args - user provided arguments
|
|
232
|
-
Returns:
|
|
233
|
-
str - PodTemplate's interactive command
|
|
234
|
-
"""
|
|
235
|
-
pod_command = get_config().get(KJOB_SHELL_INTERACTIVE_COMMAND)
|
|
236
|
-
if pod_command is None or len(pod_command) == 0:
|
|
237
|
-
pod_command = PodTemplateDefaults.INTERACTIVE_COMMAND.value
|
|
238
|
-
|
|
239
|
-
return pod_command
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
def create_app_profile_instance(volume_bundles: list[str]) -> int:
|
|
243
|
-
"""Create new AppProfile instance on cluster with default settings.
|
|
244
|
-
|
|
245
|
-
Args:
|
|
246
|
-
args - user provided arguments
|
|
247
|
-
Returns:
|
|
248
|
-
exit_code > 0 if creating AppProfile fails, 0 otherwise
|
|
249
|
-
"""
|
|
250
|
-
return run_kubectl_apply(
|
|
251
|
-
yml_string=app_profile_yaml.format(
|
|
252
|
-
name=AppProfileDefaults.NAME.value,
|
|
253
|
-
batch_template=JobTemplateDefaults.NAME.value,
|
|
254
|
-
interactive_template=PodTemplateDefaults.NAME.value,
|
|
255
|
-
volume_bundles=volume_bundles,
|
|
256
|
-
),
|
|
257
|
-
task="Creating AppProfile",
|
|
258
|
-
)
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
def decorate_job_template_with_gpu(
|
|
262
|
-
yml_string: str, system: SystemCharacteristics
|
|
263
|
-
) -> str:
|
|
264
|
-
job_spec = yaml.safe_load(yml_string)["template"]
|
|
265
|
-
kjob_decorator = (
|
|
266
|
-
system.gpu_config.kjob_decorator_fn
|
|
267
|
-
if system.gpu_config and system.gpu_config.kjob_decorator_fn
|
|
268
|
-
else None
|
|
269
|
-
)
|
|
270
|
-
if kjob_decorator:
|
|
271
|
-
job_spec = kjob_decorator(job_spec)
|
|
272
|
-
job_template_dict = yaml.safe_load(yml_string)
|
|
273
|
-
job_template_dict["template"] = job_spec
|
|
274
|
-
yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
|
|
275
|
-
return yaml_result
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
def create_job_template_instance(
|
|
279
|
-
args: Namespace,
|
|
280
|
-
system: SystemCharacteristics | None,
|
|
281
|
-
service_account: str,
|
|
282
|
-
) -> int:
|
|
283
|
-
"""Create new JobTemplate instance on cluster with default settings.
|
|
284
|
-
|
|
285
|
-
Args:
|
|
286
|
-
args - user provided arguments
|
|
287
|
-
Returns:
|
|
288
|
-
exit_code > 0 if creating JobTemplate fails, 0 otherwise
|
|
289
|
-
"""
|
|
290
|
-
job_image = get_config().get(KJOB_BATCH_IMAGE)
|
|
291
|
-
if job_image is None or len(job_image) == 0:
|
|
292
|
-
job_image = JobTemplateDefaults.IMAGE.value
|
|
293
|
-
working_directory = get_config().get(KJOB_BATCH_WORKING_DIRECTORY)
|
|
294
|
-
if working_directory is None or len(working_directory) == 0:
|
|
295
|
-
working_directory = JobTemplateDefaults.WORKING_DIRECTORY.value
|
|
296
|
-
resources = (
|
|
297
|
-
job_resources_template.format(gpu_per_node=system.chips_per_vm)
|
|
298
|
-
if system is not None and system.accelerator_type == AcceleratorType.GPU
|
|
299
|
-
else ""
|
|
300
|
-
)
|
|
301
|
-
|
|
302
|
-
node_selector = (
|
|
303
|
-
job_node_selector_template.format(gpu_name=system.gke_accelerator)
|
|
304
|
-
if system is not None and system.accelerator_type == AcceleratorType.GPU
|
|
305
|
-
else ""
|
|
306
|
-
)
|
|
307
|
-
yml_string = job_template_yaml.format(
|
|
308
|
-
name=JobTemplateDefaults.NAME.value,
|
|
309
|
-
parallelism=JobTemplateDefaults.PARALLELISM.value,
|
|
310
|
-
completions=JobTemplateDefaults.COMPLETIONS.value,
|
|
311
|
-
container_name=JobTemplateDefaults.CONTAINER_NAME.value,
|
|
312
|
-
image=job_image,
|
|
313
|
-
working_directory=working_directory,
|
|
314
|
-
resources=resources,
|
|
315
|
-
node_selector=node_selector,
|
|
316
|
-
priority=args.priority if hasattr(args, "priority") else "medium",
|
|
317
|
-
service_account=service_account,
|
|
318
|
-
)
|
|
319
|
-
if system is not None and system.accelerator_type == AcceleratorType.GPU:
|
|
320
|
-
yml_string = decorate_job_template_with_gpu(yml_string, system)
|
|
321
|
-
|
|
322
|
-
return run_kubectl_apply(
|
|
323
|
-
yml_string,
|
|
324
|
-
task="Creating JobTemplate",
|
|
325
|
-
)
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
def create_pod_template_instance(service_account: str) -> int:
|
|
329
|
-
"""Create new PodTemplate instance on cluster with default settings.
|
|
330
|
-
|
|
331
|
-
Returns:
|
|
332
|
-
exit_code > 0 if creating PodTemplate fails, 0 otherwise
|
|
333
|
-
"""
|
|
334
|
-
pod_image = get_config().get(KJOB_SHELL_IMAGE)
|
|
335
|
-
if pod_image is None or len(pod_image) == 0:
|
|
336
|
-
pod_image = PodTemplateDefaults.IMAGE.value
|
|
337
|
-
working_directory = get_config().get(KJOB_SHELL_WORKING_DIRECTORY)
|
|
338
|
-
if working_directory is None or len(working_directory) == 0:
|
|
339
|
-
working_directory = PodTemplateDefaults.WORKING_DIRECTORY.value
|
|
340
|
-
|
|
341
|
-
return run_kubectl_apply(
|
|
342
|
-
yml_string=pod_template_yaml.format(
|
|
343
|
-
name=PodTemplateDefaults.NAME.value,
|
|
344
|
-
container_name=PodTemplateDefaults.CONTAINER_NAME.value,
|
|
345
|
-
image=pod_image,
|
|
346
|
-
working_directory=working_directory,
|
|
347
|
-
interactive_command=get_pod_template_interactive_command(),
|
|
348
|
-
service_account=service_account,
|
|
349
|
-
),
|
|
350
|
-
task="Creating PodTemplate",
|
|
351
|
-
)
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
def prepare_kjob(args: Namespace) -> int:
|
|
355
|
-
system = get_cluster_system_characteristics(args)
|
|
356
|
-
|
|
357
|
-
storages = []
|
|
358
|
-
if not is_dry_run():
|
|
359
|
-
k8s_api_client = setup_k8s_env(args)
|
|
360
|
-
storages = get_auto_mount_storages(k8s_api_client)
|
|
361
|
-
|
|
362
|
-
service_account = ""
|
|
363
|
-
if len(storages) > 0:
|
|
364
|
-
service_account = XPK_SA
|
|
365
|
-
|
|
366
|
-
job_err_code = create_job_template_instance(args, system, service_account)
|
|
367
|
-
if job_err_code > 0:
|
|
368
|
-
return job_err_code
|
|
369
|
-
pod_err_code = create_pod_template_instance(service_account)
|
|
370
|
-
if pod_err_code > 0:
|
|
371
|
-
return pod_err_code
|
|
372
|
-
|
|
373
|
-
volume_bundles = [item.name for item in storages]
|
|
374
|
-
|
|
375
|
-
return create_app_profile_instance(volume_bundles)
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
def apply_kjob_crds() -> int:
|
|
379
|
-
"""Apply kjob CRDs on cluster.
|
|
380
|
-
|
|
381
|
-
This function install kjob CRDs files from kjobctl printcrds.
|
|
382
|
-
It creates all neccessary kjob CRDs.
|
|
383
|
-
|
|
384
|
-
Returns:
|
|
385
|
-
None
|
|
386
|
-
"""
|
|
387
|
-
command = "kubectl kjob printcrds | kubectl apply --server-side -f -"
|
|
388
|
-
task = "Create kjob CRDs on cluster"
|
|
389
|
-
return_code = run_command_with_updates(command, task)
|
|
390
|
-
if return_code != 0:
|
|
391
|
-
xpk_print(f"{task} returned ERROR {return_code}")
|
|
392
|
-
return return_code
|
|
393
|
-
xpk_print("Creating kjob CRDs succeeded")
|
|
394
|
-
return 0
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
def create_volume_bundle_instance(
|
|
398
|
-
k8s_api_client: ApiClient,
|
|
399
|
-
name: str,
|
|
400
|
-
manifest: list[dict],
|
|
401
|
-
readonly: bool,
|
|
402
|
-
mount_point: str,
|
|
403
|
-
) -> None:
|
|
404
|
-
"""
|
|
405
|
-
Creates a new VolumeBundle resource in the Kubernetes cluster.
|
|
406
|
-
|
|
407
|
-
This function reads a VolumeBundle template from a YAML file, populates it with
|
|
408
|
-
values from the provided arguments, and then creates the VolumeBundle object
|
|
409
|
-
in the cluster.
|
|
410
|
-
|
|
411
|
-
Args:
|
|
412
|
-
k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
|
|
413
|
-
args: An argparse Namespace object containing the arguments for creating
|
|
414
|
-
the Storage resource.
|
|
415
|
-
"""
|
|
416
|
-
data = templates.load(VOLUME_BUNDLE_TEMPLATE_PATH)
|
|
417
|
-
data["metadata"]["name"] = name
|
|
418
|
-
spec = data["spec"]
|
|
419
|
-
spec["volumes"] = []
|
|
420
|
-
spec["containerVolumeMounts"] = []
|
|
421
|
-
|
|
422
|
-
for obj in manifest:
|
|
423
|
-
if obj["kind"] == "PersistentVolumeClaim":
|
|
424
|
-
spec["volumes"].append({
|
|
425
|
-
"name": obj["metadata"]["name"],
|
|
426
|
-
"persistentVolumeClaim": {
|
|
427
|
-
"claimName": obj["metadata"]["name"],
|
|
428
|
-
"readOnly": readonly,
|
|
429
|
-
},
|
|
430
|
-
})
|
|
431
|
-
spec["containerVolumeMounts"].append({
|
|
432
|
-
"name": obj["metadata"]["name"],
|
|
433
|
-
"mountPath": mount_point,
|
|
434
|
-
})
|
|
435
|
-
|
|
436
|
-
data["spec"] = spec
|
|
437
|
-
|
|
438
|
-
api_instance = k8s_client.CustomObjectsApi(k8s_api_client)
|
|
439
|
-
try:
|
|
440
|
-
api_instance.create_namespaced_custom_object(
|
|
441
|
-
namespace=DEFAULT_NAMESPACE,
|
|
442
|
-
group=KJOB_API_GROUP_NAME,
|
|
443
|
-
version=KJOB_API_GROUP_VERSION,
|
|
444
|
-
plural=KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
445
|
-
body=data,
|
|
446
|
-
)
|
|
447
|
-
xpk_print(
|
|
448
|
-
f"Created {KJOB_API_VOLUME_BUNDLE_PLURAL}.{KJOB_API_GROUP_NAME} object:"
|
|
449
|
-
f" {data['metadata']['name']}"
|
|
450
|
-
)
|
|
451
|
-
except ApiException as e:
|
|
452
|
-
if e.status == 409:
|
|
453
|
-
xpk_print(f"VolumeBundle: {name} already exists. Skipping its creation")
|
|
454
|
-
else:
|
|
455
|
-
xpk_print(f"Encountered error during VolumeBundle creation: {e}")
|
|
456
|
-
xpk_exit(1)
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
def get_storage_annotations(args: Namespace) -> list[str]:
|
|
460
|
-
annotations = []
|
|
461
|
-
k8s_api_client = setup_k8s_env(args)
|
|
462
|
-
|
|
463
|
-
gcsfuse_storages = get_auto_mount_gcsfuse_storages(k8s_api_client)
|
|
464
|
-
if len(gcsfuse_storages) > 0:
|
|
465
|
-
for key, value in GCS_FUSE_ANNOTATIONS.items():
|
|
466
|
-
annotations.append(f"{key}={value}")
|
|
467
|
-
|
|
468
|
-
parallelstore_storages = get_auto_mount_parallelstore_storages(k8s_api_client)
|
|
469
|
-
if len(parallelstore_storages) > 0:
|
|
470
|
-
for key, value in PARALLELSTORE_ANNOTATIONS.items():
|
|
471
|
-
annotations.append(f"{key}={value}")
|
|
472
|
-
|
|
473
|
-
return annotations
|
xpk/templates/volume_bundle.yaml
DELETED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|