xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -13
- xpk/commands/cluster.py +240 -71
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/common.py +33 -1
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +30 -18
- xpk/commands/run.py +17 -12
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +75 -19
- xpk/commands/workload.py +161 -324
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +335 -45
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +193 -12
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +5 -1
- xpk/core/gcsfuse.py +27 -6
- xpk/core/kjob.py +66 -20
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/nap.py +4 -0
- xpk/core/network.py +34 -22
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/resources.py +21 -0
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +28 -83
- xpk/core/workload_decorators/rdma_decorator.py +11 -15
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
- xpk/parser/cluster.py +574 -381
- xpk/parser/storage.py +25 -5
- xpk/parser/workload.py +59 -31
- xpk/utils/kubectl.py +4 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/parser/storage.py
CHANGED
|
@@ -70,10 +70,10 @@ def add_storage_attach_parser(
|
|
|
70
70
|
'--type',
|
|
71
71
|
type=str,
|
|
72
72
|
help=(
|
|
73
|
-
'The type of storage. Currently supported types:
|
|
74
|
-
' "gcpfilestore"
|
|
73
|
+
'The type of storage. Currently supported types: "gcsfuse",'
|
|
74
|
+
' "gcpfilestore", "parallelstore", "pd"'
|
|
75
75
|
),
|
|
76
|
-
choices=['gcsfuse', 'gcpfilestore'],
|
|
76
|
+
choices=['gcsfuse', 'gcpfilestore', 'parallelstore', 'pd'],
|
|
77
77
|
required=True,
|
|
78
78
|
)
|
|
79
79
|
add_cluster_arguments(req_args, required=True)
|
|
@@ -114,6 +114,15 @@ def add_storage_attach_parser(
|
|
|
114
114
|
' is infered as a bucket name.'
|
|
115
115
|
),
|
|
116
116
|
)
|
|
117
|
+
gcsfuse_args.add_argument(
|
|
118
|
+
'--prefetch-metadata',
|
|
119
|
+
action=argparse.BooleanOptionalAction,
|
|
120
|
+
default=True,
|
|
121
|
+
help=(
|
|
122
|
+
'(optional) Enables metadata pre-population when'
|
|
123
|
+
' mounting the volume. True by default.'
|
|
124
|
+
),
|
|
125
|
+
)
|
|
117
126
|
|
|
118
127
|
gcpfilestore_args = storage_attach_parser.add_argument_group(
|
|
119
128
|
'Filestore arguments',
|
|
@@ -146,13 +155,19 @@ def add_storage_attach_parser(
|
|
|
146
155
|
|
|
147
156
|
opt_args = storage_attach_parser.add_argument_group(
|
|
148
157
|
'Optional Arguments',
|
|
149
|
-
'Optional arguments for storage
|
|
158
|
+
'Optional arguments for storage attach.',
|
|
150
159
|
)
|
|
151
160
|
opt_args.add_argument(
|
|
152
161
|
'--manifest',
|
|
153
162
|
type=str,
|
|
154
163
|
help='Path to manifest file containing volume definitions',
|
|
155
164
|
)
|
|
165
|
+
opt_args.add_argument(
|
|
166
|
+
'--mount-options',
|
|
167
|
+
type=str,
|
|
168
|
+
help='Comma-separated list of mountOptions for PersistentVolume',
|
|
169
|
+
default='implicit-dirs',
|
|
170
|
+
)
|
|
156
171
|
add_kind_cluster_arguments(opt_args)
|
|
157
172
|
|
|
158
173
|
|
|
@@ -184,7 +199,6 @@ def add_storage_create_parser(
|
|
|
184
199
|
),
|
|
185
200
|
required=True,
|
|
186
201
|
)
|
|
187
|
-
|
|
188
202
|
req_args.add_argument(
|
|
189
203
|
'--type',
|
|
190
204
|
type=str,
|
|
@@ -248,6 +262,12 @@ def add_storage_create_parser(
|
|
|
248
262
|
type=str,
|
|
249
263
|
help='Path to manifest file containing volume definitions',
|
|
250
264
|
)
|
|
265
|
+
opt_args.add_argument(
|
|
266
|
+
'--mount-options',
|
|
267
|
+
type=str,
|
|
268
|
+
help='Comma-separated list of mountOptions for PersistentVolume',
|
|
269
|
+
default='',
|
|
270
|
+
)
|
|
251
271
|
|
|
252
272
|
add_kind_cluster_arguments(opt_args)
|
|
253
273
|
|
xpk/parser/workload.py
CHANGED
|
@@ -134,6 +134,24 @@ def set_workload_parsers(workload_parser):
|
|
|
134
134
|
' to use `gke.io/topology-aware-auto`.'
|
|
135
135
|
),
|
|
136
136
|
)
|
|
137
|
+
workload_create_parser_optional_arguments.add_argument(
|
|
138
|
+
'--ramdisk-directory',
|
|
139
|
+
type=str,
|
|
140
|
+
default='',
|
|
141
|
+
help=(
|
|
142
|
+
'The directory of the locally mounted RAM disk. This is only to'
|
|
143
|
+
' be used with the CSI driver provided by GKE.'
|
|
144
|
+
),
|
|
145
|
+
)
|
|
146
|
+
workload_create_parser_optional_arguments.add_argument(
|
|
147
|
+
'--mtc-enabled',
|
|
148
|
+
action='store_true',
|
|
149
|
+
help=(
|
|
150
|
+
'The workload can use multi-tier checkpointing controllers when the'
|
|
151
|
+
' --ramdisk-directory argument is used with this additional'
|
|
152
|
+
' argument.'
|
|
153
|
+
),
|
|
154
|
+
)
|
|
137
155
|
workload_create_parser_optional_arguments.add_argument(
|
|
138
156
|
'--debug-dump-gcs',
|
|
139
157
|
type=str,
|
|
@@ -161,6 +179,19 @@ def set_workload_parsers(workload_parser):
|
|
|
161
179
|
' create Pathways workloads.'
|
|
162
180
|
),
|
|
163
181
|
)
|
|
182
|
+
workload_create_parser_optional_arguments.add_argument(
|
|
183
|
+
'--restart-on-exit-codes',
|
|
184
|
+
type=str,
|
|
185
|
+
default=None,
|
|
186
|
+
help=(
|
|
187
|
+
'Adding this argument specifies additional user-defined exit codes'
|
|
188
|
+
' that allow restarting the workload when --max-restarts is set to'
|
|
189
|
+
' a value greater than 0. By default, workloads restart on exit'
|
|
190
|
+
' codes 42 and 127-255. Any exit codes provided through this flag'
|
|
191
|
+
' will be included alongside the default codes for restarting'
|
|
192
|
+
' conditions.'
|
|
193
|
+
),
|
|
194
|
+
)
|
|
164
195
|
|
|
165
196
|
# Autoprovisioning workload arguments
|
|
166
197
|
workload_create_autoprovisioning_arguments.add_argument(
|
|
@@ -244,9 +275,7 @@ def set_workload_parsers(workload_parser):
|
|
|
244
275
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
245
276
|
'--proxy-server-image',
|
|
246
277
|
type=str,
|
|
247
|
-
default=
|
|
248
|
-
'us-docker.pkg.dev/cloud-tpu-v2-images/pathways/proxy_server:latest'
|
|
249
|
-
),
|
|
278
|
+
default='',
|
|
250
279
|
help=(
|
|
251
280
|
'Please provide the proxy server image for Pathways. This arg can'
|
|
252
281
|
' only be used in `xpk workload create-pathways`.'
|
|
@@ -255,7 +284,7 @@ def set_workload_parsers(workload_parser):
|
|
|
255
284
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
256
285
|
'--server-image',
|
|
257
286
|
type=str,
|
|
258
|
-
default='
|
|
287
|
+
default='',
|
|
259
288
|
help=(
|
|
260
289
|
'Please provide the server image for Pathways. This arg can only be'
|
|
261
290
|
' used in `xpk workload create-pathways`.'
|
|
@@ -293,7 +322,7 @@ def set_workload_parsers(workload_parser):
|
|
|
293
322
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
294
323
|
'--custom-pathways-server-args',
|
|
295
324
|
type=str,
|
|
296
|
-
default=
|
|
325
|
+
default='',
|
|
297
326
|
help=(
|
|
298
327
|
'Provide custom Pathways server args as follows -'
|
|
299
328
|
" --custom-pathways-server-args='--arg_1=xxx --arg2=yyy'"
|
|
@@ -304,7 +333,7 @@ def set_workload_parsers(workload_parser):
|
|
|
304
333
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
305
334
|
'--custom-pathways-proxy-server-args',
|
|
306
335
|
type=str,
|
|
307
|
-
default=
|
|
336
|
+
default='',
|
|
308
337
|
help=(
|
|
309
338
|
'Provide custom Pathways proxy server args as follows -'
|
|
310
339
|
" --custom-pathways-proxy-server-args='--arg_1=xxx --arg2=yyy'"
|
|
@@ -315,7 +344,7 @@ def set_workload_parsers(workload_parser):
|
|
|
315
344
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
316
345
|
'--custom-pathways-worker-args',
|
|
317
346
|
type=str,
|
|
318
|
-
default=
|
|
347
|
+
default='',
|
|
319
348
|
help=(
|
|
320
349
|
'Provide custom Pathways worker args as follows -'
|
|
321
350
|
" --custom-pathways-worker-args='--arg_1=xxx --arg2=yyy'"
|
|
@@ -323,6 +352,27 @@ def set_workload_parsers(workload_parser):
|
|
|
323
352
|
required=False,
|
|
324
353
|
)
|
|
325
354
|
|
|
355
|
+
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
356
|
+
'--elastic-slices',
|
|
357
|
+
type=int,
|
|
358
|
+
default=0,
|
|
359
|
+
help=(
|
|
360
|
+
'Enable elastic slices in Pathways and specify'
|
|
361
|
+
' the number of slices the workload could lose.'
|
|
362
|
+
),
|
|
363
|
+
required=False,
|
|
364
|
+
)
|
|
365
|
+
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
366
|
+
'--max-slice-restarts',
|
|
367
|
+
type=int,
|
|
368
|
+
default=1,
|
|
369
|
+
help=(
|
|
370
|
+
'Specify the maximum times the workers in a slice can be'
|
|
371
|
+
' restarted. Used with --elastic-slices for Pathways workloads.'
|
|
372
|
+
),
|
|
373
|
+
required=False,
|
|
374
|
+
)
|
|
375
|
+
|
|
326
376
|
add_shared_workload_create_required_arguments([
|
|
327
377
|
workload_create_parser_required_arguments,
|
|
328
378
|
workload_create_pathways_parser_required_arguments,
|
|
@@ -583,9 +633,9 @@ def add_shared_workload_create_optional_arguments(args_parsers):
|
|
|
583
633
|
),
|
|
584
634
|
)
|
|
585
635
|
custom_parser.add_argument(
|
|
586
|
-
'--
|
|
636
|
+
'--colocated-python-sidecar-image',
|
|
587
637
|
type=str,
|
|
588
|
-
default=
|
|
638
|
+
default='',
|
|
589
639
|
help='Remote Python sidecar server image.',
|
|
590
640
|
)
|
|
591
641
|
custom_parser.add_argument(
|
|
@@ -596,28 +646,6 @@ def add_shared_workload_create_optional_arguments(args_parsers):
|
|
|
596
646
|
' the workload.'
|
|
597
647
|
),
|
|
598
648
|
)
|
|
599
|
-
custom_parser.add_argument(
|
|
600
|
-
'--restart-on-exit-codes',
|
|
601
|
-
type=str,
|
|
602
|
-
default=None,
|
|
603
|
-
help=(
|
|
604
|
-
'Adding this argument specifies additional user-defined exit codes'
|
|
605
|
-
' that allow restarting the workload when --max-restarts is set to'
|
|
606
|
-
' a value greater than 0. By default, workloads restart on exit'
|
|
607
|
-
' codes 42 and 127-255. Any exit codes provided through this flag'
|
|
608
|
-
' will be included alongside the default codes for restarting'
|
|
609
|
-
' conditions.'
|
|
610
|
-
),
|
|
611
|
-
)
|
|
612
|
-
custom_parser.add_argument(
|
|
613
|
-
'--ramdisk-directory',
|
|
614
|
-
type=str,
|
|
615
|
-
default='',
|
|
616
|
-
help=(
|
|
617
|
-
'The directory of the locally mounted RAM disk. This is only to'
|
|
618
|
-
' be used with the CSI driver provided by GKE.'
|
|
619
|
-
),
|
|
620
|
-
)
|
|
621
649
|
|
|
622
650
|
|
|
623
651
|
def add_shared_workload_create_env_arguments(args_parsers):
|
xpk/utils/kubectl.py
CHANGED
|
@@ -20,10 +20,11 @@ from kubernetes.dynamic import DynamicClient
|
|
|
20
20
|
from .console import xpk_print
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
def apply_kubectl_manifest(client, manifest):
|
|
23
|
+
def apply_kubectl_manifest(client, manifest) -> int:
|
|
24
24
|
xpk_print('Applying manifest')
|
|
25
25
|
dynamic_client = DynamicClient(client)
|
|
26
26
|
|
|
27
|
+
status_code = 0
|
|
27
28
|
for obj in manifest:
|
|
28
29
|
api_version = obj['apiVersion']
|
|
29
30
|
kind = obj['kind']
|
|
@@ -55,3 +56,5 @@ def apply_kubectl_manifest(client, manifest):
|
|
|
55
56
|
)
|
|
56
57
|
else:
|
|
57
58
|
xpk_print(f'Error applying {kind}: {e}')
|
|
59
|
+
status_code = 1
|
|
60
|
+
return status_code
|