xpk 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -12
- xpk/commands/cluster.py +33 -16
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +23 -20
- xpk/commands/run.py +17 -11
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +64 -19
- xpk/commands/workload.py +154 -319
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +322 -32
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +75 -5
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +11 -3
- xpk/core/gcsfuse.py +8 -5
- xpk/core/kjob.py +57 -18
- xpk/core/nap.py +4 -0
- xpk/core/network.py +11 -21
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +27 -82
- xpk/core/workload_decorators/rdma_decorator.py +3 -3
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -2
- xpk/parser/cluster.py +15 -6
- xpk/parser/storage.py +14 -3
- xpk/parser/workload.py +59 -31
- {xpk-0.7.2.dist-info → xpk-0.8.0.dist-info}/METADATA +60 -4
- {xpk-0.7.2.dist-info → xpk-0.8.0.dist-info}/RECORD +40 -40
- {xpk-0.7.2.dist-info → xpk-0.8.0.dist-info}/WHEEL +0 -0
- {xpk-0.7.2.dist-info → xpk-0.8.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.2.dist-info → xpk-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.2.dist-info → xpk-0.8.0.dist-info}/top_level.txt +0 -0
xpk/core/workload.py
CHANGED
|
@@ -14,12 +14,19 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import yaml
|
|
18
|
+
|
|
19
|
+
from ..utils import templates
|
|
17
20
|
from ..utils.console import xpk_exit, xpk_print
|
|
18
21
|
from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE
|
|
19
22
|
from .commands import run_command_for_value
|
|
20
23
|
from .gcloud_context import zone_to_region
|
|
24
|
+
from .storage import Storage, get_storage_volume_mounts_for_gpu
|
|
21
25
|
from .system_characteristics import SystemCharacteristics
|
|
22
26
|
|
|
27
|
+
RXDM_CONTAINER_A3HIGH_PATH = '/../templates/rxdm_container_a3high.yaml'
|
|
28
|
+
RXDM_CONTAINER_A3MEGA_PATH = '/../templates/rxdm_container_a3mega.yaml'
|
|
29
|
+
|
|
23
30
|
|
|
24
31
|
def workload_list_awk_command(filter_key) -> str:
|
|
25
32
|
"""Function returns the awk command needed from the filter specified.
|
|
@@ -244,98 +251,36 @@ def wait_for_job_completion(args) -> int:
|
|
|
244
251
|
return 0
|
|
245
252
|
|
|
246
253
|
|
|
247
|
-
def
|
|
248
|
-
|
|
254
|
+
def add_gpu_rxdm_container(
|
|
255
|
+
jobset_manifest_str: str,
|
|
256
|
+
system: SystemCharacteristics,
|
|
257
|
+
all_storages: list[Storage],
|
|
258
|
+
) -> str:
|
|
259
|
+
"""Add gpu rxdm container to jobset manifest based on user provided arguments.
|
|
249
260
|
|
|
250
261
|
Args:
|
|
262
|
+
jobset_manifest_str: the JobSet manifest as a YAML string.
|
|
251
263
|
system: system characteristics.
|
|
264
|
+
all_storages: list of all storages.
|
|
252
265
|
|
|
253
266
|
Returns:
|
|
254
|
-
str:
|
|
267
|
+
str: the modified JobSet manifest as a YAML string.
|
|
255
268
|
"""
|
|
256
|
-
gpu_volume = ''
|
|
257
269
|
if system.device_type == H100_DEVICE_TYPE:
|
|
258
|
-
|
|
259
|
-
hostPath:
|
|
260
|
-
path: /home/kubernetes/bin/nvidia/lib64
|
|
261
|
-
- name: tcpd-socket
|
|
262
|
-
hostPath:
|
|
263
|
-
path: /run/tcpx
|
|
264
|
-
- name: shared-memory
|
|
265
|
-
emptyDir:
|
|
266
|
-
medium: "Memory"
|
|
267
|
-
sizeLimit: 200Gi
|
|
268
|
-
- name: workload-terminated-volume
|
|
269
|
-
emptyDir:
|
|
270
|
-
- name: tcpx-nccl-plugin-volume
|
|
271
|
-
emptyDir:"""
|
|
270
|
+
gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3HIGH_PATH)
|
|
272
271
|
elif system.device_type == H100_MEGA_DEVICE_TYPE:
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
- name: shared-memory
|
|
277
|
-
emptyDir:
|
|
278
|
-
medium: "Memory"
|
|
279
|
-
sizeLimit: 1Gi
|
|
280
|
-
- name: workload-terminated-volume
|
|
281
|
-
emptyDir:"""
|
|
282
|
-
return gpu_volume
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
def get_gpu_rxdm_image(system: SystemCharacteristics) -> str:
|
|
286
|
-
"""Get config of rxdm based on user provided arguments.
|
|
287
|
-
|
|
288
|
-
Args:
|
|
289
|
-
system: system characteristics.
|
|
290
|
-
|
|
291
|
-
Returns:
|
|
292
|
-
str: yaml containing the rxdm name and image
|
|
293
|
-
"""
|
|
294
|
-
gpu_rxdm_image = ''
|
|
295
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
296
|
-
gpu_rxdm_image = """- name: tcpd-daemon
|
|
297
|
-
image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9"""
|
|
298
|
-
elif system.device_type == H100_MEGA_DEVICE_TYPE:
|
|
299
|
-
gpu_rxdm_image = """- name: fastrak-daemon
|
|
300
|
-
image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.9"""
|
|
301
|
-
return gpu_rxdm_image
|
|
272
|
+
gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3MEGA_PATH)
|
|
273
|
+
else:
|
|
274
|
+
return jobset_manifest_str
|
|
302
275
|
|
|
276
|
+
storage_volume_mounts = get_storage_volume_mounts_for_gpu(all_storages)
|
|
277
|
+
gpu_rxdm_container['volumeMounts'].extend(storage_volume_mounts)
|
|
303
278
|
|
|
304
|
-
|
|
305
|
-
"""Get rxdm command based on user provided arguments.
|
|
279
|
+
manifest = yaml.safe_load(jobset_manifest_str)
|
|
306
280
|
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
Returns:
|
|
311
|
-
str: command of running rxdm container
|
|
312
|
-
"""
|
|
313
|
-
gpu_rxdm_cmd = ''
|
|
314
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
315
|
-
gpu_rxdm_cmd = (
|
|
316
|
-
'/tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm'
|
|
317
|
-
' --gpu_shmem_type fd --setup_param "--verbose 128 2 0"'
|
|
281
|
+
for job in manifest['spec']['replicatedJobs']:
|
|
282
|
+
job['template']['spec']['template']['spec']['containers'].append(
|
|
283
|
+
gpu_rxdm_container
|
|
318
284
|
)
|
|
319
|
-
elif system.device_type == H100_MEGA_DEVICE_TYPE:
|
|
320
|
-
gpu_rxdm_cmd = (
|
|
321
|
-
'set -ex; chmod 755 /fts/entrypoint_rxdm_container.sh;'
|
|
322
|
-
' /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid='
|
|
323
|
-
' --alsologtostderr'
|
|
324
|
-
)
|
|
325
|
-
return gpu_rxdm_cmd
|
|
326
|
-
|
|
327
285
|
|
|
328
|
-
|
|
329
|
-
"""Get gpu tcp volume based on user provided arguments.
|
|
330
|
-
|
|
331
|
-
Args:
|
|
332
|
-
system: system characteristics.
|
|
333
|
-
|
|
334
|
-
Returns:
|
|
335
|
-
str: yaml containing gpu tcp volume
|
|
336
|
-
"""
|
|
337
|
-
gpu_tcp_volume = ''
|
|
338
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
339
|
-
gpu_tcp_volume = """- name: tcpd-socket
|
|
340
|
-
mountPath: /tmp"""
|
|
341
|
-
return gpu_tcp_volume
|
|
286
|
+
return yaml.dump(manifest, sort_keys=False)
|
|
@@ -33,7 +33,7 @@ def decorate_kjob_template(job_manifest) -> str:
|
|
|
33
33
|
return job_manifest
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
|
|
36
|
+
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
37
37
|
"""
|
|
38
38
|
Decorates a JobSet manifest with the necessary components for rdma-daemon.
|
|
39
39
|
|
|
@@ -80,12 +80,12 @@ def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
|
|
|
80
80
|
return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
|
|
81
81
|
|
|
82
82
|
|
|
83
|
-
def add_annotations(job_manifest, sub_networks):
|
|
83
|
+
def add_annotations(job_manifest: dict, sub_networks: list[str]):
|
|
84
84
|
"""Adds or updates annotations in the Pod template."""
|
|
85
85
|
annotations = job_manifest['spec']['template']['metadata']['annotations']
|
|
86
86
|
interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
|
|
87
87
|
annotations.update({
|
|
88
|
-
'networking.gke.io/default-interface':
|
|
88
|
+
'networking.gke.io/default-interface': 'eth0',
|
|
89
89
|
interfaces_key: interfaces_value,
|
|
90
90
|
})
|
|
91
91
|
|
|
@@ -16,7 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import yaml
|
|
18
18
|
|
|
19
|
-
from ...core.storage import GCS_FUSE_TYPE, get_storage_volumes_yaml_dict,
|
|
19
|
+
from ...core.storage import GCS_FUSE_TYPE, PARALLELSTORE_TYPE, get_storage_volumes_yaml_dict, GCS_FUSE_ANNOTATIONS, PARALLELSTORE_ANNOTATIONS
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
def decorate_jobset(jobset_manifest_str, storages) -> str:
|
|
@@ -42,9 +42,14 @@ def decorate_jobset(jobset_manifest_str, storages) -> str:
|
|
|
42
42
|
def add_annotations(job_manifest, storages):
|
|
43
43
|
"""Adds or updates storage annotations in the Pod template."""
|
|
44
44
|
annotations = job_manifest['spec']['template']['metadata']['annotations']
|
|
45
|
-
gcs_present =
|
|
45
|
+
gcs_present = any(storage.type == GCS_FUSE_TYPE for storage in storages)
|
|
46
46
|
if gcs_present:
|
|
47
|
-
annotations.update(
|
|
47
|
+
annotations.update(GCS_FUSE_ANNOTATIONS)
|
|
48
|
+
parallelstore_present = any(
|
|
49
|
+
storage.type == PARALLELSTORE_TYPE for storage in storages
|
|
50
|
+
)
|
|
51
|
+
if parallelstore_present:
|
|
52
|
+
annotations.update(PARALLELSTORE_ANNOTATIONS)
|
|
48
53
|
|
|
49
54
|
|
|
50
55
|
def add_volumes(job_manifest, storage_volumes):
|
|
@@ -57,7 +57,7 @@ def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
|
|
|
57
57
|
return job_manifest
|
|
58
58
|
|
|
59
59
|
|
|
60
|
-
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
|
|
60
|
+
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
61
61
|
"""
|
|
62
62
|
Decorates a JobSet manifest with the necessary components for tcpxo-daemon.
|
|
63
63
|
|
|
@@ -105,7 +105,7 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
|
|
|
105
105
|
)
|
|
106
106
|
|
|
107
107
|
|
|
108
|
-
def add_annotations(job_manifest, sub_networks):
|
|
108
|
+
def add_annotations(job_manifest: dict, sub_networks: list[str]):
|
|
109
109
|
"""Adds or updates annotations in the Pod template."""
|
|
110
110
|
annotations = job_manifest['spec']['template']['metadata']['annotations']
|
|
111
111
|
tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
|
xpk/parser/cluster.py
CHANGED
|
@@ -107,6 +107,7 @@ def set_cluster_parser(cluster_parser):
|
|
|
107
107
|
' enable cluster to accept Pathways workloads.'
|
|
108
108
|
),
|
|
109
109
|
)
|
|
110
|
+
|
|
110
111
|
### Autoprovisioning arguments specific to "cluster create"
|
|
111
112
|
cluster_create_autoprovisioning_arguments = (
|
|
112
113
|
cluster_create_parser.add_argument_group(
|
|
@@ -462,7 +463,7 @@ def add_shared_cluster_create_optional_arguments(args_parsers):
|
|
|
462
463
|
custom_parser.add_argument(
|
|
463
464
|
'--pathways-gce-machine-type',
|
|
464
465
|
type=str,
|
|
465
|
-
default='
|
|
466
|
+
default='n2-standard-64',
|
|
466
467
|
help='The CPU type for Pathways CPU nodepools',
|
|
467
468
|
)
|
|
468
469
|
custom_parser.add_argument(
|
|
@@ -580,14 +581,22 @@ def add_shared_cluster_create_optional_arguments(args_parsers):
|
|
|
580
581
|
' Identity is enabled by default.'
|
|
581
582
|
),
|
|
582
583
|
)
|
|
583
|
-
|
|
584
584
|
custom_parser.add_argument(
|
|
585
585
|
'--enable-gcpfilestore-csi-driver',
|
|
586
586
|
action='store_true',
|
|
587
|
-
help=
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
587
|
+
help='Enable GCPFilestore driver on the cluster.',
|
|
588
|
+
)
|
|
589
|
+
|
|
590
|
+
custom_parser.add_argument(
|
|
591
|
+
'--enable-parallelstore-csi-driver',
|
|
592
|
+
action='store_true',
|
|
593
|
+
help='Enable Parallelstore CSI driver on the cluster.',
|
|
594
|
+
)
|
|
595
|
+
|
|
596
|
+
custom_parser.add_argument(
|
|
597
|
+
'--enable-pd-csi-driver',
|
|
598
|
+
action='store_true',
|
|
599
|
+
help='Enable PersistentDisk CSI driver on the cluster.',
|
|
591
600
|
)
|
|
592
601
|
|
|
593
602
|
|
xpk/parser/storage.py
CHANGED
|
@@ -73,7 +73,7 @@ def add_storage_attach_parser(
|
|
|
73
73
|
'The type of storage. Currently supported types: ["gcsfuse",'
|
|
74
74
|
' "gcpfilestore"]'
|
|
75
75
|
),
|
|
76
|
-
choices=['gcsfuse', 'gcpfilestore'],
|
|
76
|
+
choices=['gcsfuse', 'gcpfilestore', 'parallelstore', 'pd'],
|
|
77
77
|
required=True,
|
|
78
78
|
)
|
|
79
79
|
add_cluster_arguments(req_args, required=True)
|
|
@@ -146,13 +146,19 @@ def add_storage_attach_parser(
|
|
|
146
146
|
|
|
147
147
|
opt_args = storage_attach_parser.add_argument_group(
|
|
148
148
|
'Optional Arguments',
|
|
149
|
-
'Optional arguments for storage
|
|
149
|
+
'Optional arguments for storage attach.',
|
|
150
150
|
)
|
|
151
151
|
opt_args.add_argument(
|
|
152
152
|
'--manifest',
|
|
153
153
|
type=str,
|
|
154
154
|
help='Path to manifest file containing volume definitions',
|
|
155
155
|
)
|
|
156
|
+
opt_args.add_argument(
|
|
157
|
+
'--mount-options',
|
|
158
|
+
type=str,
|
|
159
|
+
help='Comma-separated list of mountOptions for PersistentVolume',
|
|
160
|
+
default='implicit-dirs',
|
|
161
|
+
)
|
|
156
162
|
add_kind_cluster_arguments(opt_args)
|
|
157
163
|
|
|
158
164
|
|
|
@@ -184,7 +190,6 @@ def add_storage_create_parser(
|
|
|
184
190
|
),
|
|
185
191
|
required=True,
|
|
186
192
|
)
|
|
187
|
-
|
|
188
193
|
req_args.add_argument(
|
|
189
194
|
'--type',
|
|
190
195
|
type=str,
|
|
@@ -248,6 +253,12 @@ def add_storage_create_parser(
|
|
|
248
253
|
type=str,
|
|
249
254
|
help='Path to manifest file containing volume definitions',
|
|
250
255
|
)
|
|
256
|
+
opt_args.add_argument(
|
|
257
|
+
'--mount-options',
|
|
258
|
+
type=str,
|
|
259
|
+
help='Comma-separated list of mountOptions for PersistentVolume',
|
|
260
|
+
default='',
|
|
261
|
+
)
|
|
251
262
|
|
|
252
263
|
add_kind_cluster_arguments(opt_args)
|
|
253
264
|
|
xpk/parser/workload.py
CHANGED
|
@@ -134,6 +134,24 @@ def set_workload_parsers(workload_parser):
|
|
|
134
134
|
' to use `gke.io/topology-aware-auto`.'
|
|
135
135
|
),
|
|
136
136
|
)
|
|
137
|
+
workload_create_parser_optional_arguments.add_argument(
|
|
138
|
+
'--ramdisk-directory',
|
|
139
|
+
type=str,
|
|
140
|
+
default='',
|
|
141
|
+
help=(
|
|
142
|
+
'The directory of the locally mounted RAM disk. This is only to'
|
|
143
|
+
' be used with the CSI driver provided by GKE.'
|
|
144
|
+
),
|
|
145
|
+
)
|
|
146
|
+
workload_create_parser_optional_arguments.add_argument(
|
|
147
|
+
'--mtc-enabled',
|
|
148
|
+
action='store_true',
|
|
149
|
+
help=(
|
|
150
|
+
'The workload can use multi-tier checkpointing controllers when the'
|
|
151
|
+
' --ramdisk-directory argument is used with this additional'
|
|
152
|
+
' argument.'
|
|
153
|
+
),
|
|
154
|
+
)
|
|
137
155
|
workload_create_parser_optional_arguments.add_argument(
|
|
138
156
|
'--debug-dump-gcs',
|
|
139
157
|
type=str,
|
|
@@ -161,6 +179,19 @@ def set_workload_parsers(workload_parser):
|
|
|
161
179
|
' create Pathways workloads.'
|
|
162
180
|
),
|
|
163
181
|
)
|
|
182
|
+
workload_create_parser_optional_arguments.add_argument(
|
|
183
|
+
'--restart-on-exit-codes',
|
|
184
|
+
type=str,
|
|
185
|
+
default=None,
|
|
186
|
+
help=(
|
|
187
|
+
'Adding this argument specifies additional user-defined exit codes'
|
|
188
|
+
' that allow restarting the workload when --max-restarts is set to'
|
|
189
|
+
' a value greater than 0. By default, workloads restart on exit'
|
|
190
|
+
' codes 42 and 127-255. Any exit codes provided through this flag'
|
|
191
|
+
' will be included alongside the default codes for restarting'
|
|
192
|
+
' conditions.'
|
|
193
|
+
),
|
|
194
|
+
)
|
|
164
195
|
|
|
165
196
|
# Autoprovisioning workload arguments
|
|
166
197
|
workload_create_autoprovisioning_arguments.add_argument(
|
|
@@ -244,9 +275,7 @@ def set_workload_parsers(workload_parser):
|
|
|
244
275
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
245
276
|
'--proxy-server-image',
|
|
246
277
|
type=str,
|
|
247
|
-
default=
|
|
248
|
-
'us-docker.pkg.dev/cloud-tpu-v2-images/pathways/proxy_server:latest'
|
|
249
|
-
),
|
|
278
|
+
default='',
|
|
250
279
|
help=(
|
|
251
280
|
'Please provide the proxy server image for Pathways. This arg can'
|
|
252
281
|
' only be used in `xpk workload create-pathways`.'
|
|
@@ -255,7 +284,7 @@ def set_workload_parsers(workload_parser):
|
|
|
255
284
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
256
285
|
'--server-image',
|
|
257
286
|
type=str,
|
|
258
|
-
default='
|
|
287
|
+
default='',
|
|
259
288
|
help=(
|
|
260
289
|
'Please provide the server image for Pathways. This arg can only be'
|
|
261
290
|
' used in `xpk workload create-pathways`.'
|
|
@@ -293,7 +322,7 @@ def set_workload_parsers(workload_parser):
|
|
|
293
322
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
294
323
|
'--custom-pathways-server-args',
|
|
295
324
|
type=str,
|
|
296
|
-
default=
|
|
325
|
+
default='',
|
|
297
326
|
help=(
|
|
298
327
|
'Provide custom Pathways server args as follows -'
|
|
299
328
|
" --custom-pathways-server-args='--arg_1=xxx --arg2=yyy'"
|
|
@@ -304,7 +333,7 @@ def set_workload_parsers(workload_parser):
|
|
|
304
333
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
305
334
|
'--custom-pathways-proxy-server-args',
|
|
306
335
|
type=str,
|
|
307
|
-
default=
|
|
336
|
+
default='',
|
|
308
337
|
help=(
|
|
309
338
|
'Provide custom Pathways proxy server args as follows -'
|
|
310
339
|
" --custom-pathways-proxy-server-args='--arg_1=xxx --arg2=yyy'"
|
|
@@ -315,7 +344,7 @@ def set_workload_parsers(workload_parser):
|
|
|
315
344
|
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
316
345
|
'--custom-pathways-worker-args',
|
|
317
346
|
type=str,
|
|
318
|
-
default=
|
|
347
|
+
default='',
|
|
319
348
|
help=(
|
|
320
349
|
'Provide custom Pathways worker args as follows -'
|
|
321
350
|
" --custom-pathways-worker-args='--arg_1=xxx --arg2=yyy'"
|
|
@@ -323,6 +352,27 @@ def set_workload_parsers(workload_parser):
|
|
|
323
352
|
required=False,
|
|
324
353
|
)
|
|
325
354
|
|
|
355
|
+
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
356
|
+
'--elastic-slices',
|
|
357
|
+
type=int,
|
|
358
|
+
default=0,
|
|
359
|
+
help=(
|
|
360
|
+
'Enable elastic slices in Pathways and specify'
|
|
361
|
+
' the number of slices the workload could lose.'
|
|
362
|
+
),
|
|
363
|
+
required=False,
|
|
364
|
+
)
|
|
365
|
+
workload_create_pathways_parser_optional_arguments.add_argument(
|
|
366
|
+
'--max-slice-restarts',
|
|
367
|
+
type=int,
|
|
368
|
+
default=1,
|
|
369
|
+
help=(
|
|
370
|
+
'Specify the maximum times the workers in a slice can be'
|
|
371
|
+
' restarted. Used with --elastic-slices for Pathways workloads.'
|
|
372
|
+
),
|
|
373
|
+
required=False,
|
|
374
|
+
)
|
|
375
|
+
|
|
326
376
|
add_shared_workload_create_required_arguments([
|
|
327
377
|
workload_create_parser_required_arguments,
|
|
328
378
|
workload_create_pathways_parser_required_arguments,
|
|
@@ -583,9 +633,9 @@ def add_shared_workload_create_optional_arguments(args_parsers):
|
|
|
583
633
|
),
|
|
584
634
|
)
|
|
585
635
|
custom_parser.add_argument(
|
|
586
|
-
'--
|
|
636
|
+
'--colocated-python-sidecar-image',
|
|
587
637
|
type=str,
|
|
588
|
-
default=
|
|
638
|
+
default='',
|
|
589
639
|
help='Remote Python sidecar server image.',
|
|
590
640
|
)
|
|
591
641
|
custom_parser.add_argument(
|
|
@@ -596,28 +646,6 @@ def add_shared_workload_create_optional_arguments(args_parsers):
|
|
|
596
646
|
' the workload.'
|
|
597
647
|
),
|
|
598
648
|
)
|
|
599
|
-
custom_parser.add_argument(
|
|
600
|
-
'--restart-on-exit-codes',
|
|
601
|
-
type=str,
|
|
602
|
-
default=None,
|
|
603
|
-
help=(
|
|
604
|
-
'Adding this argument specifies additional user-defined exit codes'
|
|
605
|
-
' that allow restarting the workload when --max-restarts is set to'
|
|
606
|
-
' a value greater than 0. By default, workloads restart on exit'
|
|
607
|
-
' codes 42 and 127-255. Any exit codes provided through this flag'
|
|
608
|
-
' will be included alongside the default codes for restarting'
|
|
609
|
-
' conditions.'
|
|
610
|
-
),
|
|
611
|
-
)
|
|
612
|
-
custom_parser.add_argument(
|
|
613
|
-
'--ramdisk-directory',
|
|
614
|
-
type=str,
|
|
615
|
-
default='',
|
|
616
|
-
help=(
|
|
617
|
-
'The directory of the locally mounted RAM disk. This is only to'
|
|
618
|
-
' be used with the CSI driver provided by GKE.'
|
|
619
|
-
),
|
|
620
|
-
)
|
|
621
649
|
|
|
622
650
|
|
|
623
651
|
def add_shared_workload_create_env_arguments(args_parsers):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xpk
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
|
|
5
5
|
Author-email: XPK team <xpk-code-reviewers@google.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -88,9 +88,11 @@ and the following GPU types:
|
|
|
88
88
|
and the following CPU types:
|
|
89
89
|
* n2-standard-32
|
|
90
90
|
|
|
91
|
-
xpk also supports Google Cloud Storage solutions:
|
|
91
|
+
xpk also supports [Google Cloud Storage solutions](#storage):
|
|
92
92
|
* [Cloud Storage FUSE](#fuse)
|
|
93
93
|
* [Filestore](#filestore)
|
|
94
|
+
* [Parallelstore](#parallelstore)
|
|
95
|
+
* [Block storage (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
|
|
94
96
|
|
|
95
97
|
# Permissions needed on Cloud Console:
|
|
96
98
|
|
|
@@ -253,6 +255,7 @@ all zones.
|
|
|
253
255
|
--num-slices=4 --on-demand \
|
|
254
256
|
--tpu-type=v5litepod-16
|
|
255
257
|
```
|
|
258
|
+
Note that Pathways clusters need a CPU nodepool of n2-standard-64 or higher.
|
|
256
259
|
|
|
257
260
|
* Cluster Create for Ray:
|
|
258
261
|
A cluster with KubeRay enabled and a RayCluster can be created using `cluster create-ray`.
|
|
@@ -475,7 +478,11 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
|
|
|
475
478
|
|
|
476
479
|
|
|
477
480
|
## Storage
|
|
478
|
-
Currently XPK supports
|
|
481
|
+
Currently, XPK supports the following storage types:
|
|
482
|
+
- [Cloud Storage FUSE](#fuse)
|
|
483
|
+
- [Google Cloud Filestore](#filestore)
|
|
484
|
+
- [Google Cloud Parallelstore](#parallelstore)
|
|
485
|
+
- [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
|
|
479
486
|
|
|
480
487
|
### FUSE
|
|
481
488
|
A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
|
|
@@ -499,11 +506,12 @@ Parameters:
|
|
|
499
506
|
- `--readonly` - if set to true, workload can only read from storage.
|
|
500
507
|
- `--size` - size of the storage in Gb.
|
|
501
508
|
- `--bucket` - name of the storage bucket. If not set then the name of the storage is used as a bucket name.
|
|
509
|
+
- `--mount-options` - comma-separated list of additional mount options for PersistentVolume ([reference](https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-perf#mount-options)).
|
|
502
510
|
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions. If set, then values from the manifest override the following parameters: `--size` and `--bucket`.
|
|
503
511
|
|
|
504
512
|
### Filestore
|
|
505
513
|
|
|
506
|
-
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write
|
|
514
|
+
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
|
|
507
515
|
|
|
508
516
|
To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
|
|
509
517
|
|
|
@@ -537,6 +545,54 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
|
|
|
537
545
|
- `--instance` - the name of the Filestore instance. If not set then the name parameter is used as an instance name. Useful when connecting multiple volumes from the same Filestore instance.
|
|
538
546
|
- `--manifest` - path to the manifest file containing PersistentVolume, PersistentVolumeClaim and StorageClass definitions. If set, then values from the manifest override the following parameters: `--access-mode`, `--size` and `--volume`.
|
|
539
547
|
|
|
548
|
+
### Parallelstore
|
|
549
|
+
|
|
550
|
+
A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
|
|
551
|
+
|
|
552
|
+
To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
|
|
553
|
+
|
|
554
|
+
Once it's ready, you can use the `xpk storage attach` command with `--type=parallelstore` to attach a Parallelstore instance to your cluster. Currently, attaching a Parallelstore instance is supported only by providing a manifest file.
|
|
555
|
+
|
|
556
|
+
```shell
|
|
557
|
+
python3 xpk.py storage attach test-parallelstore-storage --type=parallelstore \
|
|
558
|
+
--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
|
|
559
|
+
--mount-point='/test-mount-point' --readonly=false \
|
|
560
|
+
--auto-mount=true \
|
|
561
|
+
--manifest='./examples/storage/parallelstore-manifest-attach.yaml'
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
Parameters:
|
|
565
|
+
|
|
566
|
+
- `--type` - type of the storage `parallelstore`
|
|
567
|
+
- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
|
|
568
|
+
- `--mount-point` - the path on which this storage should be mounted for a workload.
|
|
569
|
+
- `--readonly` - if set to true, workload can only read from storage.
|
|
570
|
+
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
571
|
+
|
|
572
|
+
### Block storage (Persistent Disk, Hyperdisk)
|
|
573
|
+
|
|
574
|
+
A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
|
|
575
|
+
|
|
576
|
+
To use a GCE PersistentDisk with XPK, you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Make sure that the disk type you create is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
|
|
577
|
+
|
|
578
|
+
Once it's ready, you can use the `xpk storage attach` command with `--type=pd` to attach a PersistentDisk instance to your cluster. Currently, attaching a PersistentDisk is supported only by providing a manifest file.
|
|
579
|
+
|
|
580
|
+
```shell
|
|
581
|
+
python3 xpk.py storage attach test-pd-storage --type=pd \
|
|
582
|
+
--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
|
|
583
|
+
--mount-point='/test-mount-point' --readonly=false \
|
|
584
|
+
--auto-mount=true \
|
|
585
|
+
--manifest='./examples/storage/pd-manifest-attach.yaml'
|
|
586
|
+
```
|
|
587
|
+
|
|
588
|
+
Parameters:
|
|
589
|
+
|
|
590
|
+
- `--type` - type of the storage `pd`
|
|
591
|
+
- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
|
|
592
|
+
- `--mount-point` - the path on which this storage should be mounted for a workload.
|
|
593
|
+
- `--readonly` - if set to true, workload can only read from storage.
|
|
594
|
+
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
|
|
595
|
+
|
|
540
596
|
### List attached storages
|
|
541
597
|
|
|
542
598
|
```shell
|