xpk-0.9.0-py3-none-any.whl → xpk-0.10.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +3 -3
- xpk/commands/cluster.py +22 -1
- xpk/commands/cluster_gcluster.py +27 -0
- xpk/commands/common.py +12 -5
- xpk/commands/kjob_common.py +4 -1
- xpk/commands/run.py +2 -2
- xpk/commands/shell.py +2 -2
- xpk/commands/storage.py +10 -3
- xpk/commands/workload.py +64 -27
- xpk/core/blueprint/blueprint_generator.py +108 -40
- xpk/core/capacity.py +66 -6
- xpk/core/cluster.py +165 -7
- xpk/core/config.py +1 -65
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +145 -72
- xpk/core/jobset.py +143 -0
- xpk/core/kjob.py +2 -6
- xpk/core/kueue.py +165 -5
- xpk/core/nodepool.py +17 -4
- xpk/core/pathways.py +1 -2
- xpk/core/storage.py +1 -95
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +0 -44
- xpk/core/workload_decorators/rdma_decorator.py +2 -0
- xpk/core/workload_decorators/tcpx_decorator.py +10 -4
- xpk/core/workload_decorators/tcpxo_decorator.py +7 -0
- xpk/parser/cluster.py +23 -7
- xpk/parser/storage.py +2 -2
- xpk/parser/workload.py +21 -3
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/METADATA +45 -6
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/RECORD +35 -34
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +0 -0
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.9.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
xpk/core/kueue.py
CHANGED

@@ -16,6 +16,7 @@ limitations under the License.
 
 from argparse import Namespace
 
+import math
 import packaging
 from packaging.version import Version
 
@@ -39,10 +40,12 @@ from .system_characteristics import (
     SystemCharacteristics,
 )
 
-KUEUE_VERSION = 'v0.
+KUEUE_VERSION = 'v0.12.2'
 CLUSTER_QUEUE_NAME = 'cluster-queue'
 LOCAL_QUEUE_NAME = 'multislice-queue'
 WAIT_FOR_KUEUE_TIMEOUT = '5m'
+MEMORY_SIZE_PER_VM = 1.2
+MIN_MEMORY_LIMIT_SIZE = 4096
 
 packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'
 
@@ -69,6 +72,26 @@ spec:
 {machine_label}
 {topology_label}
 ---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: AdmissionCheck
+metadata:
+  name: dws-prov
+spec:
+  controllerName: kueue.x-k8s.io/provisioning-request
+  parameters:
+    apiGroup: kueue.x-k8s.io
+    kind: ProvisioningRequestConfig
+    name: dws-config
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ProvisioningRequestConfig
+metadata:
+  name: dws-config
+spec:
+  provisioningClassName: queued-provisioning.gke.io
+  managedResources:
+  - {managed_resource}
+---
 {pw_resource_flavors}
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ClusterQueue
@@ -82,6 +105,7 @@ spec:
   resourceGroups:
 {covered_resources_config}
 {pw_resources_kueue}
+{admission_checks}
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
@@ -166,6 +190,99 @@ spec:
       command: [ "sleep", "inf" ]
 """
 
+kueue_controller_manager_yml = """
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app.kubernetes.io/component: controller
+    app.kubernetes.io/name: kueue
+    control-plane: controller-manager
+  name: kueue-controller-manager
+  namespace: kueue-system
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      control-plane: controller-manager
+  template:
+    metadata:
+      annotations:
+        kubectl.kubernetes.io/default-container: manager
+      labels:
+        app.kubernetes.io/component: controller
+        app.kubernetes.io/name: kueue
+        control-plane: controller-manager
+    spec:
+      containers:
+      - args:
+        - --config=/controller_manager_config.yaml
+        - --zap-log-level=2
+        command:
+        - /manager
+        image: registry.k8s.io/kueue/kueue:v0.10.0
+        imagePullPolicy: Always
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8081
+          initialDelaySeconds: 15
+          periodSeconds: 20
+        name: manager
+        ports:
+        - containerPort: 8082
+          name: visibility
+          protocol: TCP
+        - containerPort: 9443
+          name: webhook-server
+          protocol: TCP
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: 8081
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        resources:
+          limits:
+            cpu: 500m
+            memory: {memory_limit_size}
+          requests:
+            cpu: 500m
+            memory: 512Mi
+        securityContext:
+          allowPrivilegeEscalation: false
+        volumeMounts:
+        - mountPath: /tmp/k8s-webhook-server/serving-certs
+          name: cert
+          readOnly: true
+        - mountPath: /controller_manager_config.yaml
+          name: manager-config
+          subPath: controller_manager_config.yaml
+      - args:
+        - --secure-listen-address=0.0.0.0:8443
+        - --upstream=http://127.0.0.1:8080/
+        - --logtostderr=true
+        - --v=10
+        image: registry.k8s.io/kubebuilder/kube-rbac-proxy:v0.16.0
+        name: kube-rbac-proxy
+        ports:
+        - containerPort: 8443
+          name: https
+          protocol: TCP
+        securityContext:
+          runAsNonRoot: true
+      serviceAccountName: kueue-controller-manager
+      terminationGracePeriodSeconds: 10
+      volumes:
+      - name: cert
+        secret:
+          defaultMode: 420
+          secretName: kueue-webhook-server-cert
+      - configMap:
+          name: kueue-manager-config
+        name: manager-config
+"""
+
 
 def verify_kueuectl(args: Namespace) -> None:
   """Verify if kueuectl is installed.
@@ -282,6 +399,7 @@ def install_kueue_crs(
     args,
     system: SystemCharacteristics,
    autoprovisioning_config: AutoprovisioningConfig | None,
+    flex_with_tpu=False,
 ) -> int:
   """Install Kueue Custom Resources.
 
@@ -309,6 +427,13 @@ def install_kueue_crs(
   else:
     # Determine total chips based on user specified topology.
     total_chips = get_total_chips_requested_from_args(args, system)
+  if args.flex and flex_with_tpu is False:
+    admission_checks = """
+  admissionChecks:
+  - dws-prov
+  """
+  else:
+    admission_checks = ''
 
   covered_resources_config = get_kueue_covered_resources_config(
       cluster_hardware_name=cluster_hardware_name,
@@ -322,7 +447,9 @@ def install_kueue_crs(
       B200_DEVICE_TYPE,
   ]:
     topology_label = 'topologyName: "gke-default"'
-
+  res_type = AcceleratorTypeToAcceleratorCharacteristics[
+      system.accelerator_type
+  ].resource_type
   yml_string = cluster_set_crd_yaml.format(
       system=system,
       cluster_hardware_name=cluster_hardware_name,
@@ -334,11 +461,11 @@ def install_kueue_crs(
       ),
       topology_label=topology_label,
       covered_resources_config=covered_resources_config,
-      resource_type=AcceleratorTypeToAcceleratorCharacteristics[
-          system.accelerator_type
-      ].resource_type,
+      resource_type=res_type,
       pw_resource_flavors=add_pw_resource_flavors(args),
       pw_resources_kueue=add_pw_resources_to_kueue(args),
+      admission_checks=admission_checks,
+      managed_resource=res_type,
       cluster_queue_name=CLUSTER_QUEUE_NAME,
       local_queue_name=LOCAL_QUEUE_NAME,
   )
@@ -386,3 +513,36 @@ def get_kueue_covered_resources_config(
       total_chips=total_chips,
   )
   return config_string
+
+
+def update_kueue_resources_if_necessary(args):
+  """Update the kueue manifest to increase the resources for the kueue controller manager.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  # Get total number of nodes
+  cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
+  return_code, out = run_command_for_value(
+      cmd_total_node_num, 'Count total nodes', args
+  )
+  if return_code != 0:
+    xpk_exit(1)
+  # 1.2MiB per VM or 4GiB (whichever is greater).
+  new_memory_limit = (
+      f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi'
+  )
+  yml_string = kueue_controller_manager_yml.format(
+      memory_limit_size=new_memory_limit,
+  )
+  tmp = write_tmp_file(yml_string)
+  command = f'kubectl apply -f {str(tmp.file.name)}'
+
+  task = 'Updating Kueue Controller Manager resources'
+  return_code = run_command_with_updates_retry(command, task, args)
+  if return_code != 0:
+    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
xpk/core/nodepool.py
CHANGED

@@ -77,8 +77,12 @@ def run_gke_node_pool_create_command(
   if return_code > 0:
     xpk_print('Listing all reservations failed!')
     return_code = 1
+  if system.accelerator_type == AcceleratorType['TPU']:
+    max_nodes = system.vms_per_slice
+  else:
+    max_nodes = 1000
   capacity_args, return_code = get_capacity_arguments_from_capacity_type(
-      args, capacity_type
+      args, capacity_type, max_nodes
   )
   if return_code > 0:
     xpk_print('Parsing capacity arguments failed!')
@@ -275,7 +279,10 @@ def run_gke_node_pool_create_command(
   )
   if system.accelerator_type == AcceleratorType['TPU']:
     command += f' --node-version={gke_node_pool_version}'
-
+    if capacity_type == CapacityType.FLEX_START:
+      command += ' --num-nodes=0'
+    else:
+      command += f' --num-nodes={system.vms_per_slice}'
     command += ' --placement-type=COMPACT --max-pods-per-node 15'
     command += (
         f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
@@ -284,7 +291,10 @@ def run_gke_node_pool_create_command(
     command += f' {args.custom_tpu_nodepool_arguments}'
   elif system.accelerator_type == AcceleratorType['GPU']:
     subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
-
+    if capacity_type == CapacityType.FLEX_START:
+      command += ' --num-nodes=0'
+    else:
+      command += f' --num-nodes={args.num_nodes}'
     command += (
         ' --accelerator'
         f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
@@ -298,7 +308,10 @@ def run_gke_node_pool_create_command(
     )
     command += ' --max-pods-per-node=32'
   elif system.accelerator_type == AcceleratorType['CPU']:
-
+    if capacity_type == CapacityType.FLEX_START:
+      command += ' --num-nodes=0'
+    else:
+      command += f' --num-nodes={system.vms_per_slice}'
     command += (
         f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
     )
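
The same branch repeats for TPU, GPU, and CPU pools: a DWS Flex Start pool must be created empty so GKE can scale it up only when a ProvisioningRequest is admitted. A condensed, self-contained sketch of the shared rule (the enum below is a stand-in for xpk's own CapacityType; the helper name is illustrative):

from enum import Enum


class CapacityType(Enum):  # stand-in for xpk's own enum
  FLEX_START = 'flex_start'
  ON_DEMAND = 'on_demand'


def num_nodes_flag(capacity_type: CapacityType, static_size: int) -> str:
  # Flex-start (DWS) pools start at zero nodes; GKE resizes them when
  # capacity is granted. All other capacity types get their static size.
  if capacity_type == CapacityType.FLEX_START:
    return ' --num-nodes=0'
  return f' --num-nodes={static_size}'
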
xpk/core/pathways.py
CHANGED

@@ -19,8 +19,7 @@ from ..core.docker_container import get_user_workload_container
 from ..core.gcloud_context import zone_to_region
 from ..core.nodepool import get_all_nodepools_programmatic
 from ..utils.console import xpk_exit, xpk_print
-from .
-from .system_characteristics import SystemCharacteristics
+from .system_characteristics import AcceleratorType, SystemCharacteristics
 
 
 def add_pw_resource_flavors(args):
xpk/core/storage.py
CHANGED

@@ -46,6 +46,7 @@ STORAGE_CRD_NAME = f"{XPK_API_GROUP_NAME}.{STORAGE_CRD_PLURAL}"
 GCS_FUSE_TYPE = "gcsfuse"
 GCP_FILESTORE_TYPE = "gcpfilestore"
 PARALLELSTORE_TYPE = "parallelstore"
+LUSTRE_TYPE = "lustre"
 GCE_PD_TYPE = "pd"
 MANIFESTS_PATH = os.path.abspath("xpkclusters/storage-manifests")
 GCS_FUSE_ANNOTATIONS = {
@@ -365,101 +366,6 @@ def get_storage_annotations(storages: list[Storage]) -> list[str]:
   return annotations
 
 
-def get_storage_volume_mounts_yaml(storages: list[Storage]) -> str:
-  """
-  Generates the YAML representation of the volumeMounts section for the given Storages.
-
-  This function creates the YAML snippet that defines how the storage volumes
-  should be mounted within a Pod's containers.
-
-  Args:
-    storages: A list of Storage objects.
-
-  Returns:
-    A string containing the YAML representation of the volumeMounts section.
-  """
-  yaml_str = ""
-  for storage in storages:
-    yaml_str += f"""- name: {storage.pv}
-  mountPath: {storage.mount_point}
-  readOnly: {storage.readonly}
-"""
-  return yaml_str
-
-
-def get_storage_volumes_yaml(storages: list[Storage]) -> str:
-  """
-  Generates the YAML representation of the volumes section for the given Storages.
-
-  This function creates the YAML snippet that defines the volumes to be
-  mounted in a Pod, including the PersistentVolumeClaim associated with
-  each Storage.
-
-  Args:
-    storages: A list of Storage objects.
-
-  Returns:
-    A string containing the YAML representation of the volumes section.
-  """
-  yaml_str = ""
-  for storage in storages:
-    yaml_str += f"""- name: {storage.pv}
-  persistentVolumeClaim:
-    claimName: {storage.pvc}
-  readOnly: {storage.readonly}
-"""
-  return yaml_str
-
-
-def get_storage_volume_mounts_for_gpu(
-    storages: list[Storage],
-) -> list[dict]:
-  """
-  Generates the YAML representation of the volumeMounts section for the given Storages.
-
-  This function creates the list of storage specifications that define how the storage volumes
-  should be mounted within a Pod's containers.
-
-  Args:
-    storages: A list of Storage objects.
-
-  Returns:
-    A list containing the dictionary representation of the volumeMounts section.
-  """
-  return [
-      {
-          "name": storage.pv,
-          "mountPath": storage.mount_point,
-          "readOnly": storage.readonly,
-      }
-      for storage in storages
-  ]
-
-
-def get_storage_volumes_yaml_for_gpu(storages: list[Storage]) -> str:
-  """
-  Generates the YAML representation of the volumes section for the given Storages.
-
-  This function creates the YAML snippet that defines the volumes to be
-  mounted in a Pod, including the PersistentVolumeClaim associated with
-  each Storage.
-
-  Args:
-    storages: A list of Storage objects.
-
-  Returns:
-    A string containing the YAML representation of the volumes section.
-  """
-  yaml_str = ""
-  for storage in storages:
-    yaml_str += f"""- name: {storage.pv}
-  persistentVolumeClaim:
-    claimName: {storage.pvc}
-  readOnly: {storage.readonly}
-"""
-  return yaml_str
-
-
 def get_storage_volumes_yaml_dict(storages: list[Storage]) -> list[dict]:
   vols = []
   for storage in storages:
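
The string-templating helpers removed here duplicated what the dict-based `get_storage_volumes_yaml_dict` path already provides. For orientation, a sketch of the dict shape involved (field names taken from the removed code; the helper name is illustrative, and readOnly is placed inside persistentVolumeClaim per the Kubernetes schema rather than at the volume level as the removed YAML had it):

def volumes_for(storages) -> list[dict]:
  # One PVC-backed volume per Storage, built from the same pv/pvc/readonly
  # fields the removed string-templating helpers rendered.
  return [
      {
          'name': s.pv,
          'persistentVolumeClaim': {'claimName': s.pvc, 'readOnly': s.readonly},
      }
      for s in storages
  ]
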
xpk/core/workload.py
CHANGED

@@ -14,18 +14,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-import yaml
-
-from ..utils import templates
 from ..utils.console import xpk_exit, xpk_print
-from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE
 from .commands import run_command_for_value
 from .gcloud_context import zone_to_region
-from .storage import Storage, get_storage_volume_mounts_for_gpu
-from .system_characteristics import SystemCharacteristics
-
-RXDM_CONTAINER_A3HIGH_PATH = '/../templates/rxdm_container_a3high.yaml'
-RXDM_CONTAINER_A3MEGA_PATH = '/../templates/rxdm_container_a3mega.yaml'
 
 
 def workload_list_awk_command(filter_key) -> str:
@@ -249,38 +240,3 @@ def wait_for_job_completion(args) -> int:
     xpk_print('Your workload did not complete successfully')
     return 125
   return 0
-
-
-def add_gpu_rxdm_container(
-    jobset_manifest_str: str,
-    system: SystemCharacteristics,
-    all_storages: list[Storage],
-) -> str:
-  """Add gpu rxdm container to jobset manifest based on user provided arguments.
-
-  Args:
-    jobset_manifest_str: the JobSet manifest as a YAML string.
-    system: system characteristics.
-    all_storages: list of all storages.
-
-  Returns:
-    str: the modified JobSet manifest as a YAML string.
-  """
-  if system.device_type == H100_DEVICE_TYPE:
-    gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3HIGH_PATH)
-  elif system.device_type == H100_MEGA_DEVICE_TYPE:
-    gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3MEGA_PATH)
-  else:
-    return jobset_manifest_str
-
-  storage_volume_mounts = get_storage_volume_mounts_for_gpu(all_storages)
-  gpu_rxdm_container['volumeMounts'].extend(storage_volume_mounts)
-
-  manifest = yaml.safe_load(jobset_manifest_str)
-
-  for job in manifest['spec']['replicatedJobs']:
-    job['template']['spec']['template']['spec']['containers'].append(
-        gpu_rxdm_container
-    )
-
-  return yaml.dump(manifest, sort_keys=False)
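
The removed helper's pattern — load a sidecar container spec and append it to every replicated job's Pod template — presumably moves with the new xpk/core/jobset.py (+143) listed above. For reference, the core manipulation in isolation (a sketch of the removed logic, not of the new implementation):

import yaml


def append_sidecar(jobset_manifest_str: str, container: dict) -> str:
  # Append a sidecar container to each replicated job's Pod template,
  # as the removed add_gpu_rxdm_container did.
  manifest = yaml.safe_load(jobset_manifest_str)
  for job in manifest['spec']['replicatedJobs']:
    job['template']['spec']['template']['spec']['containers'].append(container)
  return yaml.dump(manifest, sort_keys=False)
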
xpk/core/workload_decorators/rdma_decorator.py
CHANGED

@@ -80,6 +80,8 @@ def add_annotations(job_manifest: dict, sub_networks: list[str]):
   """Adds or updates annotations in the Pod template."""
   annotations = job_manifest['spec']['template']['metadata']['annotations']
   interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
+  if annotations is None:
+    annotations = {}
   annotations.update({
       'networking.gke.io/default-interface': 'eth0',
       interfaces_key: interfaces_value,
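
A caveat on the guard above (an observation from the hunk alone, hedged since the rest of the function is not visible): rebinding the local name `annotations` to a fresh dict does not write that dict back into `job_manifest`, so when the manifest's annotations field is an explicit null, the subsequent `update()` mutates a dict the Pod template never sees. A write-back form avoids that:

metadata = job_manifest['spec']['template']['metadata']
if metadata.get('annotations') is None:
  metadata['annotations'] = {}  # attach the dict to the manifest first
annotations = metadata['annotations']
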
xpk/core/workload_decorators/tcpx_decorator.py
CHANGED

@@ -25,7 +25,7 @@ tcpx = 'v2.0.11'
 def decorate_kjob_template(job_manifest: dict) -> dict:
   add_volumes(job_manifest)
   add_tolerations(job_manifest)
-
+  add_tcpx_daemon_container(job_manifest)
   update_gpu_containers(job_manifest)
   return job_manifest
 
@@ -34,7 +34,7 @@ def decorate_job(job_manifest: dict) -> dict:
   add_annotations(job_manifest)
   add_volumes(job_manifest)
   add_tolerations(job_manifest)
-
+  add_tcpx_daemon_container(job_manifest)
   update_gpu_containers(job_manifest)
   return job_manifest
 
@@ -131,10 +131,13 @@ def add_volumes(job_manifest: dict):
   })
   volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
   volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
+  volumes.append(
+      {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
+  )
 
 
-def
-"""Adds the
+def add_tcpx_daemon_container(job_manifest):
+  """Adds the tcpx-daemon container to the Pod spec."""
   tcpxo_daemon_container = {
       'name': 'tcpx-daemon',
       'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{tcpx}',
@@ -177,3 +180,6 @@ def update_gpu_containers(job_manifest):
     volumeMounts.append(
         {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'}
     )
+    container['volumeMounts'].append(
+        {'name': 'dshm', 'mountPath': '/dev/shm'}
+    )
xpk/core/workload_decorators/tcpxo_decorator.py
CHANGED

@@ -15,6 +15,7 @@ limitations under the License.
 """
 
 import yaml
+
 from ...utils.yaml import literal_string
 
 # Component version
@@ -141,6 +142,9 @@ def add_volumes(job_manifest):
       'name': 'aperture-devices',
       'hostPath': {'path': '/dev/aperture_devices'},
   })
+  volumes.append(
+      {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
+  )
 
 
 def add_tcpxo_daemon_container(job_manifest):
@@ -189,3 +193,6 @@ def update_gpu_containers(job_manifest):
     container['volumeMounts'].append(
         {'name': 'libraries', 'mountPath': '/usr/local/nvidia'}
     )
+    container['volumeMounts'].append(
+        {'name': 'dshm', 'mountPath': '/dev/shm'}
+    )
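
Both the tcpx and tcpxo decorators now mount a memory-backed emptyDir at /dev/shm in every GPU container — the usual workaround for the container runtime's small default /dev/shm (64Mi), which NCCL's shared-memory transport can exhaust. The pair of dicts involved, in isolation:

# Volume added to the Pod spec: RAM-backed, capped at 128Gi.
dshm_volume = {
    'name': 'dshm',
    'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'},
}
# Mount added to each GPU container.
dshm_mount = {'name': 'dshm', 'mountPath': '/dev/shm'}
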
xpk/parser/cluster.py
CHANGED

@@ -743,6 +743,11 @@ def add_driver_arguments(parser: ArgumentParser):
       action='store_true',
       help='Enable PersistentDisk CSI driver on the cluster.',
   )
+  parser.add_argument(
+      '--enable-lustre-csi-driver',
+      action='store_true',
+      help='Enable Lustre CSI driver on the cluster.',
+  )
 
 
 def add_shared_cluster_create_tensorboard_arguments(parser: ArgumentParser):
@@ -792,25 +797,36 @@ def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
       '--on-demand',
       action='store_true',
       help=(
-          'Sets node pool creation to use on-demand resources.
-          '
+          'Sets node pool creation to use on-demand resources. See'
+          ' `--reservation`, `--flex` or `--spot` for other capacity'
+          ' types.'
       ),
   )
   parser.add_argument(
       '--reservation',
       type=str,
       help=(
-          'The reservation to be used for acquiring resources in the'
-          '
-          '
+          'The reservation to be used for acquiring resources in the cluster.'
+          ' This will attempt to find the provided reservation. See `--spot`,'
+          ' `--flex` or `--on-demand` for other capacity types.'
       ),
   )
   parser.add_argument(
       '--spot',
       action='store_true',
       help=(
-          'Sets node pool creation to use spot resources.'
-          '
+          'Sets node pool creation to use spot resources. See'
+          ' `--reservation`, `--flex` or `--on-demand` for other'
+          ' capacity types.'
+      ),
+  )
+  parser.add_argument(
+      '--flex',
+      action='store_true',
+      help=(
+          'Sets node pool creation to use DWS Flex Start resources. See'
+          ' `--reservation`, `--on-demand` or `--spot` for other capacity'
+          ' types.'
       ),
   )
 
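
With the new flag wired in, a DWS Flex Start cluster create might look roughly like this (illustrative invocation; cluster name and device shape are placeholders):

xpk cluster create \
  --cluster my-flex-cluster \
  --tpu-type=v5litepod-16 \
  --num-slices=1 \
  --flex
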
xpk/parser/storage.py
CHANGED

@@ -71,9 +71,9 @@ def add_storage_attach_parser(
       type=str,
       help=(
           'The type of storage. Currently supported types: "gcsfuse",'
-          ' "gcpfilestore", "parallelstore", "pd"'
+          ' "gcpfilestore", "parallelstore", "pd", "lustre"'
       ),
-      choices=['gcsfuse', 'gcpfilestore', 'parallelstore', 'pd'],
+      choices=['gcsfuse', 'gcpfilestore', 'parallelstore', 'pd', 'lustre'],
       required=True,
   )
   add_cluster_arguments(req_args, required=True)
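
Attaching a Lustre instance then follows the same shape as the other backends (illustrative; only the flags visible in this diff are shown, the remaining required attach flags are elided):

xpk storage attach my-lustre-storage \
  --type=lustre \
  --cluster=my-cluster \
  ...
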
xpk/parser/workload.py
CHANGED

@@ -208,15 +208,25 @@ def set_workload_parsers(workload_parser):
       help=(
           'Sets autoprovisioning to use reservation resources for the workload'
           ' request. This will attempt to find the provided reservation. See'
-          ' `--spot` or `--on-demand` for other capacity types.'
+          ' `--spot`, `--flex` or `--on-demand` for other capacity types.'
       ),
   )
   workload_create_autoprovisioning_arguments.add_argument(
       '--spot',
       action='store_true',
       help=(
-          'Sets autoprovisioning to use spot resources.'
-          '
+          'Sets autoprovisioning to use spot resources. See `--reservation`,'
+          ' `--flex` or `--on-demand` for other capacity types.'
+      ),
+  )
+
+  workload_create_autoprovisioning_arguments.add_argument(
+      '--flex',
+      action='store_true',
+      help=(
+          'Sets autoprovisioning to use flex-start resources. See'
+          ' `--reservation`, `--spot` or `--on-demand` for other capacity'
+          ' types.'
       ),
   )
 
@@ -728,6 +738,14 @@ def add_shared_workload_docker_image_arguments(args_parsers):
           ' directly by the xpk workload.'
       ),
   )
+  custom_parser.add_argument(
+      '--docker-image-pull-secret',
+      type=str,
+      help=(
+          'Name of the secret that will be used to pull image from'
+          ' private repository'
+      ),
+  )
 
 
 def add_shared_workload_create_tensorboard_arguments(args_parsers):
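
The new flag only references a secret; the secret itself must already exist in the cluster. A typical flow (illustrative names; the kubectl command is the standard way to create a docker-registry secret):

kubectl create secret docker-registry my-pull-secret \
  --docker-server=my-registry.example.com \
  --docker-username=my-user \
  --docker-password="$(cat password.txt)"

xpk workload create \
  --cluster=my-cluster \
  --workload=my-workload \
  --docker-image=my-registry.example.com/my-image:latest \
  --docker-image-pull-secret=my-pull-secret \
  --command="python3 train.py"
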
|