xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +33 -12
- xpk/commands/cluster_gcluster_test.py +5 -1
- xpk/commands/cluster_test.py +125 -0
- xpk/commands/config.py +3 -3
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +2 -0
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/workload.py +124 -139
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +3 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +2 -0
- xpk/core/cluster.py +18 -47
- xpk/core/cluster_test.py +76 -1
- xpk/core/config.py +81 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/kjob.py +17 -16
- xpk/core/kueue_manager.py +13 -19
- xpk/core/kueue_manager_test.py +27 -1
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +17 -15
- xpk/core/nodepool_test.py +25 -4
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +128 -132
- xpk/core/scheduling_test.py +215 -2
- xpk/core/system_characteristics.py +179 -0
- xpk/core/system_characteristics_test.py +49 -1
- xpk/core/telemetry.py +4 -4
- xpk/core/telemetry_test.py +9 -9
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +2 -0
- xpk/parser/cluster.py +22 -88
- xpk/parser/cluster_test.py +41 -0
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -41
- xpk/parser/workload_test.py +2 -48
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/utils/feature_flags.py +3 -0
- xpk/utils/validation.py +2 -2
- xpk-0.16.0.dist-info/METADATA +127 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
- xpk-0.15.0.dist-info/METADATA +0 -1666
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
integration/README.md
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
This folder contains integration tests.
|
|
2
|
+
|
|
3
|
+
To run them env variables are needed:
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
export PROJECT_ID=...
|
|
7
|
+
export REGION=...
|
|
8
|
+
export ZONE=...
|
|
9
|
+
export AUTH_CIDR=...
|
|
10
|
+
export DEPLOYMENT_DIR=...
|
|
11
|
+
export CLUSTER_NAME=...
|
|
12
|
+
export GCLOUD_CFG_PATH=...
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
To run tests:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pytest src/integration
|
|
19
|
+
```
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
kind: ConfigMap
|
|
2
|
+
apiVersion: v1
|
|
3
|
+
metadata:
|
|
4
|
+
name: ${resource_config_name}
|
|
5
|
+
data:
|
|
6
|
+
h100-mega-80gb-8: "${num_nodes}"
|
|
7
|
+
---
|
|
8
|
+
kind: ConfigMap
|
|
9
|
+
apiVersion: v1
|
|
10
|
+
metadata:
|
|
11
|
+
name: ${cluster_config_name}
|
|
12
|
+
data:
|
|
13
|
+
capacity_type: "${capacity_type}"
|
|
14
|
+
reservation_id: "${reservation}"
|
|
15
|
+
provisioner: gcluster
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
apiVersion: apiextensions.k8s.io/v1
|
|
2
|
+
kind: CustomResourceDefinition
|
|
3
|
+
metadata:
|
|
4
|
+
name: storages.xpk.x-k8s.io
|
|
5
|
+
spec:
|
|
6
|
+
group: xpk.x-k8s.io
|
|
7
|
+
versions:
|
|
8
|
+
- name: v1
|
|
9
|
+
served: true
|
|
10
|
+
storage: true
|
|
11
|
+
schema:
|
|
12
|
+
openAPIV3Schema:
|
|
13
|
+
type: object
|
|
14
|
+
properties:
|
|
15
|
+
spec:
|
|
16
|
+
type: object
|
|
17
|
+
properties:
|
|
18
|
+
type:
|
|
19
|
+
type: string
|
|
20
|
+
cluster:
|
|
21
|
+
type: string
|
|
22
|
+
auto_mount:
|
|
23
|
+
type: boolean
|
|
24
|
+
mount_point:
|
|
25
|
+
type: string
|
|
26
|
+
readonly:
|
|
27
|
+
type: boolean
|
|
28
|
+
manifest:
|
|
29
|
+
type: string
|
|
30
|
+
pv:
|
|
31
|
+
type: string
|
|
32
|
+
pvc:
|
|
33
|
+
type: string
|
|
34
|
+
required:
|
|
35
|
+
- type
|
|
36
|
+
- cluster
|
|
37
|
+
- auto_mount
|
|
38
|
+
- mount_point
|
|
39
|
+
- readonly
|
|
40
|
+
- manifest
|
|
41
|
+
- pvc
|
|
42
|
+
- pv
|
|
43
|
+
x-kubernetes-validations:
|
|
44
|
+
- message: Value is immutable
|
|
45
|
+
rule: self == oldSelf
|
|
46
|
+
scope: Cluster
|
|
47
|
+
names:
|
|
48
|
+
plural: storages
|
|
49
|
+
singular: storage
|
|
50
|
+
kind: Storage
|
|
51
|
+
shortNames:
|
|
52
|
+
- stg
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
kind: ConfigMap
|
|
2
|
+
apiVersion: v1
|
|
3
|
+
metadata:
|
|
4
|
+
name: ${resource_config_name}
|
|
5
|
+
data:
|
|
6
|
+
h200-141gb-8: "${num_nodes}"
|
|
7
|
+
---
|
|
8
|
+
kind: ConfigMap
|
|
9
|
+
apiVersion: v1
|
|
10
|
+
metadata:
|
|
11
|
+
name: ${cluster_config_name}
|
|
12
|
+
data:
|
|
13
|
+
capacity_type: "${capacity_type}"
|
|
14
|
+
reservation_id: "${reservation}"
|
|
15
|
+
provisioner: gcluster
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright 2024 Google Inc. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
apiVersion: apps/v1
|
|
16
|
+
kind: DaemonSet
|
|
17
|
+
metadata:
|
|
18
|
+
name: disable-mglru
|
|
19
|
+
namespace: kube-system
|
|
20
|
+
spec:
|
|
21
|
+
selector:
|
|
22
|
+
matchLabels:
|
|
23
|
+
app: disable-mglru
|
|
24
|
+
template:
|
|
25
|
+
metadata:
|
|
26
|
+
labels:
|
|
27
|
+
app: disable-mglru
|
|
28
|
+
spec:
|
|
29
|
+
hostNetwork: true
|
|
30
|
+
tolerations:
|
|
31
|
+
- operator: "Exists"
|
|
32
|
+
key: nvidia.com/gpu
|
|
33
|
+
containers:
|
|
34
|
+
- name: disable-mglru
|
|
35
|
+
image: alpine:latest
|
|
36
|
+
command: ["/bin/sh"]
|
|
37
|
+
securityContext:
|
|
38
|
+
privileged: true
|
|
39
|
+
args:
|
|
40
|
+
- -c
|
|
41
|
+
- |
|
|
42
|
+
echo n | tee /sys/kernel/mm/lru_gen/enabled
|
|
43
|
+
sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
|
|
44
|
+
sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
|
|
45
|
+
sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
|
|
46
|
+
sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
|
|
47
|
+
sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
|
|
48
|
+
sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
|
|
49
|
+
sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
|
|
50
|
+
sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
|
|
51
|
+
sleep infinity
|
|
52
|
+
volumeMounts:
|
|
53
|
+
- name: sys-kernel-mm-lru-gen
|
|
54
|
+
mountPath: /sys/kernel/mm/lru_gen
|
|
55
|
+
# Remount sysfs so that it will be writable.
|
|
56
|
+
volumes:
|
|
57
|
+
- name: sys-kernel-mm-lru-gen
|
|
58
|
+
hostPath:
|
|
59
|
+
path: /sys/kernel/mm/lru_gen
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# Copyright 2024 Google Inc. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
apiVersion: apps/v1
|
|
16
|
+
kind: DaemonSet
|
|
17
|
+
metadata:
|
|
18
|
+
name: nccl-rdma-installer
|
|
19
|
+
namespace: kube-system
|
|
20
|
+
labels:
|
|
21
|
+
k8s-app: nccl-rdma-installer
|
|
22
|
+
spec:
|
|
23
|
+
selector:
|
|
24
|
+
matchLabels:
|
|
25
|
+
k8s-app: nccl-rdma-installer
|
|
26
|
+
updateStrategy:
|
|
27
|
+
type: RollingUpdate
|
|
28
|
+
template:
|
|
29
|
+
metadata:
|
|
30
|
+
labels:
|
|
31
|
+
name: nccl-rdma-installer
|
|
32
|
+
k8s-app: nccl-rdma-installer
|
|
33
|
+
spec:
|
|
34
|
+
priorityClassName: system-node-critical
|
|
35
|
+
affinity:
|
|
36
|
+
nodeAffinity:
|
|
37
|
+
requiredDuringSchedulingIgnoredDuringExecution:
|
|
38
|
+
nodeSelectorTerms:
|
|
39
|
+
- matchExpressions:
|
|
40
|
+
- key: cloud.google.com/gke-accelerator
|
|
41
|
+
operator: In
|
|
42
|
+
values:
|
|
43
|
+
- nvidia-h200-141gb
|
|
44
|
+
tolerations:
|
|
45
|
+
- operator: "Exists"
|
|
46
|
+
hostNetwork: true
|
|
47
|
+
hostPID: true
|
|
48
|
+
volumes:
|
|
49
|
+
- name: library-dir-host
|
|
50
|
+
hostPath:
|
|
51
|
+
path: /home/kubernetes/bin/nvidia/lib64
|
|
52
|
+
type: DirectoryOrCreate
|
|
53
|
+
- name: gib
|
|
54
|
+
hostPath:
|
|
55
|
+
path: /home/kubernetes/bin/gib
|
|
56
|
+
initContainers:
|
|
57
|
+
- name: disable-log-martian
|
|
58
|
+
image: alpine:latest
|
|
59
|
+
command: ["/bin/sh"]
|
|
60
|
+
securityContext:
|
|
61
|
+
privileged: true
|
|
62
|
+
args:
|
|
63
|
+
- -c
|
|
64
|
+
- |
|
|
65
|
+
sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
|
|
66
|
+
sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
|
|
67
|
+
sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
|
|
68
|
+
sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
|
|
69
|
+
sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
|
|
70
|
+
sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
|
|
71
|
+
sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
|
|
72
|
+
sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
|
|
73
|
+
- image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3
|
|
74
|
+
name: nccl-rdma-installer
|
|
75
|
+
resources:
|
|
76
|
+
requests:
|
|
77
|
+
cpu: 150m
|
|
78
|
+
securityContext:
|
|
79
|
+
privileged: true
|
|
80
|
+
volumeMounts:
|
|
81
|
+
- name: library-dir-host
|
|
82
|
+
mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
|
|
83
|
+
- name: gib
|
|
84
|
+
mountPath: /usr/local/home/kubernetes/bin/gib
|
|
85
|
+
command: ["/bin/sh", "-c"]
|
|
86
|
+
args:
|
|
87
|
+
- |
|
|
88
|
+
set -ex
|
|
89
|
+
/scripts/container_entry.sh install --install-nccl
|
|
90
|
+
cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
|
|
91
|
+
cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
|
|
92
|
+
echo "installation finishes"
|
|
93
|
+
containers:
|
|
94
|
+
- image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
|
|
95
|
+
name: pause
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
apiVersion: apiextensions.k8s.io/v1
|
|
2
|
+
kind: CustomResourceDefinition
|
|
3
|
+
metadata:
|
|
4
|
+
name: storages.xpk.x-k8s.io
|
|
5
|
+
spec:
|
|
6
|
+
group: xpk.x-k8s.io
|
|
7
|
+
versions:
|
|
8
|
+
- name: v1
|
|
9
|
+
served: true
|
|
10
|
+
storage: true
|
|
11
|
+
schema:
|
|
12
|
+
openAPIV3Schema:
|
|
13
|
+
type: object
|
|
14
|
+
properties:
|
|
15
|
+
spec:
|
|
16
|
+
type: object
|
|
17
|
+
properties:
|
|
18
|
+
type:
|
|
19
|
+
type: string
|
|
20
|
+
cluster:
|
|
21
|
+
type: string
|
|
22
|
+
auto_mount:
|
|
23
|
+
type: boolean
|
|
24
|
+
mount_point:
|
|
25
|
+
type: string
|
|
26
|
+
readonly:
|
|
27
|
+
type: boolean
|
|
28
|
+
manifest:
|
|
29
|
+
type: string
|
|
30
|
+
pv:
|
|
31
|
+
type: string
|
|
32
|
+
pvc:
|
|
33
|
+
type: string
|
|
34
|
+
required:
|
|
35
|
+
- type
|
|
36
|
+
- cluster
|
|
37
|
+
- auto_mount
|
|
38
|
+
- mount_point
|
|
39
|
+
- readonly
|
|
40
|
+
- manifest
|
|
41
|
+
- pvc
|
|
42
|
+
- pv
|
|
43
|
+
x-kubernetes-validations:
|
|
44
|
+
- message: Value is immutable
|
|
45
|
+
rule: self == oldSelf
|
|
46
|
+
scope: Cluster
|
|
47
|
+
names:
|
|
48
|
+
plural: storages
|
|
49
|
+
singular: storage
|
|
50
|
+
kind: Storage
|
|
51
|
+
shortNames:
|
|
52
|
+
- stg
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
kind: ConfigMap
|
|
2
|
+
apiVersion: v1
|
|
3
|
+
metadata:
|
|
4
|
+
name: ${resource_config_name}
|
|
5
|
+
data:
|
|
6
|
+
b200-8: "${num_nodes}"
|
|
7
|
+
---
|
|
8
|
+
kind: ConfigMap
|
|
9
|
+
apiVersion: v1
|
|
10
|
+
metadata:
|
|
11
|
+
name: ${cluster_config_name}
|
|
12
|
+
data:
|
|
13
|
+
capacity_type: "${capacity_type}"
|
|
14
|
+
reservation_id: "${reservation}"
|
|
15
|
+
provisioner: gcluster
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
apiVersion: apps/v1
|
|
2
|
+
kind: DaemonSet
|
|
3
|
+
metadata:
|
|
4
|
+
name: nccl-rdma-installer
|
|
5
|
+
namespace: kube-system
|
|
6
|
+
labels:
|
|
7
|
+
k8s-app: nccl-rdma-installer
|
|
8
|
+
spec:
|
|
9
|
+
selector:
|
|
10
|
+
matchLabels:
|
|
11
|
+
k8s-app: nccl-rdma-installer
|
|
12
|
+
updateStrategy:
|
|
13
|
+
type: RollingUpdate
|
|
14
|
+
template:
|
|
15
|
+
metadata:
|
|
16
|
+
labels:
|
|
17
|
+
name: nccl-rdma-installer
|
|
18
|
+
k8s-app: nccl-rdma-installer
|
|
19
|
+
spec:
|
|
20
|
+
priorityClassName: system-node-critical
|
|
21
|
+
affinity:
|
|
22
|
+
nodeAffinity:
|
|
23
|
+
requiredDuringSchedulingIgnoredDuringExecution:
|
|
24
|
+
nodeSelectorTerms:
|
|
25
|
+
- matchExpressions:
|
|
26
|
+
- key: cloud.google.com/gke-accelerator
|
|
27
|
+
operator: In
|
|
28
|
+
values:
|
|
29
|
+
- nvidia-b200
|
|
30
|
+
tolerations:
|
|
31
|
+
- operator: "Exists"
|
|
32
|
+
hostNetwork: true
|
|
33
|
+
hostPID: true
|
|
34
|
+
volumes:
|
|
35
|
+
- name: library-dir-host
|
|
36
|
+
hostPath:
|
|
37
|
+
path: /home/kubernetes/bin/nvidia/lib64
|
|
38
|
+
type: DirectoryOrCreate
|
|
39
|
+
- name: gib
|
|
40
|
+
hostPath:
|
|
41
|
+
path: /home/kubernetes/bin/gib
|
|
42
|
+
initContainers:
|
|
43
|
+
- image: us-docker.pkg.dev/kernel-net-team/clouda4-nccl-dev/nccl-plugin-gib-diagnostic:v1.0.3-b200
|
|
44
|
+
name: nccl-rdma-installer
|
|
45
|
+
resources:
|
|
46
|
+
requests:
|
|
47
|
+
cpu: 150m
|
|
48
|
+
securityContext:
|
|
49
|
+
privileged: true
|
|
50
|
+
volumeMounts:
|
|
51
|
+
- name: library-dir-host
|
|
52
|
+
mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
|
|
53
|
+
- name: gib
|
|
54
|
+
mountPath: /usr/local/home/kubernetes/bin/gib
|
|
55
|
+
command: ["/bin/sh", "-c"]
|
|
56
|
+
args:
|
|
57
|
+
- |
|
|
58
|
+
set -ex
|
|
59
|
+
/scripts/container_entry.sh install --install-nccl
|
|
60
|
+
cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
|
|
61
|
+
cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
|
|
62
|
+
# ibv_devinfo || exit 1
|
|
63
|
+
echo "installation finishes"
|
|
64
|
+
containers:
|
|
65
|
+
- image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
|
|
66
|
+
name: pause
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
apiVersion: apiextensions.k8s.io/v1
|
|
2
|
+
kind: CustomResourceDefinition
|
|
3
|
+
metadata:
|
|
4
|
+
name: storages.xpk.x-k8s.io
|
|
5
|
+
spec:
|
|
6
|
+
group: xpk.x-k8s.io
|
|
7
|
+
versions:
|
|
8
|
+
- name: v1
|
|
9
|
+
served: true
|
|
10
|
+
storage: true
|
|
11
|
+
schema:
|
|
12
|
+
openAPIV3Schema:
|
|
13
|
+
type: object
|
|
14
|
+
properties:
|
|
15
|
+
spec:
|
|
16
|
+
type: object
|
|
17
|
+
properties:
|
|
18
|
+
type:
|
|
19
|
+
type: string
|
|
20
|
+
cluster:
|
|
21
|
+
type: string
|
|
22
|
+
auto_mount:
|
|
23
|
+
type: boolean
|
|
24
|
+
mount_point:
|
|
25
|
+
type: string
|
|
26
|
+
readonly:
|
|
27
|
+
type: boolean
|
|
28
|
+
manifest:
|
|
29
|
+
type: string
|
|
30
|
+
pv:
|
|
31
|
+
type: string
|
|
32
|
+
pvc:
|
|
33
|
+
type: string
|
|
34
|
+
required:
|
|
35
|
+
- type
|
|
36
|
+
- cluster
|
|
37
|
+
- auto_mount
|
|
38
|
+
- mount_point
|
|
39
|
+
- readonly
|
|
40
|
+
- manifest
|
|
41
|
+
- pvc
|
|
42
|
+
- pv
|
|
43
|
+
x-kubernetes-validations:
|
|
44
|
+
- message: Value is immutable
|
|
45
|
+
rule: self == oldSelf
|
|
46
|
+
scope: Cluster
|
|
47
|
+
names:
|
|
48
|
+
plural: storages
|
|
49
|
+
singular: storage
|
|
50
|
+
kind: Storage
|
|
51
|
+
shortNames:
|
|
52
|
+
- stg
|
xpk/commands/cluster.py
CHANGED
|
@@ -18,7 +18,8 @@ from tabulate import tabulate
|
|
|
18
18
|
|
|
19
19
|
from ..utils.feature_flags import FeatureFlags
|
|
20
20
|
from ..utils.versions import ReleaseChannel
|
|
21
|
-
from ..core.
|
|
21
|
+
from ..core.pathways import get_pathways_machine_types
|
|
22
|
+
from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type
|
|
22
23
|
from ..core.cluster import (
|
|
23
24
|
get_all_clusters_programmatic,
|
|
24
25
|
get_cluster_credentials,
|
|
@@ -27,7 +28,6 @@ from ..core.cluster import (
|
|
|
27
28
|
set_jobset_on_cluster,
|
|
28
29
|
set_pathways_job_on_cluster,
|
|
29
30
|
setup_k8s_env,
|
|
30
|
-
disable_mglru_on_cluster,
|
|
31
31
|
count_nodes_on_cluster,
|
|
32
32
|
update_cluster_with_gcpfilestore_driver_if_necessary,
|
|
33
33
|
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
@@ -84,6 +84,7 @@ from jinja2 import Environment, FileSystemLoader
|
|
|
84
84
|
from ..utils.templates import get_templates_absolute_path
|
|
85
85
|
import shutil
|
|
86
86
|
import os
|
|
87
|
+
from .managed_ml_diagnostics import install_mldiagnostics_prerequisites
|
|
87
88
|
|
|
88
89
|
CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
|
|
89
90
|
|
|
@@ -210,6 +211,25 @@ def _validate_cluster_create_args(args, system: SystemCharacteristics):
|
|
|
210
211
|
if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
|
|
211
212
|
validate_sub_slicing_system(system)
|
|
212
213
|
_validate_sub_slicing_reservation(args)
|
|
214
|
+
if args.enable_pathways:
|
|
215
|
+
_validate_pathways_machine(args)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _validate_pathways_machine(args):
|
|
219
|
+
return_code, result = get_pathways_machine_types(
|
|
220
|
+
project=args.project, zone=args.zone
|
|
221
|
+
)
|
|
222
|
+
if return_code != 0:
|
|
223
|
+
xpk_print('Error: Unable to retrieve available pathways machine types')
|
|
224
|
+
xpk_exit(1)
|
|
225
|
+
|
|
226
|
+
if args.pathways_gce_machine_type not in result:
|
|
227
|
+
xpk_print(
|
|
228
|
+
'Error: Invalid --pathways-gce-machine-type. Specify machine type that'
|
|
229
|
+
' has at least 100GB of memory and at least 49 CPUs.'
|
|
230
|
+
)
|
|
231
|
+
xpk_print(f'Available machine types: {", ".join(result)}')
|
|
232
|
+
xpk_exit(1)
|
|
213
233
|
|
|
214
234
|
|
|
215
235
|
def _validate_sub_slicing_reservation(args):
|
|
@@ -261,11 +281,10 @@ def cluster_create(args) -> None:
|
|
|
261
281
|
xpk_print('Fetching system characteristics failed!')
|
|
262
282
|
xpk_exit(return_code)
|
|
263
283
|
|
|
264
|
-
_validate_cluster_create_args(args, system)
|
|
265
|
-
|
|
266
284
|
xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
|
|
267
285
|
add_zone_and_project(args)
|
|
268
286
|
|
|
287
|
+
_validate_cluster_create_args(args, system)
|
|
269
288
|
_log_cluster_create_telemetry(args)
|
|
270
289
|
|
|
271
290
|
release_channel = (
|
|
@@ -422,6 +441,13 @@ def cluster_create(args) -> None:
|
|
|
422
441
|
# pylint: disable=line-too-long
|
|
423
442
|
f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
|
|
424
443
|
)
|
|
444
|
+
|
|
445
|
+
if args.managed_mldiagnostics:
|
|
446
|
+
return_code = install_mldiagnostics_prerequisites()
|
|
447
|
+
if return_code != 0:
|
|
448
|
+
xpk_print('Installation of MLDiagnostics failed.')
|
|
449
|
+
xpk_exit(return_code)
|
|
450
|
+
|
|
425
451
|
xpk_exit(0)
|
|
426
452
|
|
|
427
453
|
|
|
@@ -979,7 +1005,7 @@ def update_coredns() -> int:
|
|
|
979
1005
|
|
|
980
1006
|
# 6. Scale up coredns and verify readiness
|
|
981
1007
|
scale_up_coredns(replicas=15)
|
|
982
|
-
verify_coredns_readiness(
|
|
1008
|
+
verify_coredns_readiness()
|
|
983
1009
|
|
|
984
1010
|
xpk_print('The CoreDNS setup process has been completed.')
|
|
985
1011
|
|
|
@@ -1220,7 +1246,8 @@ def run_gke_cluster_create_command(
|
|
|
1220
1246
|
|
|
1221
1247
|
if args.enable_lustre_csi_driver:
|
|
1222
1248
|
addons.append('LustreCsiDriver')
|
|
1223
|
-
|
|
1249
|
+
if args.enable_legacy_lustre_port:
|
|
1250
|
+
command += ' --enable-legacy-lustre-port'
|
|
1224
1251
|
|
|
1225
1252
|
if hasattr(args, 'enable_mtc') and args.enable_mtc:
|
|
1226
1253
|
addons.append('HighScaleCheckpointing')
|
|
@@ -1336,12 +1363,6 @@ def prepare_gpus(system: SystemCharacteristics):
|
|
|
1336
1363
|
if install_nri_code != 0:
|
|
1337
1364
|
xpk_exit(install_nri_code)
|
|
1338
1365
|
|
|
1339
|
-
if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
|
|
1340
|
-
xpk_print('Disabling MGLRU')
|
|
1341
|
-
err_code = disable_mglru_on_cluster()
|
|
1342
|
-
if err_code > 0:
|
|
1343
|
-
xpk_exit(err_code)
|
|
1344
|
-
|
|
1345
1366
|
|
|
1346
1367
|
def _log_cluster_create_telemetry(args) -> None:
|
|
1347
1368
|
if FeatureFlags.TELEMETRY_ENABLED:
|
|
@@ -20,7 +20,7 @@ import pytest
|
|
|
20
20
|
|
|
21
21
|
from xpk.commands.cluster_gcluster import cluster_create
|
|
22
22
|
from xpk.core.kueue_manager import KueueConfig
|
|
23
|
-
from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
|
|
23
|
+
from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
|
|
24
24
|
from xpk.utils.versions import ReleaseChannel
|
|
25
25
|
|
|
26
26
|
|
|
@@ -97,6 +97,8 @@ def test_install_kueue_standard(
|
|
|
97
97
|
accelerator_type=AcceleratorType.GPU,
|
|
98
98
|
device_type="h100-mega-80gb-8",
|
|
99
99
|
supports_sub_slicing=False,
|
|
100
|
+
docker_platform=DockerPlatform.ARM,
|
|
101
|
+
gpu_config=GpuConfig(requires_topology=True),
|
|
100
102
|
)
|
|
101
103
|
mock_cluster_create_deps["get_system_characteristics"].return_value = (
|
|
102
104
|
mock_system,
|
|
@@ -148,6 +150,8 @@ def test_install_kueue_with_autoprovisioning(
|
|
|
148
150
|
accelerator_type=AcceleratorType.GPU,
|
|
149
151
|
device_type="h100-mega-80gb-8",
|
|
150
152
|
supports_sub_slicing=False,
|
|
153
|
+
docker_platform=DockerPlatform.ARM,
|
|
154
|
+
gpu_config=GpuConfig(requires_topology=True),
|
|
151
155
|
)
|
|
152
156
|
mock_cluster_create_deps["get_system_characteristics"].return_value = (
|
|
153
157
|
mock_system,
|