xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +89 -32
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +16 -3
- xpk/commands/cluster_test.py +353 -7
- xpk/commands/config.py +3 -5
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +3 -1
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +143 -142
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +73 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +48 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +55 -104
- xpk/core/cluster_test.py +170 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +88 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +17 -19
- xpk/core/kueue_manager.py +205 -51
- xpk/core/kueue_manager_test.py +158 -4
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +37 -43
- xpk/core/nodepool_test.py +42 -19
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +144 -133
- xpk/core/scheduling_test.py +298 -6
- xpk/core/system_characteristics.py +256 -19
- xpk/core/system_characteristics_test.py +128 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +33 -13
- xpk/parser/cluster.py +40 -67
- xpk/parser/cluster_test.py +83 -3
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -29
- xpk/parser/workload_test.py +3 -49
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +10 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +2 -13
- xpk/utils/versions.py +31 -0
- xpk-0.16.0.dist-info/METADATA +127 -0
- xpk-0.16.0.dist-info/RECORD +168 -0
- xpk-0.14.4.dist-info/METADATA +0 -1645
- xpk-0.14.4.dist-info/RECORD +0 -139
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
integration/README.md
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
This folder contains integration tests.
|
|
2
|
+
|
|
3
|
+
To run them, the following environment variables must be set:
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
export PROJECT_ID=...
|
|
7
|
+
export REGION=...
|
|
8
|
+
export ZONE=...
|
|
9
|
+
export AUTH_CIDR=...
|
|
10
|
+
export DEPLOYMENT_DIR=...
|
|
11
|
+
export CLUSTER_NAME=...
|
|
12
|
+
export GCLOUD_CFG_PATH=...
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
To run the tests:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pytest src/integration
|
|
19
|
+
```
|
|
@@ -18,6 +18,7 @@ from xpk.commands.cluster_gcluster import get_unique_name
|
|
|
18
18
|
from xpk.core.docker_manager import DockerManager
|
|
19
19
|
from xpk.core.gcluster_manager import GclusterManager
|
|
20
20
|
from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
|
|
21
|
+
from xpk.utils.versions import ReleaseChannel
|
|
21
22
|
import pytest
|
|
22
23
|
import os
|
|
23
24
|
import shutil
|
|
@@ -28,6 +29,8 @@ region = os.getenv("REGION")
|
|
|
28
29
|
zone = os.getenv("ZONE")
|
|
29
30
|
auth_cidr = os.getenv("AUTH_CIDR")
|
|
30
31
|
cluster_name = os.getenv("A3_MEGA_TEST_CLUSTER_NAME")
|
|
32
|
+
release_channel = os.getenv("RELEASE_CHANNEL")
|
|
33
|
+
cluster_version = os.getenv("CLUSTER_VERSION")
|
|
31
34
|
|
|
32
35
|
uploads_dir = "uploads"
|
|
33
36
|
|
|
@@ -87,6 +90,8 @@ def test_create_a3_mega_deployment_files(setup_tests):
|
|
|
87
90
|
assert auth_cidr is not None
|
|
88
91
|
assert ctk_gcloud_cfg is not None
|
|
89
92
|
assert cluster_name is not None
|
|
93
|
+
assert release_channel is not None
|
|
94
|
+
assert cluster_version is not None
|
|
90
95
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
91
96
|
|
|
92
97
|
blueprint_name = f"{cluster_name}-a3-mega-xpk"
|
|
@@ -107,6 +112,8 @@ def test_create_a3_mega_deployment_files(setup_tests):
|
|
|
107
112
|
auth_cidr=auth_cidr,
|
|
108
113
|
zone=zone,
|
|
109
114
|
system_node_pool_min_node_count=3,
|
|
115
|
+
release_channel=ReleaseChannel(release_channel),
|
|
116
|
+
cluster_version=cluster_version,
|
|
110
117
|
)
|
|
111
118
|
blueprint_test_path = os.path.join(bp_path, prefix, f"{blueprint_name}.yaml")
|
|
112
119
|
blueprint_deps_test_path = os.path.join(bp_path, prefix, blueprint_name)
|
|
@@ -164,6 +171,8 @@ def create_test_a3_mega_deployment(docker_path: str, bp_path: str):
|
|
|
164
171
|
assert auth_cidr is not None
|
|
165
172
|
assert ctk_gcloud_cfg is not None
|
|
166
173
|
assert cluster_name is not None
|
|
174
|
+
assert release_channel is not None
|
|
175
|
+
assert cluster_version is not None
|
|
167
176
|
|
|
168
177
|
blueprint_name = f"{cluster_name}-a3-mega-xpk"
|
|
169
178
|
prefix = "prefix"
|
|
@@ -183,6 +192,8 @@ def create_test_a3_mega_deployment(docker_path: str, bp_path: str):
|
|
|
183
192
|
auth_cidr=auth_cidr,
|
|
184
193
|
zone=zone,
|
|
185
194
|
system_node_pool_min_node_count=3,
|
|
195
|
+
release_channel=ReleaseChannel(release_channel),
|
|
196
|
+
cluster_version=cluster_version,
|
|
186
197
|
)
|
|
187
198
|
|
|
188
199
|
gcluster_manager = GclusterManager(
|
|
@@ -24,6 +24,7 @@ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
|
|
|
24
24
|
from xpk.core.capacity import CapacityType
|
|
25
25
|
from xpk.core.docker_manager import DockerManager
|
|
26
26
|
from xpk.core.gcluster_manager import GclusterManager
|
|
27
|
+
from xpk.utils.versions import ReleaseChannel
|
|
27
28
|
|
|
28
29
|
ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH")
|
|
29
30
|
project_id = os.getenv("PROJECT_ID")
|
|
@@ -31,6 +32,8 @@ region = os.getenv("REGION")
|
|
|
31
32
|
zone = os.getenv("ZONE")
|
|
32
33
|
auth_cidr = os.getenv("AUTH_CIDR")
|
|
33
34
|
cluster_name = os.getenv("A3_ULTRA_TEST_CLUSTER_NAME")
|
|
35
|
+
release_channel = os.getenv("RELEASE_CHANNEL")
|
|
36
|
+
cluster_version = os.getenv("CLUSTER_VERSION")
|
|
34
37
|
|
|
35
38
|
|
|
36
39
|
@pytest.fixture(name="setup_tests")
|
|
@@ -60,6 +63,8 @@ def test_create_a3_ultra_deployment_files(setup_tests):
|
|
|
60
63
|
assert auth_cidr is not None
|
|
61
64
|
assert ctk_gcloud_cfg is not None
|
|
62
65
|
assert cluster_name is not None
|
|
66
|
+
assert release_channel is not None
|
|
67
|
+
assert cluster_version is not None
|
|
63
68
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
64
69
|
blueprint_name = f"{cluster_name}-a3-ultra-xpk"
|
|
65
70
|
|
|
@@ -80,6 +85,8 @@ def test_create_a3_ultra_deployment_files(setup_tests):
|
|
|
80
85
|
num_nodes=1,
|
|
81
86
|
system_node_pool_machine_type="e2-standard-16",
|
|
82
87
|
prefix=prefix,
|
|
88
|
+
release_channel=ReleaseChannel(release_channel),
|
|
89
|
+
cluster_version=cluster_version,
|
|
83
90
|
)
|
|
84
91
|
blueprint_test_path = os.path.join(bp_path, prefix, f"{blueprint_name}.yaml")
|
|
85
92
|
blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
|
|
@@ -125,6 +132,8 @@ def test_create_a3_ultra_deployment(setup_tests):
|
|
|
125
132
|
assert auth_cidr is not None
|
|
126
133
|
assert ctk_gcloud_cfg is not None
|
|
127
134
|
assert cluster_name is not None
|
|
135
|
+
assert release_channel is not None
|
|
136
|
+
assert cluster_version is not None
|
|
128
137
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
129
138
|
blueprint_name = f"{cluster_name}-a3-ultra-xpk"
|
|
130
139
|
|
|
@@ -144,6 +153,8 @@ def test_create_a3_ultra_deployment(setup_tests):
|
|
|
144
153
|
capacity_type=CapacityType.SPOT,
|
|
145
154
|
num_nodes=1,
|
|
146
155
|
system_node_pool_machine_type="e2-standard-16",
|
|
156
|
+
release_channel=ReleaseChannel(release_channel),
|
|
157
|
+
cluster_version=cluster_version,
|
|
147
158
|
)
|
|
148
159
|
blueprint_test_path = os.path.join(bp_path, f"{blueprint_name}.yaml")
|
|
149
160
|
blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
|
integration/gcluster_a4_test.py
CHANGED
|
@@ -24,6 +24,7 @@ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
|
|
|
24
24
|
from xpk.core.capacity import CapacityType
|
|
25
25
|
from xpk.core.docker_manager import DockerManager
|
|
26
26
|
from xpk.core.gcluster_manager import GclusterManager
|
|
27
|
+
from xpk.utils.versions import ReleaseChannel
|
|
27
28
|
|
|
28
29
|
ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH")
|
|
29
30
|
project_id = os.getenv("PROJECT_ID")
|
|
@@ -31,6 +32,8 @@ region = os.getenv("REGION")
|
|
|
31
32
|
zone = os.getenv("ZONE")
|
|
32
33
|
auth_cidr = os.getenv("AUTH_CIDR")
|
|
33
34
|
cluster_name = os.getenv("A4_TEST_CLUSTER_NAME")
|
|
35
|
+
release_channel = os.getenv("RELEASE_CHANNEL")
|
|
36
|
+
cluster_version = os.getenv("CLUSTER_VERSION")
|
|
34
37
|
|
|
35
38
|
|
|
36
39
|
@pytest.fixture(name="setup_tests")
|
|
@@ -60,6 +63,8 @@ def test_create_a4_deployment_files(setup_tests):
|
|
|
60
63
|
assert auth_cidr is not None
|
|
61
64
|
assert ctk_gcloud_cfg is not None
|
|
62
65
|
assert cluster_name is not None
|
|
66
|
+
assert release_channel is not None
|
|
67
|
+
assert cluster_version is not None
|
|
63
68
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
64
69
|
blueprint_name = f"{cluster_name}-a4-xpk"
|
|
65
70
|
|
|
@@ -80,6 +85,8 @@ def test_create_a4_deployment_files(setup_tests):
|
|
|
80
85
|
num_nodes=1,
|
|
81
86
|
system_node_pool_machine_type="e2-standard-16",
|
|
82
87
|
prefix=prefix,
|
|
88
|
+
release_channel=ReleaseChannel(release_channel),
|
|
89
|
+
cluster_version=cluster_version,
|
|
83
90
|
)
|
|
84
91
|
blueprint_test_path = os.path.join(bp_path, prefix, f"{blueprint_name}.yaml")
|
|
85
92
|
blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
|
|
@@ -125,6 +132,8 @@ def test_create_a4_deployment(setup_tests):
|
|
|
125
132
|
assert auth_cidr is not None
|
|
126
133
|
assert ctk_gcloud_cfg is not None
|
|
127
134
|
assert cluster_name is not None
|
|
135
|
+
assert release_channel is not None
|
|
136
|
+
assert cluster_version is not None
|
|
128
137
|
docker_path, bp_path = setup_tests[0], setup_tests[1]
|
|
129
138
|
blueprint_name = f"{cluster_name}-a4-xpk"
|
|
130
139
|
|
|
@@ -144,6 +153,8 @@ def test_create_a4_deployment(setup_tests):
|
|
|
144
153
|
capacity_type=CapacityType.SPOT,
|
|
145
154
|
num_nodes=1,
|
|
146
155
|
system_node_pool_machine_type="e2-standard-16",
|
|
156
|
+
release_channel=ReleaseChannel(release_channel),
|
|
157
|
+
cluster_version=cluster_version,
|
|
147
158
|
)
|
|
148
159
|
blueprint_test_path = os.path.join(bp_path, f"{blueprint_name}.yaml")
|
|
149
160
|
blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
kind: ConfigMap
|
|
2
|
+
apiVersion: v1
|
|
3
|
+
metadata:
|
|
4
|
+
name: ${resource_config_name}
|
|
5
|
+
data:
|
|
6
|
+
h100-mega-80gb-8: "${num_nodes}"
|
|
7
|
+
---
|
|
8
|
+
kind: ConfigMap
|
|
9
|
+
apiVersion: v1
|
|
10
|
+
metadata:
|
|
11
|
+
name: ${cluster_config_name}
|
|
12
|
+
data:
|
|
13
|
+
capacity_type: "${capacity_type}"
|
|
14
|
+
reservation_id: "${reservation}"
|
|
15
|
+
provisioner: gcluster
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
apiVersion: apiextensions.k8s.io/v1
|
|
2
|
+
kind: CustomResourceDefinition
|
|
3
|
+
metadata:
|
|
4
|
+
name: storages.xpk.x-k8s.io
|
|
5
|
+
spec:
|
|
6
|
+
group: xpk.x-k8s.io
|
|
7
|
+
versions:
|
|
8
|
+
- name: v1
|
|
9
|
+
served: true
|
|
10
|
+
storage: true
|
|
11
|
+
schema:
|
|
12
|
+
openAPIV3Schema:
|
|
13
|
+
type: object
|
|
14
|
+
properties:
|
|
15
|
+
spec:
|
|
16
|
+
type: object
|
|
17
|
+
properties:
|
|
18
|
+
type:
|
|
19
|
+
type: string
|
|
20
|
+
cluster:
|
|
21
|
+
type: string
|
|
22
|
+
auto_mount:
|
|
23
|
+
type: boolean
|
|
24
|
+
mount_point:
|
|
25
|
+
type: string
|
|
26
|
+
readonly:
|
|
27
|
+
type: boolean
|
|
28
|
+
manifest:
|
|
29
|
+
type: string
|
|
30
|
+
pv:
|
|
31
|
+
type: string
|
|
32
|
+
pvc:
|
|
33
|
+
type: string
|
|
34
|
+
required:
|
|
35
|
+
- type
|
|
36
|
+
- cluster
|
|
37
|
+
- auto_mount
|
|
38
|
+
- mount_point
|
|
39
|
+
- readonly
|
|
40
|
+
- manifest
|
|
41
|
+
- pvc
|
|
42
|
+
- pv
|
|
43
|
+
x-kubernetes-validations:
|
|
44
|
+
- message: Value is immutable
|
|
45
|
+
rule: self == oldSelf
|
|
46
|
+
scope: Cluster
|
|
47
|
+
names:
|
|
48
|
+
plural: storages
|
|
49
|
+
singular: storage
|
|
50
|
+
kind: Storage
|
|
51
|
+
shortNames:
|
|
52
|
+
- stg
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
kind: ConfigMap
|
|
2
|
+
apiVersion: v1
|
|
3
|
+
metadata:
|
|
4
|
+
name: ${resource_config_name}
|
|
5
|
+
data:
|
|
6
|
+
h200-141gb-8: "${num_nodes}"
|
|
7
|
+
---
|
|
8
|
+
kind: ConfigMap
|
|
9
|
+
apiVersion: v1
|
|
10
|
+
metadata:
|
|
11
|
+
name: ${cluster_config_name}
|
|
12
|
+
data:
|
|
13
|
+
capacity_type: "${capacity_type}"
|
|
14
|
+
reservation_id: "${reservation}"
|
|
15
|
+
provisioner: gcluster
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright 2024 Google Inc. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
apiVersion: apps/v1
|
|
16
|
+
kind: DaemonSet
|
|
17
|
+
metadata:
|
|
18
|
+
name: disable-mglru
|
|
19
|
+
namespace: kube-system
|
|
20
|
+
spec:
|
|
21
|
+
selector:
|
|
22
|
+
matchLabels:
|
|
23
|
+
app: disable-mglru
|
|
24
|
+
template:
|
|
25
|
+
metadata:
|
|
26
|
+
labels:
|
|
27
|
+
app: disable-mglru
|
|
28
|
+
spec:
|
|
29
|
+
hostNetwork: true
|
|
30
|
+
tolerations:
|
|
31
|
+
- operator: "Exists"
|
|
32
|
+
key: nvidia.com/gpu
|
|
33
|
+
containers:
|
|
34
|
+
- name: disable-mglru
|
|
35
|
+
image: alpine:latest
|
|
36
|
+
command: ["/bin/sh"]
|
|
37
|
+
securityContext:
|
|
38
|
+
privileged: true
|
|
39
|
+
args:
|
|
40
|
+
- -c
|
|
41
|
+
- |
|
|
42
|
+
echo n | tee /sys/kernel/mm/lru_gen/enabled
|
|
43
|
+
sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
|
|
44
|
+
sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
|
|
45
|
+
sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
|
|
46
|
+
sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
|
|
47
|
+
sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
|
|
48
|
+
sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
|
|
49
|
+
sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
|
|
50
|
+
sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
|
|
51
|
+
sleep infinity
|
|
52
|
+
volumeMounts:
|
|
53
|
+
- name: sys-kernel-mm-lru-gen
|
|
54
|
+
mountPath: /sys/kernel/mm/lru_gen
|
|
55
|
+
# Remount sysfs so that it will be writable.
|
|
56
|
+
volumes:
|
|
57
|
+
- name: sys-kernel-mm-lru-gen
|
|
58
|
+
hostPath:
|
|
59
|
+
path: /sys/kernel/mm/lru_gen
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# Copyright 2024 Google Inc. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
apiVersion: apps/v1
|
|
16
|
+
kind: DaemonSet
|
|
17
|
+
metadata:
|
|
18
|
+
name: nccl-rdma-installer
|
|
19
|
+
namespace: kube-system
|
|
20
|
+
labels:
|
|
21
|
+
k8s-app: nccl-rdma-installer
|
|
22
|
+
spec:
|
|
23
|
+
selector:
|
|
24
|
+
matchLabels:
|
|
25
|
+
k8s-app: nccl-rdma-installer
|
|
26
|
+
updateStrategy:
|
|
27
|
+
type: RollingUpdate
|
|
28
|
+
template:
|
|
29
|
+
metadata:
|
|
30
|
+
labels:
|
|
31
|
+
name: nccl-rdma-installer
|
|
32
|
+
k8s-app: nccl-rdma-installer
|
|
33
|
+
spec:
|
|
34
|
+
priorityClassName: system-node-critical
|
|
35
|
+
affinity:
|
|
36
|
+
nodeAffinity:
|
|
37
|
+
requiredDuringSchedulingIgnoredDuringExecution:
|
|
38
|
+
nodeSelectorTerms:
|
|
39
|
+
- matchExpressions:
|
|
40
|
+
- key: cloud.google.com/gke-accelerator
|
|
41
|
+
operator: In
|
|
42
|
+
values:
|
|
43
|
+
- nvidia-h200-141gb
|
|
44
|
+
tolerations:
|
|
45
|
+
- operator: "Exists"
|
|
46
|
+
hostNetwork: true
|
|
47
|
+
hostPID: true
|
|
48
|
+
volumes:
|
|
49
|
+
- name: library-dir-host
|
|
50
|
+
hostPath:
|
|
51
|
+
path: /home/kubernetes/bin/nvidia/lib64
|
|
52
|
+
type: DirectoryOrCreate
|
|
53
|
+
- name: gib
|
|
54
|
+
hostPath:
|
|
55
|
+
path: /home/kubernetes/bin/gib
|
|
56
|
+
initContainers:
|
|
57
|
+
- name: disable-log-martian
|
|
58
|
+
image: alpine:latest
|
|
59
|
+
command: ["/bin/sh"]
|
|
60
|
+
securityContext:
|
|
61
|
+
privileged: true
|
|
62
|
+
args:
|
|
63
|
+
- -c
|
|
64
|
+
- |
|
|
65
|
+
sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
|
|
66
|
+
sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
|
|
67
|
+
sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
|
|
68
|
+
sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
|
|
69
|
+
sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
|
|
70
|
+
sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
|
|
71
|
+
sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
|
|
72
|
+
sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
|
|
73
|
+
- image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3
|
|
74
|
+
name: nccl-rdma-installer
|
|
75
|
+
resources:
|
|
76
|
+
requests:
|
|
77
|
+
cpu: 150m
|
|
78
|
+
securityContext:
|
|
79
|
+
privileged: true
|
|
80
|
+
volumeMounts:
|
|
81
|
+
- name: library-dir-host
|
|
82
|
+
mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
|
|
83
|
+
- name: gib
|
|
84
|
+
mountPath: /usr/local/home/kubernetes/bin/gib
|
|
85
|
+
command: ["/bin/sh", "-c"]
|
|
86
|
+
args:
|
|
87
|
+
- |
|
|
88
|
+
set -ex
|
|
89
|
+
/scripts/container_entry.sh install --install-nccl
|
|
90
|
+
cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
|
|
91
|
+
cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
|
|
92
|
+
echo "installation finishes"
|
|
93
|
+
containers:
|
|
94
|
+
- image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
|
|
95
|
+
name: pause
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
apiVersion: apiextensions.k8s.io/v1
|
|
2
|
+
kind: CustomResourceDefinition
|
|
3
|
+
metadata:
|
|
4
|
+
name: storages.xpk.x-k8s.io
|
|
5
|
+
spec:
|
|
6
|
+
group: xpk.x-k8s.io
|
|
7
|
+
versions:
|
|
8
|
+
- name: v1
|
|
9
|
+
served: true
|
|
10
|
+
storage: true
|
|
11
|
+
schema:
|
|
12
|
+
openAPIV3Schema:
|
|
13
|
+
type: object
|
|
14
|
+
properties:
|
|
15
|
+
spec:
|
|
16
|
+
type: object
|
|
17
|
+
properties:
|
|
18
|
+
type:
|
|
19
|
+
type: string
|
|
20
|
+
cluster:
|
|
21
|
+
type: string
|
|
22
|
+
auto_mount:
|
|
23
|
+
type: boolean
|
|
24
|
+
mount_point:
|
|
25
|
+
type: string
|
|
26
|
+
readonly:
|
|
27
|
+
type: boolean
|
|
28
|
+
manifest:
|
|
29
|
+
type: string
|
|
30
|
+
pv:
|
|
31
|
+
type: string
|
|
32
|
+
pvc:
|
|
33
|
+
type: string
|
|
34
|
+
required:
|
|
35
|
+
- type
|
|
36
|
+
- cluster
|
|
37
|
+
- auto_mount
|
|
38
|
+
- mount_point
|
|
39
|
+
- readonly
|
|
40
|
+
- manifest
|
|
41
|
+
- pvc
|
|
42
|
+
- pv
|
|
43
|
+
x-kubernetes-validations:
|
|
44
|
+
- message: Value is immutable
|
|
45
|
+
rule: self == oldSelf
|
|
46
|
+
scope: Cluster
|
|
47
|
+
names:
|
|
48
|
+
plural: storages
|
|
49
|
+
singular: storage
|
|
50
|
+
kind: Storage
|
|
51
|
+
shortNames:
|
|
52
|
+
- stg
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
kind: ConfigMap
|
|
2
|
+
apiVersion: v1
|
|
3
|
+
metadata:
|
|
4
|
+
name: ${resource_config_name}
|
|
5
|
+
data:
|
|
6
|
+
b200-8: "${num_nodes}"
|
|
7
|
+
---
|
|
8
|
+
kind: ConfigMap
|
|
9
|
+
apiVersion: v1
|
|
10
|
+
metadata:
|
|
11
|
+
name: ${cluster_config_name}
|
|
12
|
+
data:
|
|
13
|
+
capacity_type: "${capacity_type}"
|
|
14
|
+
reservation_id: "${reservation}"
|
|
15
|
+
provisioner: gcluster
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
apiVersion: apps/v1
|
|
2
|
+
kind: DaemonSet
|
|
3
|
+
metadata:
|
|
4
|
+
name: nccl-rdma-installer
|
|
5
|
+
namespace: kube-system
|
|
6
|
+
labels:
|
|
7
|
+
k8s-app: nccl-rdma-installer
|
|
8
|
+
spec:
|
|
9
|
+
selector:
|
|
10
|
+
matchLabels:
|
|
11
|
+
k8s-app: nccl-rdma-installer
|
|
12
|
+
updateStrategy:
|
|
13
|
+
type: RollingUpdate
|
|
14
|
+
template:
|
|
15
|
+
metadata:
|
|
16
|
+
labels:
|
|
17
|
+
name: nccl-rdma-installer
|
|
18
|
+
k8s-app: nccl-rdma-installer
|
|
19
|
+
spec:
|
|
20
|
+
priorityClassName: system-node-critical
|
|
21
|
+
affinity:
|
|
22
|
+
nodeAffinity:
|
|
23
|
+
requiredDuringSchedulingIgnoredDuringExecution:
|
|
24
|
+
nodeSelectorTerms:
|
|
25
|
+
- matchExpressions:
|
|
26
|
+
- key: cloud.google.com/gke-accelerator
|
|
27
|
+
operator: In
|
|
28
|
+
values:
|
|
29
|
+
- nvidia-b200
|
|
30
|
+
tolerations:
|
|
31
|
+
- operator: "Exists"
|
|
32
|
+
hostNetwork: true
|
|
33
|
+
hostPID: true
|
|
34
|
+
volumes:
|
|
35
|
+
- name: library-dir-host
|
|
36
|
+
hostPath:
|
|
37
|
+
path: /home/kubernetes/bin/nvidia/lib64
|
|
38
|
+
type: DirectoryOrCreate
|
|
39
|
+
- name: gib
|
|
40
|
+
hostPath:
|
|
41
|
+
path: /home/kubernetes/bin/gib
|
|
42
|
+
initContainers:
|
|
43
|
+
- image: us-docker.pkg.dev/kernel-net-team/clouda4-nccl-dev/nccl-plugin-gib-diagnostic:v1.0.3-b200
|
|
44
|
+
name: nccl-rdma-installer
|
|
45
|
+
resources:
|
|
46
|
+
requests:
|
|
47
|
+
cpu: 150m
|
|
48
|
+
securityContext:
|
|
49
|
+
privileged: true
|
|
50
|
+
volumeMounts:
|
|
51
|
+
- name: library-dir-host
|
|
52
|
+
mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
|
|
53
|
+
- name: gib
|
|
54
|
+
mountPath: /usr/local/home/kubernetes/bin/gib
|
|
55
|
+
command: ["/bin/sh", "-c"]
|
|
56
|
+
args:
|
|
57
|
+
- |
|
|
58
|
+
set -ex
|
|
59
|
+
/scripts/container_entry.sh install --install-nccl
|
|
60
|
+
cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
|
|
61
|
+
cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
|
|
62
|
+
# ibv_devinfo || exit 1
|
|
63
|
+
echo "installation finishes"
|
|
64
|
+
containers:
|
|
65
|
+
- image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
|
|
66
|
+
name: pause
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
apiVersion: apiextensions.k8s.io/v1
|
|
2
|
+
kind: CustomResourceDefinition
|
|
3
|
+
metadata:
|
|
4
|
+
name: storages.xpk.x-k8s.io
|
|
5
|
+
spec:
|
|
6
|
+
group: xpk.x-k8s.io
|
|
7
|
+
versions:
|
|
8
|
+
- name: v1
|
|
9
|
+
served: true
|
|
10
|
+
storage: true
|
|
11
|
+
schema:
|
|
12
|
+
openAPIV3Schema:
|
|
13
|
+
type: object
|
|
14
|
+
properties:
|
|
15
|
+
spec:
|
|
16
|
+
type: object
|
|
17
|
+
properties:
|
|
18
|
+
type:
|
|
19
|
+
type: string
|
|
20
|
+
cluster:
|
|
21
|
+
type: string
|
|
22
|
+
auto_mount:
|
|
23
|
+
type: boolean
|
|
24
|
+
mount_point:
|
|
25
|
+
type: string
|
|
26
|
+
readonly:
|
|
27
|
+
type: boolean
|
|
28
|
+
manifest:
|
|
29
|
+
type: string
|
|
30
|
+
pv:
|
|
31
|
+
type: string
|
|
32
|
+
pvc:
|
|
33
|
+
type: string
|
|
34
|
+
required:
|
|
35
|
+
- type
|
|
36
|
+
- cluster
|
|
37
|
+
- auto_mount
|
|
38
|
+
- mount_point
|
|
39
|
+
- readonly
|
|
40
|
+
- manifest
|
|
41
|
+
- pvc
|
|
42
|
+
- pv
|
|
43
|
+
x-kubernetes-validations:
|
|
44
|
+
- message: Value is immutable
|
|
45
|
+
rule: self == oldSelf
|
|
46
|
+
scope: Cluster
|
|
47
|
+
names:
|
|
48
|
+
plural: storages
|
|
49
|
+
singular: storage
|
|
50
|
+
kind: Storage
|
|
51
|
+
shortNames:
|
|
52
|
+
- stg
|