xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. integration/README.md +19 -0
  2. integration/gcluster_a3mega_test.py +11 -0
  3. integration/gcluster_a3ultra_test.py +11 -0
  4. integration/gcluster_a4_test.py +11 -0
  5. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  6. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  7. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  8. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  9. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  10. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  11. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  12. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  13. xpk/blueprints/a4/storage_crd.yaml +52 -0
  14. xpk/commands/cluster.py +89 -32
  15. xpk/commands/cluster_gcluster.py +25 -5
  16. xpk/commands/cluster_gcluster_test.py +16 -3
  17. xpk/commands/cluster_test.py +353 -7
  18. xpk/commands/config.py +3 -5
  19. xpk/commands/inspector.py +5 -3
  20. xpk/commands/kind.py +3 -1
  21. xpk/commands/managed_ml_diagnostics.py +249 -0
  22. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  23. xpk/commands/storage.py +8 -10
  24. xpk/commands/workload.py +143 -142
  25. xpk/commands/workload_test.py +160 -118
  26. xpk/core/blueprint/blueprint_generator.py +73 -33
  27. xpk/core/blueprint/blueprint_test.py +9 -0
  28. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  29. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  30. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  31. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  32. xpk/core/capacity.py +48 -8
  33. xpk/core/capacity_test.py +32 -1
  34. xpk/core/cluster.py +55 -104
  35. xpk/core/cluster_test.py +170 -0
  36. xpk/core/commands.py +4 -10
  37. xpk/core/config.py +88 -7
  38. xpk/core/config_test.py +67 -11
  39. xpk/core/docker_container.py +3 -1
  40. xpk/core/docker_image.py +10 -6
  41. xpk/core/docker_resources.py +1 -10
  42. xpk/core/gcloud_context.py +18 -12
  43. xpk/core/gcloud_context_test.py +111 -1
  44. xpk/core/kjob.py +17 -19
  45. xpk/core/kueue_manager.py +205 -51
  46. xpk/core/kueue_manager_test.py +158 -4
  47. xpk/core/nap.py +13 -14
  48. xpk/core/nodepool.py +37 -43
  49. xpk/core/nodepool_test.py +42 -19
  50. xpk/core/pathways.py +23 -0
  51. xpk/core/pathways_test.py +57 -0
  52. xpk/core/resources.py +84 -27
  53. xpk/core/scheduling.py +144 -133
  54. xpk/core/scheduling_test.py +298 -6
  55. xpk/core/system_characteristics.py +256 -19
  56. xpk/core/system_characteristics_test.py +128 -5
  57. xpk/core/telemetry.py +263 -0
  58. xpk/core/telemetry_test.py +211 -0
  59. xpk/core/vertex.py +4 -3
  60. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  61. xpk/main.py +33 -13
  62. xpk/parser/cluster.py +40 -67
  63. xpk/parser/cluster_test.py +83 -3
  64. xpk/parser/common.py +84 -0
  65. xpk/parser/storage.py +10 -0
  66. xpk/parser/storage_test.py +47 -0
  67. xpk/parser/workload.py +14 -29
  68. xpk/parser/workload_test.py +3 -49
  69. xpk/telemetry_uploader.py +29 -0
  70. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  71. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  72. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  73. xpk/utils/console.py +41 -10
  74. xpk/utils/console_test.py +106 -0
  75. xpk/utils/feature_flags.py +10 -1
  76. xpk/utils/file.py +4 -1
  77. xpk/utils/topology.py +4 -0
  78. xpk/utils/user_agent.py +35 -0
  79. xpk/utils/user_agent_test.py +44 -0
  80. xpk/utils/user_input.py +48 -0
  81. xpk/utils/user_input_test.py +92 -0
  82. xpk/utils/validation.py +2 -13
  83. xpk/utils/versions.py +31 -0
  84. xpk-0.16.0.dist-info/METADATA +127 -0
  85. xpk-0.16.0.dist-info/RECORD +168 -0
  86. xpk-0.14.4.dist-info/METADATA +0 -1645
  87. xpk-0.14.4.dist-info/RECORD +0 -139
  88. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  89. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  90. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  91. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
integration/README.md ADDED
@@ -0,0 +1,19 @@
1
+ This folder contains integration tests.
2
+
3
+ To run them env variables are needed:
4
+
5
+ ```bash
6
+ export PROJECT_ID=...
7
+ export REGION=...
8
+ export ZONE=...
9
+ export AUTH_CIDR=...
10
+ export DEPLOYMENT_DIR=...
11
+ export CLUSTER_NAME=...
12
+ export GCLOUD_CFG_PATH=...
13
+ ```
14
+
15
+ To run tests:
16
+
17
+ ```bash
18
+ pytest src/integration
19
+ ```
@@ -18,6 +18,7 @@ from xpk.commands.cluster_gcluster import get_unique_name
18
18
  from xpk.core.docker_manager import DockerManager
19
19
  from xpk.core.gcluster_manager import GclusterManager
20
20
  from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
21
+ from xpk.utils.versions import ReleaseChannel
21
22
  import pytest
22
23
  import os
23
24
  import shutil
@@ -28,6 +29,8 @@ region = os.getenv("REGION")
28
29
  zone = os.getenv("ZONE")
29
30
  auth_cidr = os.getenv("AUTH_CIDR")
30
31
  cluster_name = os.getenv("A3_MEGA_TEST_CLUSTER_NAME")
32
+ release_channel = os.getenv("RELEASE_CHANNEL")
33
+ cluster_version = os.getenv("CLUSTER_VERSION")
31
34
 
32
35
  uploads_dir = "uploads"
33
36
 
@@ -87,6 +90,8 @@ def test_create_a3_mega_deployment_files(setup_tests):
87
90
  assert auth_cidr is not None
88
91
  assert ctk_gcloud_cfg is not None
89
92
  assert cluster_name is not None
93
+ assert release_channel is not None
94
+ assert cluster_version is not None
90
95
  docker_path, bp_path = setup_tests[0], setup_tests[1]
91
96
 
92
97
  blueprint_name = f"{cluster_name}-a3-mega-xpk"
@@ -107,6 +112,8 @@ def test_create_a3_mega_deployment_files(setup_tests):
107
112
  auth_cidr=auth_cidr,
108
113
  zone=zone,
109
114
  system_node_pool_min_node_count=3,
115
+ release_channel=ReleaseChannel(release_channel),
116
+ cluster_version=cluster_version,
110
117
  )
111
118
  blueprint_test_path = os.path.join(bp_path, prefix, f"{blueprint_name}.yaml")
112
119
  blueprint_deps_test_path = os.path.join(bp_path, prefix, blueprint_name)
@@ -164,6 +171,8 @@ def create_test_a3_mega_deployment(docker_path: str, bp_path: str):
164
171
  assert auth_cidr is not None
165
172
  assert ctk_gcloud_cfg is not None
166
173
  assert cluster_name is not None
174
+ assert release_channel is not None
175
+ assert cluster_version is not None
167
176
 
168
177
  blueprint_name = f"{cluster_name}-a3-mega-xpk"
169
178
  prefix = "prefix"
@@ -183,6 +192,8 @@ def create_test_a3_mega_deployment(docker_path: str, bp_path: str):
183
192
  auth_cidr=auth_cidr,
184
193
  zone=zone,
185
194
  system_node_pool_min_node_count=3,
195
+ release_channel=ReleaseChannel(release_channel),
196
+ cluster_version=cluster_version,
186
197
  )
187
198
 
188
199
  gcluster_manager = GclusterManager(
@@ -24,6 +24,7 @@ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
24
24
  from xpk.core.capacity import CapacityType
25
25
  from xpk.core.docker_manager import DockerManager
26
26
  from xpk.core.gcluster_manager import GclusterManager
27
+ from xpk.utils.versions import ReleaseChannel
27
28
 
28
29
  ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH")
29
30
  project_id = os.getenv("PROJECT_ID")
@@ -31,6 +32,8 @@ region = os.getenv("REGION")
31
32
  zone = os.getenv("ZONE")
32
33
  auth_cidr = os.getenv("AUTH_CIDR")
33
34
  cluster_name = os.getenv("A3_ULTRA_TEST_CLUSTER_NAME")
35
+ release_channel = os.getenv("RELEASE_CHANNEL")
36
+ cluster_version = os.getenv("CLUSTER_VERSION")
34
37
 
35
38
 
36
39
  @pytest.fixture(name="setup_tests")
@@ -60,6 +63,8 @@ def test_create_a3_ultra_deployment_files(setup_tests):
60
63
  assert auth_cidr is not None
61
64
  assert ctk_gcloud_cfg is not None
62
65
  assert cluster_name is not None
66
+ assert release_channel is not None
67
+ assert cluster_version is not None
63
68
  docker_path, bp_path = setup_tests[0], setup_tests[1]
64
69
  blueprint_name = f"{cluster_name}-a3-ultra-xpk"
65
70
 
@@ -80,6 +85,8 @@ def test_create_a3_ultra_deployment_files(setup_tests):
80
85
  num_nodes=1,
81
86
  system_node_pool_machine_type="e2-standard-16",
82
87
  prefix=prefix,
88
+ release_channel=ReleaseChannel(release_channel),
89
+ cluster_version=cluster_version,
83
90
  )
84
91
  blueprint_test_path = os.path.join(bp_path, prefix, f"{blueprint_name}.yaml")
85
92
  blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
@@ -125,6 +132,8 @@ def test_create_a3_ultra_deployment(setup_tests):
125
132
  assert auth_cidr is not None
126
133
  assert ctk_gcloud_cfg is not None
127
134
  assert cluster_name is not None
135
+ assert release_channel is not None
136
+ assert cluster_version is not None
128
137
  docker_path, bp_path = setup_tests[0], setup_tests[1]
129
138
  blueprint_name = f"{cluster_name}-a3-ultra-xpk"
130
139
 
@@ -144,6 +153,8 @@ def test_create_a3_ultra_deployment(setup_tests):
144
153
  capacity_type=CapacityType.SPOT,
145
154
  num_nodes=1,
146
155
  system_node_pool_machine_type="e2-standard-16",
156
+ release_channel=ReleaseChannel(release_channel),
157
+ cluster_version=cluster_version,
147
158
  )
148
159
  blueprint_test_path = os.path.join(bp_path, f"{blueprint_name}.yaml")
149
160
  blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
@@ -24,6 +24,7 @@ from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
24
24
  from xpk.core.capacity import CapacityType
25
25
  from xpk.core.docker_manager import DockerManager
26
26
  from xpk.core.gcluster_manager import GclusterManager
27
+ from xpk.utils.versions import ReleaseChannel
27
28
 
28
29
  ctk_gcloud_cfg = os.getenv("GCLOUD_CFG_PATH")
29
30
  project_id = os.getenv("PROJECT_ID")
@@ -31,6 +32,8 @@ region = os.getenv("REGION")
31
32
  zone = os.getenv("ZONE")
32
33
  auth_cidr = os.getenv("AUTH_CIDR")
33
34
  cluster_name = os.getenv("A4_TEST_CLUSTER_NAME")
35
+ release_channel = os.getenv("RELEASE_CHANNEL")
36
+ cluster_version = os.getenv("CLUSTER_VERSION")
34
37
 
35
38
 
36
39
  @pytest.fixture(name="setup_tests")
@@ -60,6 +63,8 @@ def test_create_a4_deployment_files(setup_tests):
60
63
  assert auth_cidr is not None
61
64
  assert ctk_gcloud_cfg is not None
62
65
  assert cluster_name is not None
66
+ assert release_channel is not None
67
+ assert cluster_version is not None
63
68
  docker_path, bp_path = setup_tests[0], setup_tests[1]
64
69
  blueprint_name = f"{cluster_name}-a4-xpk"
65
70
 
@@ -80,6 +85,8 @@ def test_create_a4_deployment_files(setup_tests):
80
85
  num_nodes=1,
81
86
  system_node_pool_machine_type="e2-standard-16",
82
87
  prefix=prefix,
88
+ release_channel=ReleaseChannel(release_channel),
89
+ cluster_version=cluster_version,
83
90
  )
84
91
  blueprint_test_path = os.path.join(bp_path, prefix, f"{blueprint_name}.yaml")
85
92
  blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
@@ -125,6 +132,8 @@ def test_create_a4_deployment(setup_tests):
125
132
  assert auth_cidr is not None
126
133
  assert ctk_gcloud_cfg is not None
127
134
  assert cluster_name is not None
135
+ assert release_channel is not None
136
+ assert cluster_version is not None
128
137
  docker_path, bp_path = setup_tests[0], setup_tests[1]
129
138
  blueprint_name = f"{cluster_name}-a4-xpk"
130
139
 
@@ -144,6 +153,8 @@ def test_create_a4_deployment(setup_tests):
144
153
  capacity_type=CapacityType.SPOT,
145
154
  num_nodes=1,
146
155
  system_node_pool_machine_type="e2-standard-16",
156
+ release_channel=ReleaseChannel(release_channel),
157
+ cluster_version=cluster_version,
147
158
  )
148
159
  blueprint_test_path = os.path.join(bp_path, f"{blueprint_name}.yaml")
149
160
  blueprint_deps_test_path = os.path.join(bp_path, blueprint_name)
@@ -0,0 +1,15 @@
1
+ kind: ConfigMap
2
+ apiVersion: v1
3
+ metadata:
4
+ name: ${resource_config_name}
5
+ data:
6
+ h100-mega-80gb-8: "${num_nodes}"
7
+ ---
8
+ kind: ConfigMap
9
+ apiVersion: v1
10
+ metadata:
11
+ name: ${cluster_config_name}
12
+ data:
13
+ capacity_type: "${capacity_type}"
14
+ reservation_id: "${reservation}"
15
+ provisioner: gcluster
@@ -0,0 +1,52 @@
1
+ apiVersion: apiextensions.k8s.io/v1
2
+ kind: CustomResourceDefinition
3
+ metadata:
4
+ name: storages.xpk.x-k8s.io
5
+ spec:
6
+ group: xpk.x-k8s.io
7
+ versions:
8
+ - name: v1
9
+ served: true
10
+ storage: true
11
+ schema:
12
+ openAPIV3Schema:
13
+ type: object
14
+ properties:
15
+ spec:
16
+ type: object
17
+ properties:
18
+ type:
19
+ type: string
20
+ cluster:
21
+ type: string
22
+ auto_mount:
23
+ type: boolean
24
+ mount_point:
25
+ type: string
26
+ readonly:
27
+ type: boolean
28
+ manifest:
29
+ type: string
30
+ pv:
31
+ type: string
32
+ pvc:
33
+ type: string
34
+ required:
35
+ - type
36
+ - cluster
37
+ - auto_mount
38
+ - mount_point
39
+ - readonly
40
+ - manifest
41
+ - pvc
42
+ - pv
43
+ x-kubernetes-validations:
44
+ - message: Value is immutable
45
+ rule: self == oldSelf
46
+ scope: Cluster
47
+ names:
48
+ plural: storages
49
+ singular: storage
50
+ kind: Storage
51
+ shortNames:
52
+ - stg
@@ -0,0 +1,15 @@
1
+ kind: ConfigMap
2
+ apiVersion: v1
3
+ metadata:
4
+ name: ${resource_config_name}
5
+ data:
6
+ h200-141gb-8: "${num_nodes}"
7
+ ---
8
+ kind: ConfigMap
9
+ apiVersion: v1
10
+ metadata:
11
+ name: ${cluster_config_name}
12
+ data:
13
+ capacity_type: "${capacity_type}"
14
+ reservation_id: "${reservation}"
15
+ provisioner: gcluster
@@ -0,0 +1,59 @@
1
+ # Copyright 2024 Google Inc. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ apiVersion: apps/v1
16
+ kind: DaemonSet
17
+ metadata:
18
+ name: disable-mglru
19
+ namespace: kube-system
20
+ spec:
21
+ selector:
22
+ matchLabels:
23
+ app: disable-mglru
24
+ template:
25
+ metadata:
26
+ labels:
27
+ app: disable-mglru
28
+ spec:
29
+ hostNetwork: true
30
+ tolerations:
31
+ - operator: "Exists"
32
+ key: nvidia.com/gpu
33
+ containers:
34
+ - name: disable-mglru
35
+ image: alpine:latest
36
+ command: ["/bin/sh"]
37
+ securityContext:
38
+ privileged: true
39
+ args:
40
+ - -c
41
+ - |
42
+ echo n | tee /sys/kernel/mm/lru_gen/enabled
43
+ sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
44
+ sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
45
+ sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
46
+ sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
47
+ sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
48
+ sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
49
+ sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
50
+ sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
51
+ sleep infinity
52
+ volumeMounts:
53
+ - name: sys-kernel-mm-lru-gen
54
+ mountPath: /sys/kernel/mm/lru_gen
55
+ # Remount sysfs so that it will be writable.
56
+ volumes:
57
+ - name: sys-kernel-mm-lru-gen
58
+ hostPath:
59
+ path: /sys/kernel/mm/lru_gen
@@ -0,0 +1,95 @@
1
+ # Copyright 2024 Google Inc. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ apiVersion: apps/v1
16
+ kind: DaemonSet
17
+ metadata:
18
+ name: nccl-rdma-installer
19
+ namespace: kube-system
20
+ labels:
21
+ k8s-app: nccl-rdma-installer
22
+ spec:
23
+ selector:
24
+ matchLabels:
25
+ k8s-app: nccl-rdma-installer
26
+ updateStrategy:
27
+ type: RollingUpdate
28
+ template:
29
+ metadata:
30
+ labels:
31
+ name: nccl-rdma-installer
32
+ k8s-app: nccl-rdma-installer
33
+ spec:
34
+ priorityClassName: system-node-critical
35
+ affinity:
36
+ nodeAffinity:
37
+ requiredDuringSchedulingIgnoredDuringExecution:
38
+ nodeSelectorTerms:
39
+ - matchExpressions:
40
+ - key: cloud.google.com/gke-accelerator
41
+ operator: In
42
+ values:
43
+ - nvidia-h200-141gb
44
+ tolerations:
45
+ - operator: "Exists"
46
+ hostNetwork: true
47
+ hostPID: true
48
+ volumes:
49
+ - name: library-dir-host
50
+ hostPath:
51
+ path: /home/kubernetes/bin/nvidia/lib64
52
+ type: DirectoryOrCreate
53
+ - name: gib
54
+ hostPath:
55
+ path: /home/kubernetes/bin/gib
56
+ initContainers:
57
+ - name: disable-log-martian
58
+ image: alpine:latest
59
+ command: ["/bin/sh"]
60
+ securityContext:
61
+ privileged: true
62
+ args:
63
+ - -c
64
+ - |
65
+ sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
66
+ sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
67
+ sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
68
+ sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
69
+ sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
70
+ sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
71
+ sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
72
+ sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
73
+ - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3
74
+ name: nccl-rdma-installer
75
+ resources:
76
+ requests:
77
+ cpu: 150m
78
+ securityContext:
79
+ privileged: true
80
+ volumeMounts:
81
+ - name: library-dir-host
82
+ mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
83
+ - name: gib
84
+ mountPath: /usr/local/home/kubernetes/bin/gib
85
+ command: ["/bin/sh", "-c"]
86
+ args:
87
+ - |
88
+ set -ex
89
+ /scripts/container_entry.sh install --install-nccl
90
+ cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
91
+ cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
92
+ echo "installation finishes"
93
+ containers:
94
+ - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
95
+ name: pause
@@ -0,0 +1,52 @@
1
+ apiVersion: apiextensions.k8s.io/v1
2
+ kind: CustomResourceDefinition
3
+ metadata:
4
+ name: storages.xpk.x-k8s.io
5
+ spec:
6
+ group: xpk.x-k8s.io
7
+ versions:
8
+ - name: v1
9
+ served: true
10
+ storage: true
11
+ schema:
12
+ openAPIV3Schema:
13
+ type: object
14
+ properties:
15
+ spec:
16
+ type: object
17
+ properties:
18
+ type:
19
+ type: string
20
+ cluster:
21
+ type: string
22
+ auto_mount:
23
+ type: boolean
24
+ mount_point:
25
+ type: string
26
+ readonly:
27
+ type: boolean
28
+ manifest:
29
+ type: string
30
+ pv:
31
+ type: string
32
+ pvc:
33
+ type: string
34
+ required:
35
+ - type
36
+ - cluster
37
+ - auto_mount
38
+ - mount_point
39
+ - readonly
40
+ - manifest
41
+ - pvc
42
+ - pv
43
+ x-kubernetes-validations:
44
+ - message: Value is immutable
45
+ rule: self == oldSelf
46
+ scope: Cluster
47
+ names:
48
+ plural: storages
49
+ singular: storage
50
+ kind: Storage
51
+ shortNames:
52
+ - stg
@@ -0,0 +1,15 @@
1
+ kind: ConfigMap
2
+ apiVersion: v1
3
+ metadata:
4
+ name: ${resource_config_name}
5
+ data:
6
+ b200-8: "${num_nodes}"
7
+ ---
8
+ kind: ConfigMap
9
+ apiVersion: v1
10
+ metadata:
11
+ name: ${cluster_config_name}
12
+ data:
13
+ capacity_type: "${capacity_type}"
14
+ reservation_id: "${reservation}"
15
+ provisioner: gcluster
@@ -0,0 +1,66 @@
1
+ apiVersion: apps/v1
2
+ kind: DaemonSet
3
+ metadata:
4
+ name: nccl-rdma-installer
5
+ namespace: kube-system
6
+ labels:
7
+ k8s-app: nccl-rdma-installer
8
+ spec:
9
+ selector:
10
+ matchLabels:
11
+ k8s-app: nccl-rdma-installer
12
+ updateStrategy:
13
+ type: RollingUpdate
14
+ template:
15
+ metadata:
16
+ labels:
17
+ name: nccl-rdma-installer
18
+ k8s-app: nccl-rdma-installer
19
+ spec:
20
+ priorityClassName: system-node-critical
21
+ affinity:
22
+ nodeAffinity:
23
+ requiredDuringSchedulingIgnoredDuringExecution:
24
+ nodeSelectorTerms:
25
+ - matchExpressions:
26
+ - key: cloud.google.com/gke-accelerator
27
+ operator: In
28
+ values:
29
+ - nvidia-b200
30
+ tolerations:
31
+ - operator: "Exists"
32
+ hostNetwork: true
33
+ hostPID: true
34
+ volumes:
35
+ - name: library-dir-host
36
+ hostPath:
37
+ path: /home/kubernetes/bin/nvidia/lib64
38
+ type: DirectoryOrCreate
39
+ - name: gib
40
+ hostPath:
41
+ path: /home/kubernetes/bin/gib
42
+ initContainers:
43
+ - image: us-docker.pkg.dev/kernel-net-team/clouda4-nccl-dev/nccl-plugin-gib-diagnostic:v1.0.3-b200
44
+ name: nccl-rdma-installer
45
+ resources:
46
+ requests:
47
+ cpu: 150m
48
+ securityContext:
49
+ privileged: true
50
+ volumeMounts:
51
+ - name: library-dir-host
52
+ mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
53
+ - name: gib
54
+ mountPath: /usr/local/home/kubernetes/bin/gib
55
+ command: ["/bin/sh", "-c"]
56
+ args:
57
+ - |
58
+ set -ex
59
+ /scripts/container_entry.sh install --install-nccl
60
+ cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
61
+ cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
62
+ # ibv_devinfo || exit 1
63
+ echo "installation finishes"
64
+ containers:
65
+ - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
66
+ name: pause
@@ -0,0 +1,52 @@
1
+ apiVersion: apiextensions.k8s.io/v1
2
+ kind: CustomResourceDefinition
3
+ metadata:
4
+ name: storages.xpk.x-k8s.io
5
+ spec:
6
+ group: xpk.x-k8s.io
7
+ versions:
8
+ - name: v1
9
+ served: true
10
+ storage: true
11
+ schema:
12
+ openAPIV3Schema:
13
+ type: object
14
+ properties:
15
+ spec:
16
+ type: object
17
+ properties:
18
+ type:
19
+ type: string
20
+ cluster:
21
+ type: string
22
+ auto_mount:
23
+ type: boolean
24
+ mount_point:
25
+ type: string
26
+ readonly:
27
+ type: boolean
28
+ manifest:
29
+ type: string
30
+ pv:
31
+ type: string
32
+ pvc:
33
+ type: string
34
+ required:
35
+ - type
36
+ - cluster
37
+ - auto_mount
38
+ - mount_point
39
+ - readonly
40
+ - manifest
41
+ - pvc
42
+ - pv
43
+ x-kubernetes-validations:
44
+ - message: Value is immutable
45
+ rule: self == oldSelf
46
+ scope: Cluster
47
+ names:
48
+ plural: storages
49
+ singular: storage
50
+ kind: Storage
51
+ shortNames:
52
+ - stg