xpk 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. integration/README.md +19 -0
  2. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  3. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  4. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  5. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  6. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  7. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  8. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  9. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  10. xpk/blueprints/a4/storage_crd.yaml +52 -0
  11. xpk/commands/cluster.py +33 -12
  12. xpk/commands/cluster_gcluster_test.py +5 -1
  13. xpk/commands/cluster_test.py +125 -0
  14. xpk/commands/config.py +3 -3
  15. xpk/commands/inspector.py +5 -3
  16. xpk/commands/kind.py +2 -0
  17. xpk/commands/managed_ml_diagnostics.py +249 -0
  18. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  19. xpk/commands/workload.py +125 -139
  20. xpk/commands/workload_test.py +160 -118
  21. xpk/core/blueprint/blueprint_generator.py +3 -0
  22. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  23. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  24. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  25. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  26. xpk/core/capacity.py +2 -0
  27. xpk/core/cluster.py +18 -47
  28. xpk/core/cluster_test.py +76 -1
  29. xpk/core/config.py +81 -7
  30. xpk/core/config_test.py +67 -11
  31. xpk/core/docker_container.py +3 -1
  32. xpk/core/docker_image.py +10 -6
  33. xpk/core/docker_resources.py +1 -10
  34. xpk/core/kjob.py +17 -16
  35. xpk/core/kueue_manager.py +13 -19
  36. xpk/core/kueue_manager_test.py +27 -1
  37. xpk/core/nap.py +13 -14
  38. xpk/core/nodepool.py +17 -15
  39. xpk/core/nodepool_test.py +25 -4
  40. xpk/core/pathways.py +23 -0
  41. xpk/core/pathways_test.py +57 -0
  42. xpk/core/resources.py +84 -27
  43. xpk/core/scheduling.py +128 -132
  44. xpk/core/scheduling_test.py +215 -2
  45. xpk/core/system_characteristics.py +179 -0
  46. xpk/core/system_characteristics_test.py +49 -1
  47. xpk/core/telemetry.py +4 -4
  48. xpk/core/telemetry_test.py +9 -9
  49. xpk/core/vertex.py +4 -3
  50. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  51. xpk/main.py +2 -0
  52. xpk/parser/cluster.py +22 -88
  53. xpk/parser/cluster_test.py +41 -0
  54. xpk/parser/common.py +84 -0
  55. xpk/parser/storage.py +10 -0
  56. xpk/parser/storage_test.py +47 -0
  57. xpk/parser/workload.py +14 -41
  58. xpk/parser/workload_test.py +2 -48
  59. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  60. xpk/utils/feature_flags.py +3 -0
  61. xpk/utils/validation.py +2 -2
  62. xpk-0.16.1.dist-info/METADATA +127 -0
  63. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/RECORD +67 -48
  64. xpk-0.15.0.dist-info/METADATA +0 -1666
  65. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/WHEEL +0 -0
  66. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/top_level.txt +0 -0
integration/README.md ADDED
@@ -0,0 +1,19 @@
1
+ This folder contains integration tests.
2
+
3
+ To run them, the following environment variables are needed:
4
+
5
+ ```bash
6
+ export PROJECT_ID=...
7
+ export REGION=...
8
+ export ZONE=...
9
+ export AUTH_CIDR=...
10
+ export DEPLOYMENT_DIR=...
11
+ export CLUSTER_NAME=...
12
+ export GCLOUD_CFG_PATH=...
13
+ ```
14
+
15
+ To run tests:
16
+
17
+ ```bash
18
+ pytest src/integration
19
+ ```
xpk/blueprints/a3mega/config-map.yaml.tftpl ADDED
@@ -0,0 +1,15 @@
1
+ kind: ConfigMap
2
+ apiVersion: v1
3
+ metadata:
4
+ name: ${resource_config_name}
5
+ data:
6
+ h100-mega-80gb-8: "${num_nodes}"
7
+ ---
8
+ kind: ConfigMap
9
+ apiVersion: v1
10
+ metadata:
11
+ name: ${cluster_config_name}
12
+ data:
13
+ capacity_type: "${capacity_type}"
14
+ reservation_id: "${reservation}"
15
+ provisioner: gcluster
xpk/blueprints/a3mega/storage_crd.yaml ADDED
@@ -0,0 +1,52 @@
1
+ apiVersion: apiextensions.k8s.io/v1
2
+ kind: CustomResourceDefinition
3
+ metadata:
4
+ name: storages.xpk.x-k8s.io
5
+ spec:
6
+ group: xpk.x-k8s.io
7
+ versions:
8
+ - name: v1
9
+ served: true
10
+ storage: true
11
+ schema:
12
+ openAPIV3Schema:
13
+ type: object
14
+ properties:
15
+ spec:
16
+ type: object
17
+ properties:
18
+ type:
19
+ type: string
20
+ cluster:
21
+ type: string
22
+ auto_mount:
23
+ type: boolean
24
+ mount_point:
25
+ type: string
26
+ readonly:
27
+ type: boolean
28
+ manifest:
29
+ type: string
30
+ pv:
31
+ type: string
32
+ pvc:
33
+ type: string
34
+ required:
35
+ - type
36
+ - cluster
37
+ - auto_mount
38
+ - mount_point
39
+ - readonly
40
+ - manifest
41
+ - pvc
42
+ - pv
43
+ x-kubernetes-validations:
44
+ - message: Value is immutable
45
+ rule: self == oldSelf
46
+ scope: Cluster
47
+ names:
48
+ plural: storages
49
+ singular: storage
50
+ kind: Storage
51
+ shortNames:
52
+ - stg
xpk/blueprints/a3ultra/config-map.yaml.tftpl ADDED
@@ -0,0 +1,15 @@
1
+ kind: ConfigMap
2
+ apiVersion: v1
3
+ metadata:
4
+ name: ${resource_config_name}
5
+ data:
6
+ h200-141gb-8: "${num_nodes}"
7
+ ---
8
+ kind: ConfigMap
9
+ apiVersion: v1
10
+ metadata:
11
+ name: ${cluster_config_name}
12
+ data:
13
+ capacity_type: "${capacity_type}"
14
+ reservation_id: "${reservation}"
15
+ provisioner: gcluster
xpk/blueprints/a3ultra/mlgru-disable.yaml ADDED
@@ -0,0 +1,59 @@
1
+ # Copyright 2024 Google Inc. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ apiVersion: apps/v1
16
+ kind: DaemonSet
17
+ metadata:
18
+ name: disable-mglru
19
+ namespace: kube-system
20
+ spec:
21
+ selector:
22
+ matchLabels:
23
+ app: disable-mglru
24
+ template:
25
+ metadata:
26
+ labels:
27
+ app: disable-mglru
28
+ spec:
29
+ hostNetwork: true
30
+ tolerations:
31
+ - operator: "Exists"
32
+ key: nvidia.com/gpu
33
+ containers:
34
+ - name: disable-mglru
35
+ image: alpine:latest
36
+ command: ["/bin/sh"]
37
+ securityContext:
38
+ privileged: true
39
+ args:
40
+ - -c
41
+ - |
42
+ echo n | tee /sys/kernel/mm/lru_gen/enabled
43
+ sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
44
+ sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
45
+ sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
46
+ sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
47
+ sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
48
+ sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
49
+ sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
50
+ sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
51
+ sleep infinity
52
+ volumeMounts:
53
+ - name: sys-kernel-mm-lru-gen
54
+ mountPath: /sys/kernel/mm/lru_gen
55
+ # Remount sysfs so that it will be writable.
56
+ volumes:
57
+ - name: sys-kernel-mm-lru-gen
58
+ hostPath:
59
+ path: /sys/kernel/mm/lru_gen
xpk/blueprints/a3ultra/nccl-installer.yaml ADDED
@@ -0,0 +1,95 @@
1
+ # Copyright 2024 Google Inc. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ apiVersion: apps/v1
16
+ kind: DaemonSet
17
+ metadata:
18
+ name: nccl-rdma-installer
19
+ namespace: kube-system
20
+ labels:
21
+ k8s-app: nccl-rdma-installer
22
+ spec:
23
+ selector:
24
+ matchLabels:
25
+ k8s-app: nccl-rdma-installer
26
+ updateStrategy:
27
+ type: RollingUpdate
28
+ template:
29
+ metadata:
30
+ labels:
31
+ name: nccl-rdma-installer
32
+ k8s-app: nccl-rdma-installer
33
+ spec:
34
+ priorityClassName: system-node-critical
35
+ affinity:
36
+ nodeAffinity:
37
+ requiredDuringSchedulingIgnoredDuringExecution:
38
+ nodeSelectorTerms:
39
+ - matchExpressions:
40
+ - key: cloud.google.com/gke-accelerator
41
+ operator: In
42
+ values:
43
+ - nvidia-h200-141gb
44
+ tolerations:
45
+ - operator: "Exists"
46
+ hostNetwork: true
47
+ hostPID: true
48
+ volumes:
49
+ - name: library-dir-host
50
+ hostPath:
51
+ path: /home/kubernetes/bin/nvidia/lib64
52
+ type: DirectoryOrCreate
53
+ - name: gib
54
+ hostPath:
55
+ path: /home/kubernetes/bin/gib
56
+ initContainers:
57
+ - name: disable-log-martian
58
+ image: alpine:latest
59
+ command: ["/bin/sh"]
60
+ securityContext:
61
+ privileged: true
62
+ args:
63
+ - -c
64
+ - |
65
+ sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
66
+ sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
67
+ sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
68
+ sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
69
+ sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
70
+ sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
71
+ sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
72
+ sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
73
+ - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3
74
+ name: nccl-rdma-installer
75
+ resources:
76
+ requests:
77
+ cpu: 150m
78
+ securityContext:
79
+ privileged: true
80
+ volumeMounts:
81
+ - name: library-dir-host
82
+ mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
83
+ - name: gib
84
+ mountPath: /usr/local/home/kubernetes/bin/gib
85
+ command: ["/bin/sh", "-c"]
86
+ args:
87
+ - |
88
+ set -ex
89
+ /scripts/container_entry.sh install --install-nccl
90
+ cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
91
+ cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
92
+ echo "installation finishes"
93
+ containers:
94
+ - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
95
+ name: pause
xpk/blueprints/a3ultra/storage_crd.yaml ADDED
@@ -0,0 +1,52 @@
1
+ apiVersion: apiextensions.k8s.io/v1
2
+ kind: CustomResourceDefinition
3
+ metadata:
4
+ name: storages.xpk.x-k8s.io
5
+ spec:
6
+ group: xpk.x-k8s.io
7
+ versions:
8
+ - name: v1
9
+ served: true
10
+ storage: true
11
+ schema:
12
+ openAPIV3Schema:
13
+ type: object
14
+ properties:
15
+ spec:
16
+ type: object
17
+ properties:
18
+ type:
19
+ type: string
20
+ cluster:
21
+ type: string
22
+ auto_mount:
23
+ type: boolean
24
+ mount_point:
25
+ type: string
26
+ readonly:
27
+ type: boolean
28
+ manifest:
29
+ type: string
30
+ pv:
31
+ type: string
32
+ pvc:
33
+ type: string
34
+ required:
35
+ - type
36
+ - cluster
37
+ - auto_mount
38
+ - mount_point
39
+ - readonly
40
+ - manifest
41
+ - pvc
42
+ - pv
43
+ x-kubernetes-validations:
44
+ - message: Value is immutable
45
+ rule: self == oldSelf
46
+ scope: Cluster
47
+ names:
48
+ plural: storages
49
+ singular: storage
50
+ kind: Storage
51
+ shortNames:
52
+ - stg
xpk/blueprints/a4/config-map.yaml.tftpl ADDED
@@ -0,0 +1,15 @@
1
+ kind: ConfigMap
2
+ apiVersion: v1
3
+ metadata:
4
+ name: ${resource_config_name}
5
+ data:
6
+ b200-8: "${num_nodes}"
7
+ ---
8
+ kind: ConfigMap
9
+ apiVersion: v1
10
+ metadata:
11
+ name: ${cluster_config_name}
12
+ data:
13
+ capacity_type: "${capacity_type}"
14
+ reservation_id: "${reservation}"
15
+ provisioner: gcluster
xpk/blueprints/a4/nccl-rdma-installer-a4.yaml ADDED
@@ -0,0 +1,66 @@
1
+ apiVersion: apps/v1
2
+ kind: DaemonSet
3
+ metadata:
4
+ name: nccl-rdma-installer
5
+ namespace: kube-system
6
+ labels:
7
+ k8s-app: nccl-rdma-installer
8
+ spec:
9
+ selector:
10
+ matchLabels:
11
+ k8s-app: nccl-rdma-installer
12
+ updateStrategy:
13
+ type: RollingUpdate
14
+ template:
15
+ metadata:
16
+ labels:
17
+ name: nccl-rdma-installer
18
+ k8s-app: nccl-rdma-installer
19
+ spec:
20
+ priorityClassName: system-node-critical
21
+ affinity:
22
+ nodeAffinity:
23
+ requiredDuringSchedulingIgnoredDuringExecution:
24
+ nodeSelectorTerms:
25
+ - matchExpressions:
26
+ - key: cloud.google.com/gke-accelerator
27
+ operator: In
28
+ values:
29
+ - nvidia-b200
30
+ tolerations:
31
+ - operator: "Exists"
32
+ hostNetwork: true
33
+ hostPID: true
34
+ volumes:
35
+ - name: library-dir-host
36
+ hostPath:
37
+ path: /home/kubernetes/bin/nvidia/lib64
38
+ type: DirectoryOrCreate
39
+ - name: gib
40
+ hostPath:
41
+ path: /home/kubernetes/bin/gib
42
+ initContainers:
43
+ - image: us-docker.pkg.dev/kernel-net-team/clouda4-nccl-dev/nccl-plugin-gib-diagnostic:v1.0.3-b200
44
+ name: nccl-rdma-installer
45
+ resources:
46
+ requests:
47
+ cpu: 150m
48
+ securityContext:
49
+ privileged: true
50
+ volumeMounts:
51
+ - name: library-dir-host
52
+ mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
53
+ - name: gib
54
+ mountPath: /usr/local/home/kubernetes/bin/gib
55
+ command: ["/bin/sh", "-c"]
56
+ args:
57
+ - |
58
+ set -ex
59
+ /scripts/container_entry.sh install --install-nccl
60
+ cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
61
+ cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
62
+ # ibv_devinfo || exit 1
63
+ echo "installation finishes"
64
+ containers:
65
+ - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
66
+ name: pause
xpk/blueprints/a4/storage_crd.yaml ADDED
@@ -0,0 +1,52 @@
1
+ apiVersion: apiextensions.k8s.io/v1
2
+ kind: CustomResourceDefinition
3
+ metadata:
4
+ name: storages.xpk.x-k8s.io
5
+ spec:
6
+ group: xpk.x-k8s.io
7
+ versions:
8
+ - name: v1
9
+ served: true
10
+ storage: true
11
+ schema:
12
+ openAPIV3Schema:
13
+ type: object
14
+ properties:
15
+ spec:
16
+ type: object
17
+ properties:
18
+ type:
19
+ type: string
20
+ cluster:
21
+ type: string
22
+ auto_mount:
23
+ type: boolean
24
+ mount_point:
25
+ type: string
26
+ readonly:
27
+ type: boolean
28
+ manifest:
29
+ type: string
30
+ pv:
31
+ type: string
32
+ pvc:
33
+ type: string
34
+ required:
35
+ - type
36
+ - cluster
37
+ - auto_mount
38
+ - mount_point
39
+ - readonly
40
+ - manifest
41
+ - pvc
42
+ - pv
43
+ x-kubernetes-validations:
44
+ - message: Value is immutable
45
+ rule: self == oldSelf
46
+ scope: Cluster
47
+ names:
48
+ plural: storages
49
+ singular: storage
50
+ kind: Storage
51
+ shortNames:
52
+ - stg
xpk/commands/cluster.py CHANGED
@@ -18,7 +18,8 @@ from tabulate import tabulate
18
18
 
19
19
  from ..utils.feature_flags import FeatureFlags
20
20
  from ..utils.versions import ReleaseChannel
21
- from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE, get_reservation_deployment_type
21
+ from ..core.pathways import get_pathways_machine_types
22
+ from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type
22
23
  from ..core.cluster import (
23
24
  get_all_clusters_programmatic,
24
25
  get_cluster_credentials,
@@ -27,7 +28,6 @@ from ..core.cluster import (
27
28
  set_jobset_on_cluster,
28
29
  set_pathways_job_on_cluster,
29
30
  setup_k8s_env,
30
- disable_mglru_on_cluster,
31
31
  count_nodes_on_cluster,
32
32
  update_cluster_with_gcpfilestore_driver_if_necessary,
33
33
  update_cluster_with_gcsfuse_driver_if_necessary,
@@ -84,6 +84,7 @@ from jinja2 import Environment, FileSystemLoader
84
84
  from ..utils.templates import get_templates_absolute_path
85
85
  import shutil
86
86
  import os
87
+ from .managed_ml_diagnostics import install_mldiagnostics_prerequisites
87
88
 
88
89
  CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
89
90
 
@@ -210,6 +211,25 @@ def _validate_cluster_create_args(args, system: SystemCharacteristics):
210
211
  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
211
212
  validate_sub_slicing_system(system)
212
213
  _validate_sub_slicing_reservation(args)
214
+ if args.enable_pathways:
215
+ _validate_pathways_machine(args)
216
+
217
+
218
+ def _validate_pathways_machine(args):
219
+ return_code, result = get_pathways_machine_types(
220
+ project=args.project, zone=args.zone
221
+ )
222
+ if return_code != 0:
223
+ xpk_print('Error: Unable to retrieve available pathways machine types')
224
+ xpk_exit(1)
225
+
226
+ if args.pathways_gce_machine_type not in result:
227
+ xpk_print(
228
+ 'Error: Invalid --pathways-gce-machine-type. Specify machine type that'
229
+ ' has at least 100GB of memory and at least 49 CPUs.'
230
+ )
231
+ xpk_print(f'Available machine types: {", ".join(result)}')
232
+ xpk_exit(1)
213
233
 
214
234
 
215
235
  def _validate_sub_slicing_reservation(args):
@@ -261,11 +281,10 @@ def cluster_create(args) -> None:
261
281
  xpk_print('Fetching system characteristics failed!')
262
282
  xpk_exit(return_code)
263
283
 
264
- _validate_cluster_create_args(args, system)
265
-
266
284
  xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
267
285
  add_zone_and_project(args)
268
286
 
287
+ _validate_cluster_create_args(args, system)
269
288
  _log_cluster_create_telemetry(args)
270
289
 
271
290
  release_channel = (
@@ -422,6 +441,13 @@ def cluster_create(args) -> None:
422
441
  # pylint: disable=line-too-long
423
442
  f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
424
443
  )
444
+
445
+ if args.managed_mldiagnostics:
446
+ return_code = install_mldiagnostics_prerequisites()
447
+ if return_code != 0:
448
+ xpk_print('Installation of MLDiagnostics failed.')
449
+ xpk_exit(return_code)
450
+
425
451
  xpk_exit(0)
426
452
 
427
453
 
@@ -979,7 +1005,7 @@ def update_coredns() -> int:
979
1005
 
980
1006
  # 6. Scale up coredns and verify readiness
981
1007
  scale_up_coredns(replicas=15)
982
- verify_coredns_readiness(timeout=120)
1008
+ verify_coredns_readiness()
983
1009
 
984
1010
  xpk_print('The CoreDNS setup process has been completed.')
985
1011
 
@@ -1220,7 +1246,8 @@ def run_gke_cluster_create_command(
1220
1246
 
1221
1247
  if args.enable_lustre_csi_driver:
1222
1248
  addons.append('LustreCsiDriver')
1223
- command += ' --enable-legacy-lustre-port'
1249
+ if args.enable_legacy_lustre_port:
1250
+ command += ' --enable-legacy-lustre-port'
1224
1251
 
1225
1252
  if hasattr(args, 'enable_mtc') and args.enable_mtc:
1226
1253
  addons.append('HighScaleCheckpointing')
@@ -1336,12 +1363,6 @@ def prepare_gpus(system: SystemCharacteristics):
1336
1363
  if install_nri_code != 0:
1337
1364
  xpk_exit(install_nri_code)
1338
1365
 
1339
- if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
1340
- xpk_print('Disabling MGLRU')
1341
- err_code = disable_mglru_on_cluster()
1342
- if err_code > 0:
1343
- xpk_exit(err_code)
1344
-
1345
1366
 
1346
1367
  def _log_cluster_create_telemetry(args) -> None:
1347
1368
  if FeatureFlags.TELEMETRY_ENABLED:
xpk/commands/cluster_gcluster_test.py CHANGED
@@ -20,7 +20,7 @@ import pytest
20
20
 
21
21
  from xpk.commands.cluster_gcluster import cluster_create
22
22
  from xpk.core.kueue_manager import KueueConfig
23
- from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
23
+ from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
24
24
  from xpk.utils.versions import ReleaseChannel
25
25
 
26
26
 
@@ -97,6 +97,8 @@ def test_install_kueue_standard(
97
97
  accelerator_type=AcceleratorType.GPU,
98
98
  device_type="h100-mega-80gb-8",
99
99
  supports_sub_slicing=False,
100
+ docker_platform=DockerPlatform.ARM,
101
+ gpu_config=GpuConfig(requires_topology=True),
100
102
  )
101
103
  mock_cluster_create_deps["get_system_characteristics"].return_value = (
102
104
  mock_system,
@@ -148,6 +150,8 @@ def test_install_kueue_with_autoprovisioning(
148
150
  accelerator_type=AcceleratorType.GPU,
149
151
  device_type="h100-mega-80gb-8",
150
152
  supports_sub_slicing=False,
153
+ docker_platform=DockerPlatform.ARM,
154
+ gpu_config=GpuConfig(requires_topology=True),
151
155
  )
152
156
  mock_cluster_create_deps["get_system_characteristics"].return_value = (
153
157
  mock_system,