xpk 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +33 -12
- xpk/commands/cluster_gcluster_test.py +5 -1
- xpk/commands/cluster_test.py +125 -0
- xpk/commands/config.py +3 -3
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +2 -0
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/workload.py +125 -139
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +3 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +2 -0
- xpk/core/cluster.py +18 -47
- xpk/core/cluster_test.py +76 -1
- xpk/core/config.py +81 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/kjob.py +17 -16
- xpk/core/kueue_manager.py +13 -19
- xpk/core/kueue_manager_test.py +27 -1
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +17 -15
- xpk/core/nodepool_test.py +25 -4
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +128 -132
- xpk/core/scheduling_test.py +215 -2
- xpk/core/system_characteristics.py +179 -0
- xpk/core/system_characteristics_test.py +49 -1
- xpk/core/telemetry.py +4 -4
- xpk/core/telemetry_test.py +9 -9
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +2 -0
- xpk/parser/cluster.py +22 -88
- xpk/parser/cluster_test.py +41 -0
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -41
- xpk/parser/workload_test.py +2 -48
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/utils/feature_flags.py +3 -0
- xpk/utils/validation.py +2 -2
- xpk-0.16.1.dist-info/METADATA +127 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/RECORD +67 -48
- xpk-0.15.0.dist-info/METADATA +0 -1666
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/WHEEL +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Copyright 2024 "Google LLC"
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
!Blueprint
|
|
15
|
+
blueprint_name: xpk-gke-a3-ultra
|
|
16
|
+
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
|
|
17
|
+
toolkit_modules_version: v1.62.2
|
|
18
|
+
|
|
19
|
+
vars:
|
|
20
|
+
labels: {gke_product_type: xpk}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
terraform_backend_defaults:
|
|
24
|
+
type: gcs
|
|
25
|
+
configuration:
|
|
26
|
+
bucket: test-bucket
|
|
27
|
+
prefix: xpk_terraform_state/testdir/gke-a3-ultra/
|
|
28
|
+
|
|
29
|
+
deployment_groups:
|
|
30
|
+
- !DeploymentGroup
|
|
31
|
+
group: primary
|
|
32
|
+
modules:
|
|
33
|
+
- !DeploymentModule
|
|
34
|
+
id: gke-a3-ultra-net-0
|
|
35
|
+
source: modules/network/vpc
|
|
36
|
+
settings:
|
|
37
|
+
network_name: gke-a3-ultra-net-0
|
|
38
|
+
subnetworks:
|
|
39
|
+
- subnet_name: gke-a3-ultra-sub-0
|
|
40
|
+
subnet_region: us-central1
|
|
41
|
+
subnet_ip: 192.168.0.0/18
|
|
42
|
+
secondary_ranges_list:
|
|
43
|
+
- subnetwork_name : gke-a3-ultra-sub-0
|
|
44
|
+
ranges:
|
|
45
|
+
- range_name: pods
|
|
46
|
+
ip_cidr_range: 10.4.0.0/14
|
|
47
|
+
- range_name: services
|
|
48
|
+
ip_cidr_range: 10.0.32.0/20
|
|
49
|
+
firewall_rules:
|
|
50
|
+
- name: gke-a3-ultra-internal-0
|
|
51
|
+
ranges: [192.168.0.0/16]
|
|
52
|
+
allow:
|
|
53
|
+
- protocol: tcp
|
|
54
|
+
ports: ["0-65535"]
|
|
55
|
+
- protocol: udp
|
|
56
|
+
ports: ["0-65535"]
|
|
57
|
+
- protocol: icmp
|
|
58
|
+
|
|
59
|
+
- !DeploymentModule
|
|
60
|
+
id: gke-a3-ultra-net-1
|
|
61
|
+
source: modules/network/vpc
|
|
62
|
+
settings:
|
|
63
|
+
network_name: gke-a3-ultra-net-1
|
|
64
|
+
mtu: 8896
|
|
65
|
+
subnetworks:
|
|
66
|
+
- subnet_name: gke-a3-ultra-sub-1
|
|
67
|
+
subnet_region: us-central1
|
|
68
|
+
subnet_ip: 192.168.64.0/18
|
|
69
|
+
firewall_rules:
|
|
70
|
+
- name: gke-a3-ultra-internal-1
|
|
71
|
+
ranges: [192.168.0.0/16]
|
|
72
|
+
allow:
|
|
73
|
+
- protocol: tcp
|
|
74
|
+
ports: ["0-65535"]
|
|
75
|
+
- protocol: udp
|
|
76
|
+
ports: ["0-65535"]
|
|
77
|
+
- protocol: icmp
|
|
78
|
+
|
|
79
|
+
- !DeploymentModule
|
|
80
|
+
id: gke-a3-ultra-rdma-net
|
|
81
|
+
source: modules/network/gpu-rdma-vpc
|
|
82
|
+
settings:
|
|
83
|
+
network_name: gke-a3-ultra-rdma-net
|
|
84
|
+
mtu: 8896
|
|
85
|
+
network_profile: https://www.googleapis.com/compute/beta/projects/foo/global/networkProfiles/us-central1-c-vpc-roce
|
|
86
|
+
network_routing_mode: REGIONAL
|
|
87
|
+
subnetworks_template:
|
|
88
|
+
name_prefix: gke-a3-ultra-rdma-sub
|
|
89
|
+
count: 8
|
|
90
|
+
ip_range: 192.168.128.0/18
|
|
91
|
+
region: us-central1
|
|
92
|
+
|
|
93
|
+
- !DeploymentModule
|
|
94
|
+
id: gke-a3-ultra-a3-ultragpu-cluster
|
|
95
|
+
source: modules/scheduler/gke-cluster
|
|
96
|
+
use: [gke-a3-ultra-net-0]
|
|
97
|
+
settings:
|
|
98
|
+
release_channel: RAPID
|
|
99
|
+
version_prefix: '1.2'
|
|
100
|
+
min_cluster_version: 1.2.3
|
|
101
|
+
prefix_with_deployment_name: false
|
|
102
|
+
name_suffix: gke-a3-ultra
|
|
103
|
+
system_node_pool_machine_type: "e2-standard-16"
|
|
104
|
+
enable_dcgm_monitoring: true
|
|
105
|
+
enable_gcsfuse_csi: true
|
|
106
|
+
enable_filestore_csi: true
|
|
107
|
+
enable_private_endpoint: false # Allows access from authorized public IPs
|
|
108
|
+
master_authorized_networks:
|
|
109
|
+
- cidr_block: 10.0.0.0/32 # Allows your machine to run the kubectl command. Required for multi network setup.
|
|
110
|
+
display_name: "kubectl-access-network"
|
|
111
|
+
system_node_pool_node_count:
|
|
112
|
+
total_min_nodes: 2
|
|
113
|
+
total_max_nodes: 1000
|
|
114
|
+
additional_networks: $(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
|
|
115
|
+
k8s_network_names:
|
|
116
|
+
rdma_prefix: "gke-a3-ultra-rdma-sub-"
|
|
117
|
+
rdma_start_index: 0
|
|
118
|
+
rdma_postfix: ""
|
|
119
|
+
gvnic_prefix: "gke-a3-ultra-sub-"
|
|
120
|
+
gvnic_start_index: 1
|
|
121
|
+
outputs: [instructions]
|
|
122
|
+
|
|
123
|
+
- !DeploymentModule
|
|
124
|
+
id: gke-a3-ultra-a3u-pool
|
|
125
|
+
source: modules/compute/gke-node-pool
|
|
126
|
+
use: [gke-a3-ultra-a3-ultragpu-cluster]
|
|
127
|
+
settings:
|
|
128
|
+
machine_type: a3-ultragpu-8g
|
|
129
|
+
auto_upgrade: true
|
|
130
|
+
zones: [us-central1-c]
|
|
131
|
+
spot: false
|
|
132
|
+
max_pods_per_node: 32
|
|
133
|
+
guest_accelerator:
|
|
134
|
+
- type: nvidia-h200-141gb
|
|
135
|
+
count: 8
|
|
136
|
+
gpu_driver_installation_config:
|
|
137
|
+
gpu_driver_version: "LATEST"
|
|
138
|
+
additional_networks:
|
|
139
|
+
$(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
|
|
140
|
+
reservation_affinity:
|
|
141
|
+
consume_reservation_type: SPECIFIC_RESERVATION
|
|
142
|
+
specific_reservations:
|
|
143
|
+
- name: test-reservation
|
|
144
|
+
static_node_count: 2
|
|
145
|
+
outputs: [instructions]
|
|
146
|
+
|
|
147
|
+
- !DeploymentModule
|
|
148
|
+
id: workload-manager-install
|
|
149
|
+
source: modules/management/kubectl-apply
|
|
150
|
+
use: [gke-a3-ultra-a3-ultragpu-cluster]
|
|
151
|
+
settings:
|
|
152
|
+
jobset:
|
|
153
|
+
install: true
|
|
154
|
+
version: v0.7.2
|
|
155
|
+
apply_manifests:
|
|
156
|
+
- source: $(ghpc_stage("xpk-gke-a3-ultra"))/nccl-installer.yaml
|
|
157
|
+
- source: $(ghpc_stage("xpk-gke-a3-ultra"))/mlgru-disable.yaml
|
|
158
|
+
- source: $(ghpc_stage("xpk-gke-a3-ultra"))/storage_crd.yaml
|
|
159
|
+
|
|
160
|
+
- !DeploymentModule
|
|
161
|
+
id: workload_configmap
|
|
162
|
+
source: 'modules/management/kubectl-apply'
|
|
163
|
+
use: ['gke-a3-ultra-a3-ultragpu-cluster']
|
|
164
|
+
settings:
|
|
165
|
+
apply_manifests:
|
|
166
|
+
- source: '$(ghpc_stage("xpk-gke-a3-ultra"))/config-map.yaml.tftpl'
|
|
167
|
+
template_vars: {
|
|
168
|
+
resource_config_name: "gke-a3-ultra-resources-configmap",
|
|
169
|
+
num_nodes: "2",
|
|
170
|
+
cluster_config_name: "gke-a3-ultra-metadata-configmap",
|
|
171
|
+
capacity_type: "reservation",
|
|
172
|
+
reservation: "test-reservation",
|
|
173
|
+
}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# Copyright 2024 "Google LLC"
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
!Blueprint
|
|
15
|
+
blueprint_name: xpk-gke-a4
|
|
16
|
+
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
|
|
17
|
+
toolkit_modules_version: v1.62.2
|
|
18
|
+
|
|
19
|
+
vars:
|
|
20
|
+
labels: {gke_product_type: xpk}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
terraform_backend_defaults:
|
|
24
|
+
type: gcs
|
|
25
|
+
configuration:
|
|
26
|
+
bucket: test-bucket
|
|
27
|
+
prefix: xpk_terraform_state/testdir/gke-a4/
|
|
28
|
+
|
|
29
|
+
deployment_groups:
|
|
30
|
+
- !DeploymentGroup
|
|
31
|
+
modules:
|
|
32
|
+
- !DeploymentModule
|
|
33
|
+
id: gke-a4-net-0
|
|
34
|
+
source: modules/network/vpc
|
|
35
|
+
settings:
|
|
36
|
+
network_name: gke-a4-net-0
|
|
37
|
+
mtu: 8896
|
|
38
|
+
subnetworks:
|
|
39
|
+
- subnet_name: gke-a4-sub-0
|
|
40
|
+
subnet_region: us-central1
|
|
41
|
+
subnet_ip: 192.168.0.0/18
|
|
42
|
+
secondary_ranges_list:
|
|
43
|
+
- subnetwork_name: gke-a4-sub-0
|
|
44
|
+
ranges:
|
|
45
|
+
- range_name: pods
|
|
46
|
+
ip_cidr_range: 10.4.0.0/14
|
|
47
|
+
- range_name: services
|
|
48
|
+
ip_cidr_range: 10.0.32.0/20
|
|
49
|
+
firewall_rules:
|
|
50
|
+
- name: gke-a4-internal-0
|
|
51
|
+
ranges:
|
|
52
|
+
- 192.168.0.0/16
|
|
53
|
+
allow:
|
|
54
|
+
- protocol: tcp
|
|
55
|
+
ports:
|
|
56
|
+
- 0-65535
|
|
57
|
+
- protocol: udp
|
|
58
|
+
ports:
|
|
59
|
+
- 0-65535
|
|
60
|
+
- protocol: icmp
|
|
61
|
+
|
|
62
|
+
- !DeploymentModule
|
|
63
|
+
id: gke-a4-net-1
|
|
64
|
+
source: modules/network/vpc
|
|
65
|
+
settings:
|
|
66
|
+
network_name: gke-a4-net-1
|
|
67
|
+
mtu: 8896
|
|
68
|
+
subnetworks:
|
|
69
|
+
- subnet_name: gke-a4-sub-1
|
|
70
|
+
subnet_region: us-central1
|
|
71
|
+
subnet_ip: 192.168.64.0/18
|
|
72
|
+
firewall_rules:
|
|
73
|
+
- name: gke-a4-internal-1
|
|
74
|
+
ranges:
|
|
75
|
+
- 192.168.0.0/16
|
|
76
|
+
allow:
|
|
77
|
+
- protocol: tcp
|
|
78
|
+
ports:
|
|
79
|
+
- 0-65535
|
|
80
|
+
- protocol: udp
|
|
81
|
+
ports:
|
|
82
|
+
- 0-65535
|
|
83
|
+
- protocol: icmp
|
|
84
|
+
|
|
85
|
+
- !DeploymentModule
|
|
86
|
+
id: gke-a4-rdma-net
|
|
87
|
+
source: modules/network/gpu-rdma-vpc
|
|
88
|
+
settings:
|
|
89
|
+
network_name: gke-a4-rdma-net
|
|
90
|
+
mtu: 8896
|
|
91
|
+
network_profile:
|
|
92
|
+
https://www.googleapis.com/compute/beta/projects/foo/global/networkProfiles/us-central1-c-vpc-roce
|
|
93
|
+
network_routing_mode: REGIONAL
|
|
94
|
+
subnetworks_template:
|
|
95
|
+
name_prefix: gke-a4-rdma-sub
|
|
96
|
+
count: 8
|
|
97
|
+
ip_range: 192.168.128.0/18
|
|
98
|
+
region: us-central1
|
|
99
|
+
|
|
100
|
+
- !DeploymentModule
|
|
101
|
+
id: gke-a4-a4-cluster
|
|
102
|
+
source: modules/scheduler/gke-cluster
|
|
103
|
+
outputs:
|
|
104
|
+
- instructions
|
|
105
|
+
settings:
|
|
106
|
+
system_node_pool_machine_type: e2-standard-16
|
|
107
|
+
system_node_pool_node_count:
|
|
108
|
+
total_min_nodes: 2
|
|
109
|
+
total_max_nodes: 1000
|
|
110
|
+
prefix_with_deployment_name: false
|
|
111
|
+
name_suffix: gke-a4
|
|
112
|
+
enable_dcgm_monitoring: true
|
|
113
|
+
enable_gcsfuse_csi: true
|
|
114
|
+
enable_private_endpoint: false
|
|
115
|
+
master_authorized_networks:
|
|
116
|
+
- cidr_block: 10.0.0.0/32
|
|
117
|
+
display_name: kubectl-access-network
|
|
118
|
+
additional_networks: $(concat([{network=gke-a4-net-1.network_name, subnetwork=gke-a4-net-1.subnetwork_name,
|
|
119
|
+
subnetwork_project="foo", nic_type="GVNIC", queue_count=null,
|
|
120
|
+
network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null,
|
|
121
|
+
network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a4-rdma-net.subnetwork_interfaces_gke))
|
|
122
|
+
release_channel: RAPID
|
|
123
|
+
version_prefix: '1.2'
|
|
124
|
+
min_cluster_version: 1.2.3
|
|
125
|
+
use:
|
|
126
|
+
- gke-a4-net-0
|
|
127
|
+
- !DeploymentModule
|
|
128
|
+
id: gke-a4-a4-pool
|
|
129
|
+
source: modules/compute/gke-node-pool
|
|
130
|
+
use:
|
|
131
|
+
- gke-a4-a4-cluster
|
|
132
|
+
outputs:
|
|
133
|
+
- instructions
|
|
134
|
+
settings:
|
|
135
|
+
machine_type: a4-highgpu-8g
|
|
136
|
+
auto_upgrade: true
|
|
137
|
+
zones:
|
|
138
|
+
- us-central1-c
|
|
139
|
+
disk_type: hyperdisk-balanced
|
|
140
|
+
local_ssd_count_ephemeral_storage: 32
|
|
141
|
+
spot: false
|
|
142
|
+
reservation_affinity:
|
|
143
|
+
consume_reservation_type: SPECIFIC_RESERVATION
|
|
144
|
+
specific_reservations:
|
|
145
|
+
- name: test-reservation
|
|
146
|
+
max_pods_per_node: 32
|
|
147
|
+
guest_accelerator:
|
|
148
|
+
- type: nvidia-b200
|
|
149
|
+
count: 8
|
|
150
|
+
gpu_driver_installation_config:
|
|
151
|
+
gpu_driver_version: LATEST
|
|
152
|
+
additional_networks: $(concat([{network=gke-a4-net-1.network_name, subnetwork=gke-a4-net-1.subnetwork_name,
|
|
153
|
+
subnetwork_project="foo", nic_type="GVNIC", queue_count=null,
|
|
154
|
+
network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null,
|
|
155
|
+
network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a4-rdma-net.subnetwork_interfaces_gke))
|
|
156
|
+
static_node_count: 2
|
|
157
|
+
|
|
158
|
+
- !DeploymentModule
|
|
159
|
+
id: workload-manager-install
|
|
160
|
+
source: modules/management/kubectl-apply
|
|
161
|
+
use:
|
|
162
|
+
- gke-a4-a4-cluster
|
|
163
|
+
settings:
|
|
164
|
+
jobset:
|
|
165
|
+
install: true
|
|
166
|
+
version: v0.7.2
|
|
167
|
+
apply_manifests:
|
|
168
|
+
- source: $(ghpc_stage("xpk-gke-a4"))/nccl-rdma-installer-a4.yaml
|
|
169
|
+
- source: $(ghpc_stage("xpk-gke-a4"))/storage_crd.yaml
|
|
170
|
+
|
|
171
|
+
- !DeploymentModule
|
|
172
|
+
id: workload_configmap
|
|
173
|
+
source: modules/management/kubectl-apply
|
|
174
|
+
settings:
|
|
175
|
+
apply_manifests:
|
|
176
|
+
- source: $(ghpc_stage("xpk-gke-a4"))/config-map.yaml.tftpl
|
|
177
|
+
template_vars:
|
|
178
|
+
resource_config_name: gke-a4-resources-configmap
|
|
179
|
+
num_nodes: '2'
|
|
180
|
+
cluster_config_name: gke-a4-metadata-configmap
|
|
181
|
+
capacity_type: reservation
|
|
182
|
+
reservation: test-reservation
|
|
183
|
+
use:
|
|
184
|
+
- gke-a4-a4-cluster
|
|
185
|
+
group: primary
|
xpk/core/capacity.py
CHANGED
|
@@ -29,6 +29,8 @@ H100_DEVICE_TYPE = 'h100-80gb-8'
|
|
|
29
29
|
H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8'
|
|
30
30
|
H200_DEVICE_TYPE = 'h200-141gb-8'
|
|
31
31
|
B200_DEVICE_TYPE = 'b200-8'
|
|
32
|
+
GB200_DEVICE_TYPE = 'gb200-4'
|
|
33
|
+
GB200_DEVICE_TYPE_NOLSSD = 'gb200-4-no-ssd'
|
|
32
34
|
RESERVATION_CONFIG_KEY = 'reservation_id'
|
|
33
35
|
|
|
34
36
|
|
xpk/core/cluster.py
CHANGED
|
@@ -22,7 +22,7 @@ from kubernetes import config
|
|
|
22
22
|
from kubernetes.client.exceptions import ApiException
|
|
23
23
|
|
|
24
24
|
from ..utils.console import xpk_exit, xpk_print
|
|
25
|
-
from .capacity import
|
|
25
|
+
from .capacity import H200_DEVICE_TYPE
|
|
26
26
|
from .commands import (
|
|
27
27
|
run_command_for_value,
|
|
28
28
|
run_command_with_updates,
|
|
@@ -34,16 +34,11 @@ from .gcloud_context import (
|
|
|
34
34
|
zone_to_region,
|
|
35
35
|
)
|
|
36
36
|
from .resources import get_cluster_system_characteristics
|
|
37
|
-
from .system_characteristics import SystemCharacteristics
|
|
37
|
+
from .system_characteristics import INSTALLER_NCCL_TCPXO, SystemCharacteristics
|
|
38
38
|
|
|
39
39
|
JOBSET_VERSION = 'v0.8.0'
|
|
40
40
|
PATHWAYS_JOB_VERSION = 'v0.1.4'
|
|
41
|
-
INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
|
|
42
|
-
INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
|
|
43
|
-
INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
|
|
44
|
-
CONFIG_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-config.yaml'
|
|
45
41
|
NRI_DEVICE_INJECTOR = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nri_device_injector/nri-device-injector.yaml'
|
|
46
|
-
MGLRU_DISABLE = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/main/examples/gke-a3-ultragpu/mglru-disable.yaml'
|
|
47
42
|
|
|
48
43
|
DEFAULT_NAMESPACE = 'default'
|
|
49
44
|
XPK_SA = 'xpk-sa'
|
|
@@ -118,12 +113,12 @@ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
|
|
|
118
113
|
Returns:
|
|
119
114
|
0 if successful and 1 otherwise.
|
|
120
115
|
"""
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
116
|
+
nccl_installer = (
|
|
117
|
+
system.gpu_config.nccl_installer
|
|
118
|
+
if system.gpu_config and system.gpu_config.nccl_installer
|
|
119
|
+
else INSTALLER_NCCL_TCPXO
|
|
120
|
+
)
|
|
121
|
+
command = f'kubectl apply -f {nccl_installer}'
|
|
127
122
|
|
|
128
123
|
return_code = run_command_with_updates(
|
|
129
124
|
command, 'Install NCCL Plugin On Cluster'
|
|
@@ -135,35 +130,6 @@ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
|
|
|
135
130
|
)
|
|
136
131
|
return 1
|
|
137
132
|
|
|
138
|
-
if system.device_type == H100_DEVICE_TYPE:
|
|
139
|
-
command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
|
|
140
|
-
|
|
141
|
-
return_code = run_command_with_updates(
|
|
142
|
-
command, 'Install NCCL Config On Cluster'
|
|
143
|
-
)
|
|
144
|
-
|
|
145
|
-
if return_code != 0:
|
|
146
|
-
xpk_print(
|
|
147
|
-
f'Install NCCL Config On Cluster request returned ERROR {return_code}'
|
|
148
|
-
)
|
|
149
|
-
return 1
|
|
150
|
-
|
|
151
|
-
return 0
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def disable_mglru_on_cluster() -> int:
|
|
155
|
-
"""Disable MGLRU on the cluster.
|
|
156
|
-
|
|
157
|
-
Returns:
|
|
158
|
-
0 if successful and 1 otherwise.
|
|
159
|
-
"""
|
|
160
|
-
command = f'kubectl apply -f {MGLRU_DISABLE}'
|
|
161
|
-
return_code = run_command_with_updates(command, 'Disable MGLRU On Cluster')
|
|
162
|
-
|
|
163
|
-
if return_code != 0:
|
|
164
|
-
xpk_print('Disablig MGLRU On Cluster request returned ERROR')
|
|
165
|
-
return 1
|
|
166
|
-
|
|
167
133
|
return 0
|
|
168
134
|
|
|
169
135
|
|
|
@@ -309,10 +275,11 @@ def update_cluster_with_lustre_driver_if_necessary(args) -> int:
|
|
|
309
275
|
Returns:
|
|
310
276
|
0 if successful and error code otherwise.
|
|
311
277
|
"""
|
|
312
|
-
if is_driver_enabled_on_cluster(
|
|
313
|
-
args
|
|
314
|
-
|
|
315
|
-
|
|
278
|
+
if is_driver_enabled_on_cluster(args, driver='lustreCsiDriver') and (
|
|
279
|
+
not args.enable_legacy_lustre_port
|
|
280
|
+
or is_driver_enabled_on_cluster(
|
|
281
|
+
args, driver='lustreCsiDriver', config_key='enableLegacyLustrePort'
|
|
282
|
+
)
|
|
316
283
|
):
|
|
317
284
|
return 0
|
|
318
285
|
cluster_update_return_code = update_gke_cluster_with_lustre_driver_enabled(
|
|
@@ -621,9 +588,13 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
|
|
|
621
588
|
"""
|
|
622
589
|
command = (
|
|
623
590
|
'gcloud container clusters update'
|
|
624
|
-
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}
|
|
591
|
+
f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
|
|
625
592
|
' --quiet'
|
|
626
593
|
)
|
|
594
|
+
if args.enable_legacy_lustre_port:
|
|
595
|
+
command += ' --enable-legacy-lustre-port'
|
|
596
|
+
else:
|
|
597
|
+
command += ' --update-addons=LustreCsiDriver=ENABLED'
|
|
627
598
|
xpk_print(
|
|
628
599
|
'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
|
|
629
600
|
)
|
xpk/core/cluster_test.py
CHANGED
|
@@ -16,7 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import pytest
|
|
18
18
|
from .testing.commands_tester import CommandsTester
|
|
19
|
-
from .cluster import get_cluster_credentials
|
|
19
|
+
from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary
|
|
20
20
|
from pytest_mock import MockerFixture
|
|
21
21
|
|
|
22
22
|
|
|
@@ -93,3 +93,78 @@ def test_get_cluster_credentials_retries_without_dns_when_dns_retrieval_fails(
|
|
|
93
93
|
if "dns-endpoint" not in c
|
|
94
94
|
]
|
|
95
95
|
assert len(non_dns_endpoint_commands) == 1
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def test_update_cluster_with_lustre_driver_if_necessary_with_default_port_runs_correct_checks(
|
|
99
|
+
commands_tester: CommandsTester, command_args
|
|
100
|
+
):
|
|
101
|
+
commands_tester.set_result_for_command(
|
|
102
|
+
(0, "True"),
|
|
103
|
+
"gcloud container clusters describe",
|
|
104
|
+
)
|
|
105
|
+
command_args.enable_legacy_lustre_port = None
|
|
106
|
+
update_cluster_with_lustre_driver_if_necessary(command_args)
|
|
107
|
+
|
|
108
|
+
executed_commands = commands_tester.get_matching_commands()
|
|
109
|
+
assert executed_commands == [
|
|
110
|
+
"gcloud container clusters describe cluster --project=project"
|
|
111
|
+
" --location=us-central1"
|
|
112
|
+
' --format="value(addonsConfig.lustreCsiDriverConfig.enabled)"'
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_update_cluster_with_lustre_driver_if_necessary_with_legacy_port_runs_correct_checks(
|
|
117
|
+
commands_tester: CommandsTester, command_args
|
|
118
|
+
):
|
|
119
|
+
commands_tester.set_result_for_command(
|
|
120
|
+
(0, "True"),
|
|
121
|
+
"gcloud container clusters describe",
|
|
122
|
+
)
|
|
123
|
+
command_args.enable_legacy_lustre_port = True
|
|
124
|
+
update_cluster_with_lustre_driver_if_necessary(command_args)
|
|
125
|
+
|
|
126
|
+
executed_commands = commands_tester.get_matching_commands()
|
|
127
|
+
assert executed_commands == [
|
|
128
|
+
(
|
|
129
|
+
"gcloud container clusters describe cluster --project=project"
|
|
130
|
+
" --location=us-central1"
|
|
131
|
+
' --format="value(addonsConfig.lustreCsiDriverConfig.enabled)"'
|
|
132
|
+
),
|
|
133
|
+
(
|
|
134
|
+
"gcloud container clusters describe cluster --project=project"
|
|
135
|
+
" --location=us-central1"
|
|
136
|
+
' --format="value(addonsConfig.lustreCsiDriverConfig.enableLegacyLustrePort)"'
|
|
137
|
+
),
|
|
138
|
+
]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
|
|
142
|
+
commands_tester: CommandsTester, command_args
|
|
143
|
+
):
|
|
144
|
+
commands_tester.set_result_for_command(
|
|
145
|
+
(0, ""), "gcloud container clusters update"
|
|
146
|
+
)
|
|
147
|
+
command_args.enable_legacy_lustre_port = None
|
|
148
|
+
update_gke_cluster_with_lustre_driver_enabled(command_args)
|
|
149
|
+
|
|
150
|
+
executed_commands = commands_tester.get_matching_commands()
|
|
151
|
+
assert executed_commands == [
|
|
152
|
+
"gcloud container clusters update cluster --project=project"
|
|
153
|
+
" --location=us-central1 --quiet --update-addons=LustreCsiDriver=ENABLED"
|
|
154
|
+
]
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
|
|
158
|
+
commands_tester: CommandsTester, command_args
|
|
159
|
+
):
|
|
160
|
+
commands_tester.set_result_for_command(
|
|
161
|
+
(0, ""), "gcloud container clusters update"
|
|
162
|
+
)
|
|
163
|
+
command_args.enable_legacy_lustre_port = True
|
|
164
|
+
update_gke_cluster_with_lustre_driver_enabled(command_args)
|
|
165
|
+
|
|
166
|
+
executed_commands = commands_tester.get_matching_commands()
|
|
167
|
+
assert executed_commands == [
|
|
168
|
+
"gcloud container clusters update cluster --project=project"
|
|
169
|
+
" --location=us-central1 --quiet --enable-legacy-lustre-port"
|
|
170
|
+
]
|
xpk/core/config.py
CHANGED
|
@@ -17,12 +17,32 @@ limitations under the License.
|
|
|
17
17
|
import os
|
|
18
18
|
|
|
19
19
|
import ruamel.yaml
|
|
20
|
-
|
|
20
|
+
from abc import ABC, abstractmethod
|
|
21
21
|
from ..utils import file
|
|
22
22
|
from ..utils.console import xpk_print
|
|
23
|
+
from setuptools_scm import get_version as setuptools_get_version
|
|
24
|
+
from importlib.metadata import version, PackageNotFoundError
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _get_version() -> str:
|
|
28
|
+
xpk_version_override = os.getenv('XPK_VERSION_OVERRIDE', '')
|
|
29
|
+
if xpk_version_override != '':
|
|
30
|
+
return xpk_version_override
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
return setuptools_get_version()
|
|
34
|
+
except LookupError:
|
|
35
|
+
pass
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
return version('xpk')
|
|
39
|
+
except PackageNotFoundError:
|
|
40
|
+
pass
|
|
23
41
|
|
|
24
|
-
|
|
25
|
-
|
|
42
|
+
raise LookupError('unable to determine version number')
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
__version__ = _get_version()
|
|
26
46
|
XPK_CURRENT_VERSION = __version__
|
|
27
47
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
28
48
|
|
|
@@ -41,7 +61,6 @@ KJOB_SHELL_WORKING_DIRECTORY = 'shell-working-directory'
|
|
|
41
61
|
CONFIGS_KEY = 'configs'
|
|
42
62
|
GKE_ENDPOINT_KEY = 'gke-endpoint'
|
|
43
63
|
DEPENDENCIES_KEY = 'deps-verified-version'
|
|
44
|
-
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
45
64
|
|
|
46
65
|
DEFAULT_KEYS = [
|
|
47
66
|
CFG_BUCKET_KEY,
|
|
@@ -64,8 +83,28 @@ VERTEX_TENSORBOARD_FEATURE_FLAG = XPK_CURRENT_VERSION >= '0.4.0'
|
|
|
64
83
|
yaml = ruamel.yaml.YAML()
|
|
65
84
|
|
|
66
85
|
|
|
67
|
-
class
|
|
68
|
-
"""
|
|
86
|
+
class Config(ABC):
|
|
87
|
+
"""Stores and manipulates XPK configuration."""
|
|
88
|
+
|
|
89
|
+
@abstractmethod
|
|
90
|
+
def set(self, key: str, value: str | None) -> None:
|
|
91
|
+
"""Sets the config value"""
|
|
92
|
+
pass
|
|
93
|
+
|
|
94
|
+
@abstractmethod
|
|
95
|
+
def get(self, key: str) -> str | None:
|
|
96
|
+
"""Reads the config value"""
|
|
97
|
+
pass
|
|
98
|
+
|
|
99
|
+
@abstractmethod
|
|
100
|
+
def get_all(
|
|
101
|
+
self,
|
|
102
|
+
) -> dict[str, str] | None:
|
|
103
|
+
pass
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class FileSystemConfig(Config):
|
|
107
|
+
"""XPK Configuration manipulation class leveraging the file system."""
|
|
69
108
|
|
|
70
109
|
def __init__(self, custom_config_file: str = XPK_CONFIG_FILE) -> None:
|
|
71
110
|
self._config = custom_config_file
|
|
@@ -120,4 +159,39 @@ class XpkConfig:
|
|
|
120
159
|
return val
|
|
121
160
|
|
|
122
161
|
|
|
123
|
-
|
|
162
|
+
class InMemoryXpkConfig(Config):
|
|
163
|
+
"""XPK Configuration manipulation class in memory."""
|
|
164
|
+
|
|
165
|
+
def __init__(self) -> None:
|
|
166
|
+
self._config: dict[str, str] = {}
|
|
167
|
+
self._allowed_keys = DEFAULT_KEYS
|
|
168
|
+
|
|
169
|
+
def set(self, key: str, value: str | None) -> None:
|
|
170
|
+
if key not in self._allowed_keys:
|
|
171
|
+
return
|
|
172
|
+
if value is None:
|
|
173
|
+
self._config.pop(key, None)
|
|
174
|
+
else:
|
|
175
|
+
self._config[key] = value
|
|
176
|
+
|
|
177
|
+
def get(self, key: str) -> str | None:
|
|
178
|
+
if key not in self._allowed_keys:
|
|
179
|
+
return None
|
|
180
|
+
return self._config.get(key)
|
|
181
|
+
|
|
182
|
+
def get_all(
|
|
183
|
+
self,
|
|
184
|
+
) -> dict[str, str] | None:
|
|
185
|
+
return None if len(self._config) <= 0 else self._config
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
_xpk_config: Config = InMemoryXpkConfig()
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def set_config(config: Config):
|
|
192
|
+
global _xpk_config
|
|
193
|
+
_xpk_config = config
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def get_config() -> Config:
|
|
197
|
+
return _xpk_config
|