xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +89 -32
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +16 -3
- xpk/commands/cluster_test.py +353 -7
- xpk/commands/config.py +3 -5
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +3 -1
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +143 -142
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +73 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +48 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +55 -104
- xpk/core/cluster_test.py +170 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +88 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +17 -19
- xpk/core/kueue_manager.py +205 -51
- xpk/core/kueue_manager_test.py +158 -4
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +37 -43
- xpk/core/nodepool_test.py +42 -19
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +144 -133
- xpk/core/scheduling_test.py +298 -6
- xpk/core/system_characteristics.py +256 -19
- xpk/core/system_characteristics_test.py +128 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +33 -13
- xpk/parser/cluster.py +40 -67
- xpk/parser/cluster_test.py +83 -3
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -29
- xpk/parser/workload_test.py +3 -49
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +10 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +2 -13
- xpk/utils/versions.py +31 -0
- xpk-0.16.0.dist-info/METADATA +127 -0
- xpk-0.16.0.dist-info/RECORD +168 -0
- xpk-0.14.4.dist-info/METADATA +0 -1645
- xpk-0.14.4.dist-info/RECORD +0 -139
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# Copyright 2024 Google LLC
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
!Blueprint
|
|
17
|
+
blueprint_name: xpk-gke-a3-megagpu
|
|
18
|
+
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
|
|
19
|
+
toolkit_modules_version: v1.62.2
|
|
20
|
+
|
|
21
|
+
vars:
|
|
22
|
+
project_id: "foo"
|
|
23
|
+
deployment_name: xpk-gke-a3-megagpu
|
|
24
|
+
region: us-central1
|
|
25
|
+
zone: us-central1-c
|
|
26
|
+
labels: {gke_product_type: xpk}
|
|
27
|
+
|
|
28
|
+
deployment_groups:
|
|
29
|
+
- !DeploymentGroup
|
|
30
|
+
group: primary
|
|
31
|
+
modules:
|
|
32
|
+
- !DeploymentModule
|
|
33
|
+
id: network1
|
|
34
|
+
source: modules/network/vpc
|
|
35
|
+
settings:
|
|
36
|
+
subnetwork_name: bar-xpk-gke-a3-megagpu-subnet
|
|
37
|
+
secondary_ranges:
|
|
38
|
+
bar-xpk-gke-a3-megagpu-subnet:
|
|
39
|
+
- range_name: pods
|
|
40
|
+
ip_cidr_range: 10.4.0.0/14
|
|
41
|
+
- range_name: services
|
|
42
|
+
ip_cidr_range: 10.0.32.0/20
|
|
43
|
+
- !DeploymentModule
|
|
44
|
+
id: gpunets
|
|
45
|
+
source: modules/network/multivpc
|
|
46
|
+
settings:
|
|
47
|
+
network_name_prefix: bar-gpunet
|
|
48
|
+
global_ip_address_range: 192.169.0.0/16
|
|
49
|
+
network_count: 8
|
|
50
|
+
subnetwork_cidr_suffix: 24
|
|
51
|
+
- !DeploymentModule
|
|
52
|
+
id: gke_cluster
|
|
53
|
+
source: modules/scheduler/gke-cluster
|
|
54
|
+
use: [network1, gpunets]
|
|
55
|
+
settings:
|
|
56
|
+
release_channel: RAPID
|
|
57
|
+
version_prefix: '1.2'
|
|
58
|
+
min_master_version: 1.2.3
|
|
59
|
+
prefix_with_deployment_name: false
|
|
60
|
+
name_suffix: bar
|
|
61
|
+
enable_private_endpoint: false
|
|
62
|
+
enable_gcsfuse_csi: true
|
|
63
|
+
enable_filestore_csi: true
|
|
64
|
+
master_authorized_networks:
|
|
65
|
+
- cidr_block: 10.0.0.0/32 # Allows your machine run kubectl command. It's required for the multi-network setup.
|
|
66
|
+
display_name: "kubectl-access-network"
|
|
67
|
+
system_node_pool_machine_type: "e2-standard-32"
|
|
68
|
+
system_node_pool_node_count:
|
|
69
|
+
total_min_nodes: 5
|
|
70
|
+
total_max_nodes: 1000
|
|
71
|
+
k8s_network_names:
|
|
72
|
+
gvnic_prefix: "bar-gpunet-"
|
|
73
|
+
gvnic_postfix: "-subnet"
|
|
74
|
+
gvnic_start_index: 0
|
|
75
|
+
outputs: [instructions]
|
|
76
|
+
|
|
77
|
+
- !DeploymentModule
|
|
78
|
+
id: a3_megagpu_pool_0
|
|
79
|
+
source: modules/compute/gke-node-pool
|
|
80
|
+
use: [gke_cluster, gpunets]
|
|
81
|
+
settings:
|
|
82
|
+
name: bar-a3-megagpu-pool-0
|
|
83
|
+
machine_type: a3-megagpu-8g
|
|
84
|
+
zones: [us-central1-c]
|
|
85
|
+
host_maintenance_interval: 'PERIODIC'
|
|
86
|
+
reservation_affinity:
|
|
87
|
+
consume_reservation_type: SPECIFIC_RESERVATION
|
|
88
|
+
specific_reservations:
|
|
89
|
+
- name: test-reservation
|
|
90
|
+
run_workload_script: false
|
|
91
|
+
spot: false
|
|
92
|
+
max_pods_per_node: 32
|
|
93
|
+
guest_accelerator:
|
|
94
|
+
- type: nvidia-h100-mega-80gb
|
|
95
|
+
count: 8
|
|
96
|
+
gpu_driver_installation_config:
|
|
97
|
+
gpu_driver_version: "LATEST"
|
|
98
|
+
auto_upgrade: true
|
|
99
|
+
static_node_count: 2
|
|
100
|
+
placement_policy:
|
|
101
|
+
type: COMPACT
|
|
102
|
+
name: test-reservation-placement
|
|
103
|
+
outputs: [instructions]
|
|
104
|
+
|
|
105
|
+
- !DeploymentModule
|
|
106
|
+
id: workload_component_install
|
|
107
|
+
source: modules/management/kubectl-apply
|
|
108
|
+
use: [gke_cluster]
|
|
109
|
+
settings:
|
|
110
|
+
jobset:
|
|
111
|
+
install: true
|
|
112
|
+
version: v0.7.2
|
|
113
|
+
apply_manifests:
|
|
114
|
+
- source: $(ghpc_stage("xpk-gke-a3-megagpu"))/storage_crd.yaml
|
|
115
|
+
|
|
116
|
+
- !DeploymentModule
|
|
117
|
+
id: workload_configmap
|
|
118
|
+
source: modules/management/kubectl-apply
|
|
119
|
+
use: [gke_cluster]
|
|
120
|
+
settings:
|
|
121
|
+
apply_manifests:
|
|
122
|
+
- source: $(ghpc_stage("xpk-gke-a3-megagpu"))/config-map.yaml.tftpl
|
|
123
|
+
template_vars: {
|
|
124
|
+
resource_config_name: "bar-resources-configmap",
|
|
125
|
+
num_nodes: "2",
|
|
126
|
+
cluster_config_name: "bar-metadata-configmap",
|
|
127
|
+
capacity_type: "reservation",
|
|
128
|
+
reservation: "test-reservation",
|
|
129
|
+
}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# Copyright 2024 Google LLC
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
!Blueprint
|
|
17
|
+
blueprint_name: xpk-gke-a3-megagpu
|
|
18
|
+
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
|
|
19
|
+
toolkit_modules_version: v1.62.2
|
|
20
|
+
|
|
21
|
+
vars:
|
|
22
|
+
project_id: "foo"
|
|
23
|
+
deployment_name: xpk-gke-a3-megagpu
|
|
24
|
+
region: us-central1
|
|
25
|
+
zone: us-central1-c
|
|
26
|
+
labels: {gke_product_type: xpk}
|
|
27
|
+
|
|
28
|
+
deployment_groups:
|
|
29
|
+
- !DeploymentGroup
|
|
30
|
+
group: primary
|
|
31
|
+
modules:
|
|
32
|
+
- !DeploymentModule
|
|
33
|
+
id: network1
|
|
34
|
+
source: modules/network/vpc
|
|
35
|
+
settings:
|
|
36
|
+
subnetwork_name: bar-xpk-gke-a3-megagpu-subnet
|
|
37
|
+
secondary_ranges:
|
|
38
|
+
bar-xpk-gke-a3-megagpu-subnet:
|
|
39
|
+
- range_name: pods
|
|
40
|
+
ip_cidr_range: 10.4.0.0/14
|
|
41
|
+
- range_name: services
|
|
42
|
+
ip_cidr_range: 10.0.32.0/20
|
|
43
|
+
- !DeploymentModule
|
|
44
|
+
id: gpunets
|
|
45
|
+
source: modules/network/multivpc
|
|
46
|
+
settings:
|
|
47
|
+
network_name_prefix: bar-gpunet
|
|
48
|
+
global_ip_address_range: 192.169.0.0/16
|
|
49
|
+
network_count: 8
|
|
50
|
+
subnetwork_cidr_suffix: 24
|
|
51
|
+
- !DeploymentModule
|
|
52
|
+
id: gke_cluster
|
|
53
|
+
source: modules/scheduler/gke-cluster
|
|
54
|
+
use: [network1, gpunets]
|
|
55
|
+
settings:
|
|
56
|
+
release_channel: RAPID
|
|
57
|
+
version_prefix: '1.2'
|
|
58
|
+
min_master_version: 1.2.3
|
|
59
|
+
prefix_with_deployment_name: false
|
|
60
|
+
name_suffix: bar
|
|
61
|
+
enable_private_endpoint: false
|
|
62
|
+
enable_gcsfuse_csi: true
|
|
63
|
+
enable_filestore_csi: true
|
|
64
|
+
master_authorized_networks:
|
|
65
|
+
- cidr_block: 10.0.0.0/32 # Allows your machine run kubectl command. It's required for the multi-network setup.
|
|
66
|
+
display_name: "kubectl-access-network"
|
|
67
|
+
system_node_pool_machine_type: "e2-standard-32"
|
|
68
|
+
system_node_pool_node_count:
|
|
69
|
+
total_min_nodes: 5
|
|
70
|
+
total_max_nodes: 1000
|
|
71
|
+
k8s_network_names:
|
|
72
|
+
gvnic_prefix: "bar-gpunet-"
|
|
73
|
+
gvnic_postfix: "-subnet"
|
|
74
|
+
gvnic_start_index: 0
|
|
75
|
+
outputs: [instructions]
|
|
76
|
+
|
|
77
|
+
- !DeploymentModule
|
|
78
|
+
id: a3_megagpu_pool_0
|
|
79
|
+
source: modules/compute/gke-node-pool
|
|
80
|
+
use: [gke_cluster, gpunets]
|
|
81
|
+
settings:
|
|
82
|
+
name: bar-a3-megagpu-pool-0
|
|
83
|
+
machine_type: a3-megagpu-8g
|
|
84
|
+
zones: [us-central1-c]
|
|
85
|
+
host_maintenance_interval: PERIODIC
|
|
86
|
+
reservation_affinity:
|
|
87
|
+
consume_reservation_type: NO_RESERVATION
|
|
88
|
+
specific_reservations: []
|
|
89
|
+
run_workload_script: false
|
|
90
|
+
max_pods_per_node: 32
|
|
91
|
+
spot: true
|
|
92
|
+
guest_accelerator:
|
|
93
|
+
- type: nvidia-h100-mega-80gb
|
|
94
|
+
count: 8
|
|
95
|
+
gpu_driver_installation_config:
|
|
96
|
+
gpu_driver_version: "LATEST"
|
|
97
|
+
auto_upgrade: true
|
|
98
|
+
static_node_count: 2
|
|
99
|
+
outputs: [instructions]
|
|
100
|
+
|
|
101
|
+
- !DeploymentModule
|
|
102
|
+
id: workload_component_install
|
|
103
|
+
source: modules/management/kubectl-apply
|
|
104
|
+
use: [gke_cluster]
|
|
105
|
+
settings:
|
|
106
|
+
jobset:
|
|
107
|
+
install: true
|
|
108
|
+
version: v0.7.2
|
|
109
|
+
apply_manifests:
|
|
110
|
+
- source: $(ghpc_stage("xpk-gke-a3-megagpu"))/storage_crd.yaml
|
|
111
|
+
|
|
112
|
+
- !DeploymentModule
|
|
113
|
+
id: workload_configmap
|
|
114
|
+
source: modules/management/kubectl-apply
|
|
115
|
+
use: [gke_cluster]
|
|
116
|
+
settings:
|
|
117
|
+
apply_manifests:
|
|
118
|
+
- source: $(ghpc_stage("xpk-gke-a3-megagpu"))/config-map.yaml.tftpl
|
|
119
|
+
template_vars: {
|
|
120
|
+
resource_config_name: "bar-resources-configmap",
|
|
121
|
+
num_nodes: "2",
|
|
122
|
+
cluster_config_name: "bar-metadata-configmap",
|
|
123
|
+
capacity_type: "spot",
|
|
124
|
+
reservation: "None",
|
|
125
|
+
}
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Copyright 2024 "Google LLC"
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
!Blueprint
|
|
15
|
+
blueprint_name: xpk-gke-a3-ultra
|
|
16
|
+
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
|
|
17
|
+
toolkit_modules_version: v1.62.2
|
|
18
|
+
|
|
19
|
+
vars:
|
|
20
|
+
labels: {gke_product_type: xpk}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
terraform_backend_defaults:
|
|
24
|
+
type: gcs
|
|
25
|
+
configuration:
|
|
26
|
+
bucket: test-bucket
|
|
27
|
+
prefix: xpk_terraform_state/testdir/gke-a3-ultra/
|
|
28
|
+
|
|
29
|
+
deployment_groups:
|
|
30
|
+
- !DeploymentGroup
|
|
31
|
+
group: primary
|
|
32
|
+
modules:
|
|
33
|
+
- !DeploymentModule
|
|
34
|
+
id: gke-a3-ultra-net-0
|
|
35
|
+
source: modules/network/vpc
|
|
36
|
+
settings:
|
|
37
|
+
network_name: gke-a3-ultra-net-0
|
|
38
|
+
subnetworks:
|
|
39
|
+
- subnet_name: gke-a3-ultra-sub-0
|
|
40
|
+
subnet_region: us-central1
|
|
41
|
+
subnet_ip: 192.168.0.0/18
|
|
42
|
+
secondary_ranges_list:
|
|
43
|
+
- subnetwork_name : gke-a3-ultra-sub-0
|
|
44
|
+
ranges:
|
|
45
|
+
- range_name: pods
|
|
46
|
+
ip_cidr_range: 10.4.0.0/14
|
|
47
|
+
- range_name: services
|
|
48
|
+
ip_cidr_range: 10.0.32.0/20
|
|
49
|
+
firewall_rules:
|
|
50
|
+
- name: gke-a3-ultra-internal-0
|
|
51
|
+
ranges: [192.168.0.0/16]
|
|
52
|
+
allow:
|
|
53
|
+
- protocol: tcp
|
|
54
|
+
ports: ["0-65535"]
|
|
55
|
+
- protocol: udp
|
|
56
|
+
ports: ["0-65535"]
|
|
57
|
+
- protocol: icmp
|
|
58
|
+
|
|
59
|
+
- !DeploymentModule
|
|
60
|
+
id: gke-a3-ultra-net-1
|
|
61
|
+
source: modules/network/vpc
|
|
62
|
+
settings:
|
|
63
|
+
network_name: gke-a3-ultra-net-1
|
|
64
|
+
mtu: 8896
|
|
65
|
+
subnetworks:
|
|
66
|
+
- subnet_name: gke-a3-ultra-sub-1
|
|
67
|
+
subnet_region: us-central1
|
|
68
|
+
subnet_ip: 192.168.64.0/18
|
|
69
|
+
firewall_rules:
|
|
70
|
+
- name: gke-a3-ultra-internal-1
|
|
71
|
+
ranges: [192.168.0.0/16]
|
|
72
|
+
allow:
|
|
73
|
+
- protocol: tcp
|
|
74
|
+
ports: ["0-65535"]
|
|
75
|
+
- protocol: udp
|
|
76
|
+
ports: ["0-65535"]
|
|
77
|
+
- protocol: icmp
|
|
78
|
+
|
|
79
|
+
- !DeploymentModule
|
|
80
|
+
id: gke-a3-ultra-rdma-net
|
|
81
|
+
source: modules/network/gpu-rdma-vpc
|
|
82
|
+
settings:
|
|
83
|
+
network_name: gke-a3-ultra-rdma-net
|
|
84
|
+
mtu: 8896
|
|
85
|
+
network_profile: https://www.googleapis.com/compute/beta/projects/foo/global/networkProfiles/us-central1-c-vpc-roce
|
|
86
|
+
network_routing_mode: REGIONAL
|
|
87
|
+
subnetworks_template:
|
|
88
|
+
name_prefix: gke-a3-ultra-rdma-sub
|
|
89
|
+
count: 8
|
|
90
|
+
ip_range: 192.168.128.0/18
|
|
91
|
+
region: us-central1
|
|
92
|
+
|
|
93
|
+
- !DeploymentModule
|
|
94
|
+
id: gke-a3-ultra-a3-ultragpu-cluster
|
|
95
|
+
source: modules/scheduler/gke-cluster
|
|
96
|
+
use: [gke-a3-ultra-net-0]
|
|
97
|
+
settings:
|
|
98
|
+
release_channel: RAPID
|
|
99
|
+
version_prefix: '1.2'
|
|
100
|
+
min_cluster_version: 1.2.3
|
|
101
|
+
prefix_with_deployment_name: false
|
|
102
|
+
name_suffix: gke-a3-ultra
|
|
103
|
+
system_node_pool_machine_type: "e2-standard-16"
|
|
104
|
+
enable_dcgm_monitoring: true
|
|
105
|
+
enable_gcsfuse_csi: true
|
|
106
|
+
enable_filestore_csi: true
|
|
107
|
+
enable_private_endpoint: false # Allows access from authorized public IPs
|
|
108
|
+
master_authorized_networks:
|
|
109
|
+
- cidr_block: 10.0.0.0/32 # Allows your machine to run the kubectl command. Required for multi network setup.
|
|
110
|
+
display_name: "kubectl-access-network"
|
|
111
|
+
system_node_pool_node_count:
|
|
112
|
+
total_min_nodes: 2
|
|
113
|
+
total_max_nodes: 1000
|
|
114
|
+
additional_networks: $(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
|
|
115
|
+
k8s_network_names:
|
|
116
|
+
rdma_prefix: "gke-a3-ultra-rdma-sub-"
|
|
117
|
+
rdma_start_index: 0
|
|
118
|
+
rdma_postfix: ""
|
|
119
|
+
gvnic_prefix: "gke-a3-ultra-sub-"
|
|
120
|
+
gvnic_start_index: 1
|
|
121
|
+
outputs: [instructions]
|
|
122
|
+
|
|
123
|
+
- !DeploymentModule
|
|
124
|
+
id: gke-a3-ultra-a3u-pool
|
|
125
|
+
source: modules/compute/gke-node-pool
|
|
126
|
+
use: [gke-a3-ultra-a3-ultragpu-cluster]
|
|
127
|
+
settings:
|
|
128
|
+
machine_type: a3-ultragpu-8g
|
|
129
|
+
auto_upgrade: true
|
|
130
|
+
zones: [us-central1-c]
|
|
131
|
+
spot: false
|
|
132
|
+
max_pods_per_node: 32
|
|
133
|
+
guest_accelerator:
|
|
134
|
+
- type: nvidia-h200-141gb
|
|
135
|
+
count: 8
|
|
136
|
+
gpu_driver_installation_config:
|
|
137
|
+
gpu_driver_version: "LATEST"
|
|
138
|
+
additional_networks:
|
|
139
|
+
$(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
|
|
140
|
+
reservation_affinity:
|
|
141
|
+
consume_reservation_type: SPECIFIC_RESERVATION
|
|
142
|
+
specific_reservations:
|
|
143
|
+
- name: test-reservation
|
|
144
|
+
static_node_count: 2
|
|
145
|
+
outputs: [instructions]
|
|
146
|
+
|
|
147
|
+
- !DeploymentModule
|
|
148
|
+
id: workload-manager-install
|
|
149
|
+
source: modules/management/kubectl-apply
|
|
150
|
+
use: [gke-a3-ultra-a3-ultragpu-cluster]
|
|
151
|
+
settings:
|
|
152
|
+
jobset:
|
|
153
|
+
install: true
|
|
154
|
+
version: v0.7.2
|
|
155
|
+
apply_manifests:
|
|
156
|
+
- source: $(ghpc_stage("xpk-gke-a3-ultra"))/nccl-installer.yaml
|
|
157
|
+
- source: $(ghpc_stage("xpk-gke-a3-ultra"))/mlgru-disable.yaml
|
|
158
|
+
- source: $(ghpc_stage("xpk-gke-a3-ultra"))/storage_crd.yaml
|
|
159
|
+
|
|
160
|
+
- !DeploymentModule
|
|
161
|
+
id: workload_configmap
|
|
162
|
+
source: 'modules/management/kubectl-apply'
|
|
163
|
+
use: ['gke-a3-ultra-a3-ultragpu-cluster']
|
|
164
|
+
settings:
|
|
165
|
+
apply_manifests:
|
|
166
|
+
- source: '$(ghpc_stage("xpk-gke-a3-ultra"))/config-map.yaml.tftpl'
|
|
167
|
+
template_vars: {
|
|
168
|
+
resource_config_name: "gke-a3-ultra-resources-configmap",
|
|
169
|
+
num_nodes: "2",
|
|
170
|
+
cluster_config_name: "gke-a3-ultra-metadata-configmap",
|
|
171
|
+
capacity_type: "reservation",
|
|
172
|
+
reservation: "test-reservation",
|
|
173
|
+
}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# Copyright 2024 "Google LLC"
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
!Blueprint
|
|
15
|
+
blueprint_name: xpk-gke-a4
|
|
16
|
+
toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
|
|
17
|
+
toolkit_modules_version: v1.62.2
|
|
18
|
+
|
|
19
|
+
vars:
|
|
20
|
+
labels: {gke_product_type: xpk}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
terraform_backend_defaults:
|
|
24
|
+
type: gcs
|
|
25
|
+
configuration:
|
|
26
|
+
bucket: test-bucket
|
|
27
|
+
prefix: xpk_terraform_state/testdir/gke-a4/
|
|
28
|
+
|
|
29
|
+
deployment_groups:
|
|
30
|
+
- !DeploymentGroup
|
|
31
|
+
modules:
|
|
32
|
+
- !DeploymentModule
|
|
33
|
+
id: gke-a4-net-0
|
|
34
|
+
source: modules/network/vpc
|
|
35
|
+
settings:
|
|
36
|
+
network_name: gke-a4-net-0
|
|
37
|
+
mtu: 8896
|
|
38
|
+
subnetworks:
|
|
39
|
+
- subnet_name: gke-a4-sub-0
|
|
40
|
+
subnet_region: us-central1
|
|
41
|
+
subnet_ip: 192.168.0.0/18
|
|
42
|
+
secondary_ranges_list:
|
|
43
|
+
- subnetwork_name: gke-a4-sub-0
|
|
44
|
+
ranges:
|
|
45
|
+
- range_name: pods
|
|
46
|
+
ip_cidr_range: 10.4.0.0/14
|
|
47
|
+
- range_name: services
|
|
48
|
+
ip_cidr_range: 10.0.32.0/20
|
|
49
|
+
firewall_rules:
|
|
50
|
+
- name: gke-a4-internal-0
|
|
51
|
+
ranges:
|
|
52
|
+
- 192.168.0.0/16
|
|
53
|
+
allow:
|
|
54
|
+
- protocol: tcp
|
|
55
|
+
ports:
|
|
56
|
+
- 0-65535
|
|
57
|
+
- protocol: udp
|
|
58
|
+
ports:
|
|
59
|
+
- 0-65535
|
|
60
|
+
- protocol: icmp
|
|
61
|
+
|
|
62
|
+
- !DeploymentModule
|
|
63
|
+
id: gke-a4-net-1
|
|
64
|
+
source: modules/network/vpc
|
|
65
|
+
settings:
|
|
66
|
+
network_name: gke-a4-net-1
|
|
67
|
+
mtu: 8896
|
|
68
|
+
subnetworks:
|
|
69
|
+
- subnet_name: gke-a4-sub-1
|
|
70
|
+
subnet_region: us-central1
|
|
71
|
+
subnet_ip: 192.168.64.0/18
|
|
72
|
+
firewall_rules:
|
|
73
|
+
- name: gke-a4-internal-1
|
|
74
|
+
ranges:
|
|
75
|
+
- 192.168.0.0/16
|
|
76
|
+
allow:
|
|
77
|
+
- protocol: tcp
|
|
78
|
+
ports:
|
|
79
|
+
- 0-65535
|
|
80
|
+
- protocol: udp
|
|
81
|
+
ports:
|
|
82
|
+
- 0-65535
|
|
83
|
+
- protocol: icmp
|
|
84
|
+
|
|
85
|
+
- !DeploymentModule
|
|
86
|
+
id: gke-a4-rdma-net
|
|
87
|
+
source: modules/network/gpu-rdma-vpc
|
|
88
|
+
settings:
|
|
89
|
+
network_name: gke-a4-rdma-net
|
|
90
|
+
mtu: 8896
|
|
91
|
+
network_profile:
|
|
92
|
+
https://www.googleapis.com/compute/beta/projects/foo/global/networkProfiles/us-central1-c-vpc-roce
|
|
93
|
+
network_routing_mode: REGIONAL
|
|
94
|
+
subnetworks_template:
|
|
95
|
+
name_prefix: gke-a4-rdma-sub
|
|
96
|
+
count: 8
|
|
97
|
+
ip_range: 192.168.128.0/18
|
|
98
|
+
region: us-central1
|
|
99
|
+
|
|
100
|
+
- !DeploymentModule
|
|
101
|
+
id: gke-a4-a4-cluster
|
|
102
|
+
source: modules/scheduler/gke-cluster
|
|
103
|
+
outputs:
|
|
104
|
+
- instructions
|
|
105
|
+
settings:
|
|
106
|
+
system_node_pool_machine_type: e2-standard-16
|
|
107
|
+
system_node_pool_node_count:
|
|
108
|
+
total_min_nodes: 2
|
|
109
|
+
total_max_nodes: 1000
|
|
110
|
+
prefix_with_deployment_name: false
|
|
111
|
+
name_suffix: gke-a4
|
|
112
|
+
enable_dcgm_monitoring: true
|
|
113
|
+
enable_gcsfuse_csi: true
|
|
114
|
+
enable_private_endpoint: false
|
|
115
|
+
master_authorized_networks:
|
|
116
|
+
- cidr_block: 10.0.0.0/32
|
|
117
|
+
display_name: kubectl-access-network
|
|
118
|
+
additional_networks: $(concat([{network=gke-a4-net-1.network_name, subnetwork=gke-a4-net-1.subnetwork_name,
|
|
119
|
+
subnetwork_project="foo", nic_type="GVNIC", queue_count=null,
|
|
120
|
+
network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null,
|
|
121
|
+
network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a4-rdma-net.subnetwork_interfaces_gke))
|
|
122
|
+
release_channel: RAPID
|
|
123
|
+
version_prefix: '1.2'
|
|
124
|
+
min_cluster_version: 1.2.3
|
|
125
|
+
use:
|
|
126
|
+
- gke-a4-net-0
|
|
127
|
+
- !DeploymentModule
|
|
128
|
+
id: gke-a4-a4-pool
|
|
129
|
+
source: modules/compute/gke-node-pool
|
|
130
|
+
use:
|
|
131
|
+
- gke-a4-a4-cluster
|
|
132
|
+
outputs:
|
|
133
|
+
- instructions
|
|
134
|
+
settings:
|
|
135
|
+
machine_type: a4-highgpu-8g
|
|
136
|
+
auto_upgrade: true
|
|
137
|
+
zones:
|
|
138
|
+
- us-central1-c
|
|
139
|
+
disk_type: hyperdisk-balanced
|
|
140
|
+
local_ssd_count_ephemeral_storage: 32
|
|
141
|
+
spot: false
|
|
142
|
+
reservation_affinity:
|
|
143
|
+
consume_reservation_type: SPECIFIC_RESERVATION
|
|
144
|
+
specific_reservations:
|
|
145
|
+
- name: test-reservation
|
|
146
|
+
max_pods_per_node: 32
|
|
147
|
+
guest_accelerator:
|
|
148
|
+
- type: nvidia-b200
|
|
149
|
+
count: 8
|
|
150
|
+
gpu_driver_installation_config:
|
|
151
|
+
gpu_driver_version: LATEST
|
|
152
|
+
additional_networks: $(concat([{network=gke-a4-net-1.network_name, subnetwork=gke-a4-net-1.subnetwork_name,
|
|
153
|
+
subnetwork_project="foo", nic_type="GVNIC", queue_count=null,
|
|
154
|
+
network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null,
|
|
155
|
+
network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a4-rdma-net.subnetwork_interfaces_gke))
|
|
156
|
+
static_node_count: 2
|
|
157
|
+
|
|
158
|
+
- !DeploymentModule
|
|
159
|
+
id: workload-manager-install
|
|
160
|
+
source: modules/management/kubectl-apply
|
|
161
|
+
use:
|
|
162
|
+
- gke-a4-a4-cluster
|
|
163
|
+
settings:
|
|
164
|
+
jobset:
|
|
165
|
+
install: true
|
|
166
|
+
version: v0.7.2
|
|
167
|
+
apply_manifests:
|
|
168
|
+
- source: $(ghpc_stage("xpk-gke-a4"))/nccl-rdma-installer-a4.yaml
|
|
169
|
+
- source: $(ghpc_stage("xpk-gke-a4"))/storage_crd.yaml
|
|
170
|
+
|
|
171
|
+
- !DeploymentModule
|
|
172
|
+
id: workload_configmap
|
|
173
|
+
source: modules/management/kubectl-apply
|
|
174
|
+
settings:
|
|
175
|
+
apply_manifests:
|
|
176
|
+
- source: $(ghpc_stage("xpk-gke-a4"))/config-map.yaml.tftpl
|
|
177
|
+
template_vars:
|
|
178
|
+
resource_config_name: gke-a4-resources-configmap
|
|
179
|
+
num_nodes: '2'
|
|
180
|
+
cluster_config_name: gke-a4-metadata-configmap
|
|
181
|
+
capacity_type: reservation
|
|
182
|
+
reservation: test-reservation
|
|
183
|
+
use:
|
|
184
|
+
- gke-a4-a4-cluster
|
|
185
|
+
group: primary
|