xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. integration/README.md +19 -0
  2. integration/gcluster_a3mega_test.py +11 -0
  3. integration/gcluster_a3ultra_test.py +11 -0
  4. integration/gcluster_a4_test.py +11 -0
  5. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  6. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  7. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  8. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  9. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  10. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  11. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  12. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  13. xpk/blueprints/a4/storage_crd.yaml +52 -0
  14. xpk/commands/cluster.py +89 -32
  15. xpk/commands/cluster_gcluster.py +25 -5
  16. xpk/commands/cluster_gcluster_test.py +16 -3
  17. xpk/commands/cluster_test.py +353 -7
  18. xpk/commands/config.py +3 -5
  19. xpk/commands/inspector.py +5 -3
  20. xpk/commands/kind.py +3 -1
  21. xpk/commands/managed_ml_diagnostics.py +249 -0
  22. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  23. xpk/commands/storage.py +8 -10
  24. xpk/commands/workload.py +143 -142
  25. xpk/commands/workload_test.py +160 -118
  26. xpk/core/blueprint/blueprint_generator.py +73 -33
  27. xpk/core/blueprint/blueprint_test.py +9 -0
  28. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  29. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  30. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  31. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  32. xpk/core/capacity.py +48 -8
  33. xpk/core/capacity_test.py +32 -1
  34. xpk/core/cluster.py +55 -104
  35. xpk/core/cluster_test.py +170 -0
  36. xpk/core/commands.py +4 -10
  37. xpk/core/config.py +88 -7
  38. xpk/core/config_test.py +67 -11
  39. xpk/core/docker_container.py +3 -1
  40. xpk/core/docker_image.py +10 -6
  41. xpk/core/docker_resources.py +1 -10
  42. xpk/core/gcloud_context.py +18 -12
  43. xpk/core/gcloud_context_test.py +111 -1
  44. xpk/core/kjob.py +17 -19
  45. xpk/core/kueue_manager.py +205 -51
  46. xpk/core/kueue_manager_test.py +158 -4
  47. xpk/core/nap.py +13 -14
  48. xpk/core/nodepool.py +37 -43
  49. xpk/core/nodepool_test.py +42 -19
  50. xpk/core/pathways.py +23 -0
  51. xpk/core/pathways_test.py +57 -0
  52. xpk/core/resources.py +84 -27
  53. xpk/core/scheduling.py +144 -133
  54. xpk/core/scheduling_test.py +298 -6
  55. xpk/core/system_characteristics.py +256 -19
  56. xpk/core/system_characteristics_test.py +128 -5
  57. xpk/core/telemetry.py +263 -0
  58. xpk/core/telemetry_test.py +211 -0
  59. xpk/core/vertex.py +4 -3
  60. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  61. xpk/main.py +33 -13
  62. xpk/parser/cluster.py +40 -67
  63. xpk/parser/cluster_test.py +83 -3
  64. xpk/parser/common.py +84 -0
  65. xpk/parser/storage.py +10 -0
  66. xpk/parser/storage_test.py +47 -0
  67. xpk/parser/workload.py +14 -29
  68. xpk/parser/workload_test.py +3 -49
  69. xpk/telemetry_uploader.py +29 -0
  70. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  71. xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
  72. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
  73. xpk/utils/console.py +41 -10
  74. xpk/utils/console_test.py +106 -0
  75. xpk/utils/feature_flags.py +10 -1
  76. xpk/utils/file.py +4 -1
  77. xpk/utils/topology.py +4 -0
  78. xpk/utils/user_agent.py +35 -0
  79. xpk/utils/user_agent_test.py +44 -0
  80. xpk/utils/user_input.py +48 -0
  81. xpk/utils/user_input_test.py +92 -0
  82. xpk/utils/validation.py +2 -13
  83. xpk/utils/versions.py +31 -0
  84. xpk-0.16.0.dist-info/METADATA +127 -0
  85. xpk-0.16.0.dist-info/RECORD +168 -0
  86. xpk-0.14.4.dist-info/METADATA +0 -1645
  87. xpk-0.14.4.dist-info/RECORD +0 -139
  88. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  89. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  90. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  91. {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,129 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ ---
16
+ !Blueprint
17
+ blueprint_name: xpk-gke-a3-megagpu
18
+ toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
19
+ toolkit_modules_version: v1.62.2
20
+
21
+ vars:
22
+ project_id: "foo"
23
+ deployment_name: xpk-gke-a3-megagpu
24
+ region: us-central1
25
+ zone: us-central1-c
26
+ labels: {gke_product_type: xpk}
27
+
28
+ deployment_groups:
29
+ - !DeploymentGroup
30
+ group: primary
31
+ modules:
32
+ - !DeploymentModule
33
+ id: network1
34
+ source: modules/network/vpc
35
+ settings:
36
+ subnetwork_name: bar-xpk-gke-a3-megagpu-subnet
37
+ secondary_ranges:
38
+ bar-xpk-gke-a3-megagpu-subnet:
39
+ - range_name: pods
40
+ ip_cidr_range: 10.4.0.0/14
41
+ - range_name: services
42
+ ip_cidr_range: 10.0.32.0/20
43
+ - !DeploymentModule
44
+ id: gpunets
45
+ source: modules/network/multivpc
46
+ settings:
47
+ network_name_prefix: bar-gpunet
48
+ global_ip_address_range: 192.169.0.0/16
49
+ network_count: 8
50
+ subnetwork_cidr_suffix: 24
51
+ - !DeploymentModule
52
+ id: gke_cluster
53
+ source: modules/scheduler/gke-cluster
54
+ use: [network1, gpunets]
55
+ settings:
56
+ release_channel: RAPID
57
+ version_prefix: '1.2'
58
+ min_master_version: 1.2.3
59
+ prefix_with_deployment_name: false
60
+ name_suffix: bar
61
+ enable_private_endpoint: false
62
+ enable_gcsfuse_csi: true
63
+ enable_filestore_csi: true
64
+ master_authorized_networks:
65
+ - cidr_block: 10.0.0.0/32 # Allows your machine to run kubectl commands. It's required for the multi-network setup.
66
+ display_name: "kubectl-access-network"
67
+ system_node_pool_machine_type: "e2-standard-32"
68
+ system_node_pool_node_count:
69
+ total_min_nodes: 5
70
+ total_max_nodes: 1000
71
+ k8s_network_names:
72
+ gvnic_prefix: "bar-gpunet-"
73
+ gvnic_postfix: "-subnet"
74
+ gvnic_start_index: 0
75
+ outputs: [instructions]
76
+
77
+ - !DeploymentModule
78
+ id: a3_megagpu_pool_0
79
+ source: modules/compute/gke-node-pool
80
+ use: [gke_cluster, gpunets]
81
+ settings:
82
+ name: bar-a3-megagpu-pool-0
83
+ machine_type: a3-megagpu-8g
84
+ zones: [us-central1-c]
85
+ host_maintenance_interval: 'PERIODIC'
86
+ reservation_affinity:
87
+ consume_reservation_type: SPECIFIC_RESERVATION
88
+ specific_reservations:
89
+ - name: test-reservation
90
+ run_workload_script: false
91
+ spot: false
92
+ max_pods_per_node: 32
93
+ guest_accelerator:
94
+ - type: nvidia-h100-mega-80gb
95
+ count: 8
96
+ gpu_driver_installation_config:
97
+ gpu_driver_version: "LATEST"
98
+ auto_upgrade: true
99
+ static_node_count: 2
100
+ placement_policy:
101
+ type: COMPACT
102
+ name: test-reservation-placement
103
+ outputs: [instructions]
104
+
105
+ - !DeploymentModule
106
+ id: workload_component_install
107
+ source: modules/management/kubectl-apply
108
+ use: [gke_cluster]
109
+ settings:
110
+ jobset:
111
+ install: true
112
+ version: v0.7.2
113
+ apply_manifests:
114
+ - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/storage_crd.yaml
115
+
116
+ - !DeploymentModule
117
+ id: workload_configmap
118
+ source: modules/management/kubectl-apply
119
+ use: [gke_cluster]
120
+ settings:
121
+ apply_manifests:
122
+ - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/config-map.yaml.tftpl
123
+ template_vars: {
124
+ resource_config_name: "bar-resources-configmap",
125
+ num_nodes: "2",
126
+ cluster_config_name: "bar-metadata-configmap",
127
+ capacity_type: "reservation",
128
+ reservation: "test-reservation",
129
+ }
@@ -0,0 +1,125 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ ---
16
+ !Blueprint
17
+ blueprint_name: xpk-gke-a3-megagpu
18
+ toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
19
+ toolkit_modules_version: v1.62.2
20
+
21
+ vars:
22
+ project_id: "foo"
23
+ deployment_name: xpk-gke-a3-megagpu
24
+ region: us-central1
25
+ zone: us-central1-c
26
+ labels: {gke_product_type: xpk}
27
+
28
+ deployment_groups:
29
+ - !DeploymentGroup
30
+ group: primary
31
+ modules:
32
+ - !DeploymentModule
33
+ id: network1
34
+ source: modules/network/vpc
35
+ settings:
36
+ subnetwork_name: bar-xpk-gke-a3-megagpu-subnet
37
+ secondary_ranges:
38
+ bar-xpk-gke-a3-megagpu-subnet:
39
+ - range_name: pods
40
+ ip_cidr_range: 10.4.0.0/14
41
+ - range_name: services
42
+ ip_cidr_range: 10.0.32.0/20
43
+ - !DeploymentModule
44
+ id: gpunets
45
+ source: modules/network/multivpc
46
+ settings:
47
+ network_name_prefix: bar-gpunet
48
+ global_ip_address_range: 192.169.0.0/16
49
+ network_count: 8
50
+ subnetwork_cidr_suffix: 24
51
+ - !DeploymentModule
52
+ id: gke_cluster
53
+ source: modules/scheduler/gke-cluster
54
+ use: [network1, gpunets]
55
+ settings:
56
+ release_channel: RAPID
57
+ version_prefix: '1.2'
58
+ min_master_version: 1.2.3
59
+ prefix_with_deployment_name: false
60
+ name_suffix: bar
61
+ enable_private_endpoint: false
62
+ enable_gcsfuse_csi: true
63
+ enable_filestore_csi: true
64
+ master_authorized_networks:
65
+ - cidr_block: 10.0.0.0/32 # Allows your machine to run kubectl commands. It's required for the multi-network setup.
66
+ display_name: "kubectl-access-network"
67
+ system_node_pool_machine_type: "e2-standard-32"
68
+ system_node_pool_node_count:
69
+ total_min_nodes: 5
70
+ total_max_nodes: 1000
71
+ k8s_network_names:
72
+ gvnic_prefix: "bar-gpunet-"
73
+ gvnic_postfix: "-subnet"
74
+ gvnic_start_index: 0
75
+ outputs: [instructions]
76
+
77
+ - !DeploymentModule
78
+ id: a3_megagpu_pool_0
79
+ source: modules/compute/gke-node-pool
80
+ use: [gke_cluster, gpunets]
81
+ settings:
82
+ name: bar-a3-megagpu-pool-0
83
+ machine_type: a3-megagpu-8g
84
+ zones: [us-central1-c]
85
+ host_maintenance_interval: PERIODIC
86
+ reservation_affinity:
87
+ consume_reservation_type: NO_RESERVATION
88
+ specific_reservations: []
89
+ run_workload_script: false
90
+ max_pods_per_node: 32
91
+ spot: true
92
+ guest_accelerator:
93
+ - type: nvidia-h100-mega-80gb
94
+ count: 8
95
+ gpu_driver_installation_config:
96
+ gpu_driver_version: "LATEST"
97
+ auto_upgrade: true
98
+ static_node_count: 2
99
+ outputs: [instructions]
100
+
101
+ - !DeploymentModule
102
+ id: workload_component_install
103
+ source: modules/management/kubectl-apply
104
+ use: [gke_cluster]
105
+ settings:
106
+ jobset:
107
+ install: true
108
+ version: v0.7.2
109
+ apply_manifests:
110
+ - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/storage_crd.yaml
111
+
112
+ - !DeploymentModule
113
+ id: workload_configmap
114
+ source: modules/management/kubectl-apply
115
+ use: [gke_cluster]
116
+ settings:
117
+ apply_manifests:
118
+ - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/config-map.yaml.tftpl
119
+ template_vars: {
120
+ resource_config_name: "bar-resources-configmap",
121
+ num_nodes: "2",
122
+ cluster_config_name: "bar-metadata-configmap",
123
+ capacity_type: "spot",
124
+ reservation: "None",
125
+ }
@@ -0,0 +1,173 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ !Blueprint
15
+ blueprint_name: xpk-gke-a3-ultra
16
+ toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
17
+ toolkit_modules_version: v1.62.2
18
+
19
+ vars:
20
+ labels: {gke_product_type: xpk}
21
+
22
+
23
+ terraform_backend_defaults:
24
+ type: gcs
25
+ configuration:
26
+ bucket: test-bucket
27
+ prefix: xpk_terraform_state/testdir/gke-a3-ultra/
28
+
29
+ deployment_groups:
30
+ - !DeploymentGroup
31
+ group: primary
32
+ modules:
33
+ - !DeploymentModule
34
+ id: gke-a3-ultra-net-0
35
+ source: modules/network/vpc
36
+ settings:
37
+ network_name: gke-a3-ultra-net-0
38
+ subnetworks:
39
+ - subnet_name: gke-a3-ultra-sub-0
40
+ subnet_region: us-central1
41
+ subnet_ip: 192.168.0.0/18
42
+ secondary_ranges_list:
43
+ - subnetwork_name : gke-a3-ultra-sub-0
44
+ ranges:
45
+ - range_name: pods
46
+ ip_cidr_range: 10.4.0.0/14
47
+ - range_name: services
48
+ ip_cidr_range: 10.0.32.0/20
49
+ firewall_rules:
50
+ - name: gke-a3-ultra-internal-0
51
+ ranges: [192.168.0.0/16]
52
+ allow:
53
+ - protocol: tcp
54
+ ports: ["0-65535"]
55
+ - protocol: udp
56
+ ports: ["0-65535"]
57
+ - protocol: icmp
58
+
59
+ - !DeploymentModule
60
+ id: gke-a3-ultra-net-1
61
+ source: modules/network/vpc
62
+ settings:
63
+ network_name: gke-a3-ultra-net-1
64
+ mtu: 8896
65
+ subnetworks:
66
+ - subnet_name: gke-a3-ultra-sub-1
67
+ subnet_region: us-central1
68
+ subnet_ip: 192.168.64.0/18
69
+ firewall_rules:
70
+ - name: gke-a3-ultra-internal-1
71
+ ranges: [192.168.0.0/16]
72
+ allow:
73
+ - protocol: tcp
74
+ ports: ["0-65535"]
75
+ - protocol: udp
76
+ ports: ["0-65535"]
77
+ - protocol: icmp
78
+
79
+ - !DeploymentModule
80
+ id: gke-a3-ultra-rdma-net
81
+ source: modules/network/gpu-rdma-vpc
82
+ settings:
83
+ network_name: gke-a3-ultra-rdma-net
84
+ mtu: 8896
85
+ network_profile: https://www.googleapis.com/compute/beta/projects/foo/global/networkProfiles/us-central1-c-vpc-roce
86
+ network_routing_mode: REGIONAL
87
+ subnetworks_template:
88
+ name_prefix: gke-a3-ultra-rdma-sub
89
+ count: 8
90
+ ip_range: 192.168.128.0/18
91
+ region: us-central1
92
+
93
+ - !DeploymentModule
94
+ id: gke-a3-ultra-a3-ultragpu-cluster
95
+ source: modules/scheduler/gke-cluster
96
+ use: [gke-a3-ultra-net-0]
97
+ settings:
98
+ release_channel: RAPID
99
+ version_prefix: '1.2'
100
+ min_cluster_version: 1.2.3
101
+ prefix_with_deployment_name: false
102
+ name_suffix: gke-a3-ultra
103
+ system_node_pool_machine_type: "e2-standard-16"
104
+ enable_dcgm_monitoring: true
105
+ enable_gcsfuse_csi: true
106
+ enable_filestore_csi: true
107
+ enable_private_endpoint: false # Allows access from authorized public IPs
108
+ master_authorized_networks:
109
+ - cidr_block: 10.0.0.0/32 # Allows your machine to run the kubectl command. Required for multi network setup.
110
+ display_name: "kubectl-access-network"
111
+ system_node_pool_node_count:
112
+ total_min_nodes: 2
113
+ total_max_nodes: 1000
114
+ additional_networks: $(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
115
+ k8s_network_names:
116
+ rdma_prefix: "gke-a3-ultra-rdma-sub-"
117
+ rdma_start_index: 0
118
+ rdma_postfix: ""
119
+ gvnic_prefix: "gke-a3-ultra-sub-"
120
+ gvnic_start_index: 1
121
+ outputs: [instructions]
122
+
123
+ - !DeploymentModule
124
+ id: gke-a3-ultra-a3u-pool
125
+ source: modules/compute/gke-node-pool
126
+ use: [gke-a3-ultra-a3-ultragpu-cluster]
127
+ settings:
128
+ machine_type: a3-ultragpu-8g
129
+ auto_upgrade: true
130
+ zones: [us-central1-c]
131
+ spot: false
132
+ max_pods_per_node: 32
133
+ guest_accelerator:
134
+ - type: nvidia-h200-141gb
135
+ count: 8
136
+ gpu_driver_installation_config:
137
+ gpu_driver_version: "LATEST"
138
+ additional_networks:
139
+ $(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
140
+ reservation_affinity:
141
+ consume_reservation_type: SPECIFIC_RESERVATION
142
+ specific_reservations:
143
+ - name: test-reservation
144
+ static_node_count: 2
145
+ outputs: [instructions]
146
+
147
+ - !DeploymentModule
148
+ id: workload-manager-install
149
+ source: modules/management/kubectl-apply
150
+ use: [gke-a3-ultra-a3-ultragpu-cluster]
151
+ settings:
152
+ jobset:
153
+ install: true
154
+ version: v0.7.2
155
+ apply_manifests:
156
+ - source: $(ghpc_stage("xpk-gke-a3-ultra"))/nccl-installer.yaml
157
+ - source: $(ghpc_stage("xpk-gke-a3-ultra"))/mlgru-disable.yaml
158
+ - source: $(ghpc_stage("xpk-gke-a3-ultra"))/storage_crd.yaml
159
+
160
+ - !DeploymentModule
161
+ id: workload_configmap
162
+ source: 'modules/management/kubectl-apply'
163
+ use: ['gke-a3-ultra-a3-ultragpu-cluster']
164
+ settings:
165
+ apply_manifests:
166
+ - source: '$(ghpc_stage("xpk-gke-a3-ultra"))/config-map.yaml.tftpl'
167
+ template_vars: {
168
+ resource_config_name: "gke-a3-ultra-resources-configmap",
169
+ num_nodes: "2",
170
+ cluster_config_name: "gke-a3-ultra-metadata-configmap",
171
+ capacity_type: "reservation",
172
+ reservation: "test-reservation",
173
+ }
@@ -0,0 +1,185 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ !Blueprint
15
+ blueprint_name: xpk-gke-a4
16
+ toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
17
+ toolkit_modules_version: v1.62.2
18
+
19
+ vars:
20
+ labels: {gke_product_type: xpk}
21
+
22
+
23
+ terraform_backend_defaults:
24
+ type: gcs
25
+ configuration:
26
+ bucket: test-bucket
27
+ prefix: xpk_terraform_state/testdir/gke-a4/
28
+
29
+ deployment_groups:
30
+ - !DeploymentGroup
31
+ modules:
32
+ - !DeploymentModule
33
+ id: gke-a4-net-0
34
+ source: modules/network/vpc
35
+ settings:
36
+ network_name: gke-a4-net-0
37
+ mtu: 8896
38
+ subnetworks:
39
+ - subnet_name: gke-a4-sub-0
40
+ subnet_region: us-central1
41
+ subnet_ip: 192.168.0.0/18
42
+ secondary_ranges_list:
43
+ - subnetwork_name: gke-a4-sub-0
44
+ ranges:
45
+ - range_name: pods
46
+ ip_cidr_range: 10.4.0.0/14
47
+ - range_name: services
48
+ ip_cidr_range: 10.0.32.0/20
49
+ firewall_rules:
50
+ - name: gke-a4-internal-0
51
+ ranges:
52
+ - 192.168.0.0/16
53
+ allow:
54
+ - protocol: tcp
55
+ ports:
56
+ - 0-65535
57
+ - protocol: udp
58
+ ports:
59
+ - 0-65535
60
+ - protocol: icmp
61
+
62
+ - !DeploymentModule
63
+ id: gke-a4-net-1
64
+ source: modules/network/vpc
65
+ settings:
66
+ network_name: gke-a4-net-1
67
+ mtu: 8896
68
+ subnetworks:
69
+ - subnet_name: gke-a4-sub-1
70
+ subnet_region: us-central1
71
+ subnet_ip: 192.168.64.0/18
72
+ firewall_rules:
73
+ - name: gke-a4-internal-1
74
+ ranges:
75
+ - 192.168.0.0/16
76
+ allow:
77
+ - protocol: tcp
78
+ ports:
79
+ - 0-65535
80
+ - protocol: udp
81
+ ports:
82
+ - 0-65535
83
+ - protocol: icmp
84
+
85
+ - !DeploymentModule
86
+ id: gke-a4-rdma-net
87
+ source: modules/network/gpu-rdma-vpc
88
+ settings:
89
+ network_name: gke-a4-rdma-net
90
+ mtu: 8896
91
+ network_profile:
92
+ https://www.googleapis.com/compute/beta/projects/foo/global/networkProfiles/us-central1-c-vpc-roce
93
+ network_routing_mode: REGIONAL
94
+ subnetworks_template:
95
+ name_prefix: gke-a4-rdma-sub
96
+ count: 8
97
+ ip_range: 192.168.128.0/18
98
+ region: us-central1
99
+
100
+ - !DeploymentModule
101
+ id: gke-a4-a4-cluster
102
+ source: modules/scheduler/gke-cluster
103
+ outputs:
104
+ - instructions
105
+ settings:
106
+ system_node_pool_machine_type: e2-standard-16
107
+ system_node_pool_node_count:
108
+ total_min_nodes: 2
109
+ total_max_nodes: 1000
110
+ prefix_with_deployment_name: false
111
+ name_suffix: gke-a4
112
+ enable_dcgm_monitoring: true
113
+ enable_gcsfuse_csi: true
114
+ enable_private_endpoint: false
115
+ master_authorized_networks:
116
+ - cidr_block: 10.0.0.0/32
117
+ display_name: kubectl-access-network
118
+ additional_networks: $(concat([{network=gke-a4-net-1.network_name, subnetwork=gke-a4-net-1.subnetwork_name,
119
+ subnetwork_project="foo", nic_type="GVNIC", queue_count=null,
120
+ network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null,
121
+ network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a4-rdma-net.subnetwork_interfaces_gke))
122
+ release_channel: RAPID
123
+ version_prefix: '1.2'
124
+ min_cluster_version: 1.2.3
125
+ use:
126
+ - gke-a4-net-0
127
+ - !DeploymentModule
128
+ id: gke-a4-a4-pool
129
+ source: modules/compute/gke-node-pool
130
+ use:
131
+ - gke-a4-a4-cluster
132
+ outputs:
133
+ - instructions
134
+ settings:
135
+ machine_type: a4-highgpu-8g
136
+ auto_upgrade: true
137
+ zones:
138
+ - us-central1-c
139
+ disk_type: hyperdisk-balanced
140
+ local_ssd_count_ephemeral_storage: 32
141
+ spot: false
142
+ reservation_affinity:
143
+ consume_reservation_type: SPECIFIC_RESERVATION
144
+ specific_reservations:
145
+ - name: test-reservation
146
+ max_pods_per_node: 32
147
+ guest_accelerator:
148
+ - type: nvidia-b200
149
+ count: 8
150
+ gpu_driver_installation_config:
151
+ gpu_driver_version: LATEST
152
+ additional_networks: $(concat([{network=gke-a4-net-1.network_name, subnetwork=gke-a4-net-1.subnetwork_name,
153
+ subnetwork_project="foo", nic_type="GVNIC", queue_count=null,
154
+ network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null,
155
+ network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a4-rdma-net.subnetwork_interfaces_gke))
156
+ static_node_count: 2
157
+
158
+ - !DeploymentModule
159
+ id: workload-manager-install
160
+ source: modules/management/kubectl-apply
161
+ use:
162
+ - gke-a4-a4-cluster
163
+ settings:
164
+ jobset:
165
+ install: true
166
+ version: v0.7.2
167
+ apply_manifests:
168
+ - source: $(ghpc_stage("xpk-gke-a4"))/nccl-rdma-installer-a4.yaml
169
+ - source: $(ghpc_stage("xpk-gke-a4"))/storage_crd.yaml
170
+
171
+ - !DeploymentModule
172
+ id: workload_configmap
173
+ source: modules/management/kubectl-apply
174
+ settings:
175
+ apply_manifests:
176
+ - source: $(ghpc_stage("xpk-gke-a4"))/config-map.yaml.tftpl
177
+ template_vars:
178
+ resource_config_name: gke-a4-resources-configmap
179
+ num_nodes: '2'
180
+ cluster_config_name: gke-a4-metadata-configmap
181
+ capacity_type: reservation
182
+ reservation: test-reservation
183
+ use:
184
+ - gke-a4-a4-cluster
185
+ group: primary