xpk 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. integration/README.md +19 -0
  2. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  3. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  4. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  5. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  6. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  7. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  8. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  9. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  10. xpk/blueprints/a4/storage_crd.yaml +52 -0
  11. xpk/commands/cluster.py +33 -12
  12. xpk/commands/cluster_gcluster_test.py +5 -1
  13. xpk/commands/cluster_test.py +125 -0
  14. xpk/commands/config.py +3 -3
  15. xpk/commands/inspector.py +5 -3
  16. xpk/commands/kind.py +2 -0
  17. xpk/commands/managed_ml_diagnostics.py +249 -0
  18. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  19. xpk/commands/workload.py +125 -139
  20. xpk/commands/workload_test.py +160 -118
  21. xpk/core/blueprint/blueprint_generator.py +3 -0
  22. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  23. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  24. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  25. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  26. xpk/core/capacity.py +2 -0
  27. xpk/core/cluster.py +18 -47
  28. xpk/core/cluster_test.py +76 -1
  29. xpk/core/config.py +81 -7
  30. xpk/core/config_test.py +67 -11
  31. xpk/core/docker_container.py +3 -1
  32. xpk/core/docker_image.py +10 -6
  33. xpk/core/docker_resources.py +1 -10
  34. xpk/core/kjob.py +17 -16
  35. xpk/core/kueue_manager.py +13 -19
  36. xpk/core/kueue_manager_test.py +27 -1
  37. xpk/core/nap.py +13 -14
  38. xpk/core/nodepool.py +17 -15
  39. xpk/core/nodepool_test.py +25 -4
  40. xpk/core/pathways.py +23 -0
  41. xpk/core/pathways_test.py +57 -0
  42. xpk/core/resources.py +84 -27
  43. xpk/core/scheduling.py +128 -132
  44. xpk/core/scheduling_test.py +215 -2
  45. xpk/core/system_characteristics.py +179 -0
  46. xpk/core/system_characteristics_test.py +49 -1
  47. xpk/core/telemetry.py +4 -4
  48. xpk/core/telemetry_test.py +9 -9
  49. xpk/core/vertex.py +4 -3
  50. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  51. xpk/main.py +2 -0
  52. xpk/parser/cluster.py +22 -88
  53. xpk/parser/cluster_test.py +41 -0
  54. xpk/parser/common.py +84 -0
  55. xpk/parser/storage.py +10 -0
  56. xpk/parser/storage_test.py +47 -0
  57. xpk/parser/workload.py +14 -41
  58. xpk/parser/workload_test.py +2 -48
  59. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  60. xpk/utils/feature_flags.py +3 -0
  61. xpk/utils/validation.py +2 -2
  62. xpk-0.16.1.dist-info/METADATA +127 -0
  63. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/RECORD +67 -48
  64. xpk-0.15.0.dist-info/METADATA +0 -1666
  65. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/WHEEL +0 -0
  66. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/top_level.txt +0 -0
@@ -16,10 +16,13 @@ limitations under the License.
16
16
 
17
17
  import dataclasses
18
18
  from unittest.mock import MagicMock, patch
19
+ import yaml
19
20
  import pytest
20
- from ..core.system_characteristics import SystemCharacteristics, AcceleratorType
21
- from .workload import _validate_sub_slicing_topology, _validate_sub_slicing_availability
22
- from packaging.version import Version
21
+
22
+ from ..core.scheduling import WorkloadScheduling
23
+ from ..core.system_characteristics import DockerPlatform, SystemCharacteristics, AcceleratorType, UserFacingNameToSystemCharacteristics, GpuConfig
24
+ from .workload import workload_create
25
+ from .cluster_test import construct_args
23
26
 
24
27
 
25
28
  SYSTEM_CHARACTERISTICS = SystemCharacteristics(
@@ -32,133 +35,172 @@ SYSTEM_CHARACTERISTICS = SystemCharacteristics(
32
35
  device_type='l4-1',
33
36
  supports_sub_slicing=True,
34
37
  requires_workload_policy=False,
38
+ docker_platform=DockerPlatform.AMD,
35
39
  )
36
40
 
37
41
 
38
- @pytest.fixture(autouse=True)
42
+ @dataclasses.dataclass
43
+ class _WorkloadCreateMocks:
44
+ """Holds all the mocked dependencies for the workload_create function."""
45
+
46
+ get_user_workload_container: MagicMock
47
+ get_gpu_scheduler: MagicMock
48
+ get_storages_to_mount: MagicMock
49
+ add_bucket_iam_members: MagicMock
50
+ get_gke_outlier_dashboard: MagicMock
51
+ check_if_workload_exists: MagicMock
52
+ get_cluster_configmap: MagicMock
53
+ check_if_workload_can_schedule: MagicMock
54
+ setup_k8s_env: MagicMock
55
+ setup_k8s_service_accounts: MagicMock
56
+ validate_dependencies_list: MagicMock
57
+ write_tmp_file: MagicMock
58
+ get_cluster_capacity_type: MagicMock
59
+ is_TAS_possible: MagicMock
60
+ get_cluster_location: MagicMock
61
+ xpk_exit: MagicMock
62
+ run_command_with_updates: MagicMock
63
+ ensure_resource_policy_exists: MagicMock
64
+ get_cluster_subnetworks: MagicMock
65
+
66
+
67
+ @pytest.fixture
39
68
  def xpk_print(mocker):
40
69
  return mocker.patch('xpk.commands.workload.xpk_print')
41
70
 
42
71
 
43
- def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
44
- xpk_print: MagicMock,
45
- ):
46
- with pytest.raises(SystemExit):
47
- _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '2x1')
48
-
49
- assert (
50
- 'shape is invalid. It has to be one of' in xpk_print.mock_calls[0].args[0]
51
- )
52
-
53
-
54
- def test_validate_sub_slicing_topology_exits_for_too_large_topology(
55
- xpk_print: MagicMock,
56
- ):
57
- with pytest.raises(SystemExit):
58
- _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '16x16')
59
-
60
- assert (
61
- 'shape is too large. The shape cannot be'
62
- in xpk_print.mock_calls[0].args[0]
63
- )
64
-
65
-
66
- def test_validate_sub_slicing_topology_does_nothing_for_supported_topology():
67
- _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4')
68
-
69
-
70
- def test_validate_sub_slicing_availability_exits_when_getting_topologies_fails(
71
- xpk_print: MagicMock, mocker
72
- ):
73
- mocker.patch(
74
- 'xpk.commands.workload.has_sub_slicing_enabled',
75
- return_value=(1, None),
76
- )
77
- with pytest.raises(SystemExit):
78
- _validate_sub_slicing_availability()
79
-
80
- assert (
81
- 'Unable to validate sub-slicing support'
82
- in xpk_print.mock_calls[0].args[0]
83
- )
84
-
85
-
86
- def test_validate_sub_slicing_availability_exits_when_subslicing_topology_is_not_defined(
87
- xpk_print: MagicMock, mocker
88
- ):
89
- mocker.patch(
90
- 'xpk.commands.workload.has_sub_slicing_enabled',
91
- return_value=(0, False),
92
- )
93
- with pytest.raises(SystemExit):
94
- _validate_sub_slicing_availability()
95
-
96
- assert (
97
- 'Cluster has not been not set up for Sub-slicing.'
98
- in xpk_print.mock_calls[0].args[0]
72
+ @pytest.fixture
73
+ def workload_create_mocks(mocker) -> _WorkloadCreateMocks:
74
+ """Mocks all dependencies for the workload_create function."""
75
+ return _WorkloadCreateMocks(
76
+ get_user_workload_container=mocker.patch(
77
+ 'xpk.commands.workload.get_user_workload_container',
78
+ return_value=('', None),
79
+ ),
80
+ get_gpu_scheduler=mocker.patch(
81
+ 'xpk.commands.workload.get_gpu_scheduler', return_value=('', 0)
82
+ ),
83
+ get_storages_to_mount=mocker.patch(
84
+ 'xpk.commands.workload.get_storages_to_mount', return_value=[]
85
+ ),
86
+ add_bucket_iam_members=mocker.patch(
87
+ 'xpk.commands.workload.add_bucket_iam_members'
88
+ ),
89
+ get_gke_outlier_dashboard=mocker.patch(
90
+ 'xpk.commands.workload.get_gke_outlier_dashboard'
91
+ ),
92
+ check_if_workload_exists=mocker.patch(
93
+ 'xpk.commands.workload.check_if_workload_exists', return_value=False
94
+ ),
95
+ get_cluster_configmap=mocker.patch(
96
+ 'xpk.commands.workload.get_cluster_configmap', return_value={}
97
+ ),
98
+ check_if_workload_can_schedule=mocker.patch(
99
+ 'xpk.commands.workload.check_if_workload_can_schedule',
100
+ return_value=WorkloadScheduling.AVAILABLE,
101
+ ),
102
+ setup_k8s_env=mocker.patch('xpk.commands.workload.setup_k8s_env'),
103
+ setup_k8s_service_accounts=mocker.patch(
104
+ 'xpk.commands.workload.setup_k8s_service_accounts'
105
+ ),
106
+ validate_dependencies_list=mocker.patch(
107
+ 'xpk.commands.workload.validate_dependencies_list'
108
+ ),
109
+ write_tmp_file=mocker.patch('xpk.commands.workload.write_tmp_file'),
110
+ get_cluster_capacity_type=mocker.patch(
111
+ 'xpk.commands.workload.get_cluster_capacity_type',
112
+ return_value='on-demand',
113
+ ),
114
+ is_TAS_possible=mocker.patch(
115
+ 'xpk.commands.workload.is_TAS_possible', return_value=False
116
+ ),
117
+ get_cluster_location=mocker.patch(
118
+ 'xpk.commands.workload.get_cluster_location',
119
+ return_value='us-central1',
120
+ ),
121
+ xpk_exit=mocker.patch('xpk.commands.workload.xpk_exit'),
122
+ run_command_with_updates=mocker.patch(
123
+ 'xpk.commands.workload.run_command_with_updates', return_value=0
124
+ ),
125
+ ensure_resource_policy_exists=mocker.patch(
126
+ 'xpk.commands.workload.ensure_resource_policy_exists'
127
+ ),
128
+ get_cluster_subnetworks=mocker.patch(
129
+ 'xpk.commands.workload.get_cluster_subnetworks', return_value=[]
130
+ ),
99
131
  )
100
132
 
101
133
 
102
- def test_validate_sub_slicing_availability_exits_when_kueue_version_cannot_be_determined(
103
- xpk_print: MagicMock, mocker
134
+ def test_workload_create_for_a4x_has_arm_toleration(
135
+ workload_create_mocks: _WorkloadCreateMocks,
104
136
  ):
105
- mocker.patch(
106
- 'xpk.commands.workload.has_sub_slicing_enabled',
107
- return_value=(0, True),
137
+ """Tests that the generated YAML for an A4X workload has arm64 toleration."""
138
+ # Copy and overwrite the decorator with a no-op lambda.
139
+ gb200_system_chars = UserFacingNameToSystemCharacteristics['gb200-4']
140
+ gb200_system_chars_no_decorator = dataclasses.replace(
141
+ gb200_system_chars,
142
+ gpu_config=GpuConfig(
143
+ requires_topology=False, jobset_decorator_fn=lambda yml, *_: yml
144
+ ),
108
145
  )
109
- mocker.patch(
110
- 'xpk.commands.workload.get_installed_kueue_version',
111
- return_value=(1, None),
112
- )
113
- with pytest.raises(SystemExit):
114
- _validate_sub_slicing_availability()
115
-
116
- assert 'Unable to validate sub-slicing' in xpk_print.mock_calls[0].args[0]
146
+ # Patch the function that returns the system characteristics
147
+ # to return our modified object.
148
+ with patch(
149
+ 'xpk.commands.workload.get_system_characteristics',
150
+ return_value=(gb200_system_chars_no_decorator, 0),
151
+ ):
152
+ args = construct_args(
153
+ device_type='gb200-4',
154
+ workload='test-workload',
155
+ command='echo hello',
156
+ num_nodes=1,
157
+ restart_on_exit_codes=None,
158
+ )
159
+ workload_create(args)
160
+
161
+ assert workload_create_mocks.write_tmp_file.called
162
+ yaml_content = workload_create_mocks.write_tmp_file.call_args[0][0]
163
+ jobset = yaml.safe_load(yaml_content)
164
+
165
+ tolerations = jobset['spec']['replicatedJobs'][0]['template']['spec'][
166
+ 'template'
167
+ ]['spec']['tolerations']
168
+ assert {
169
+ 'key': 'kubernetes.io/arch',
170
+ 'operator': 'Equal',
171
+ 'value': 'arm64',
172
+ 'effect': 'NoSchedule',
173
+ } in tolerations
174
+
175
+
176
+ def test_workload_create_dry_run_with_output_file(mocker):
177
+ args = MagicMock()
178
+ args.workload = 'test-workload'
179
+ args.output_manifest_file = 'manifest.yaml'
180
+ args.use_pathways = False
181
+ args.use_vertex_tensorboard = False
182
+ args.project = 'test-project'
183
+ args.cluster = 'test-cluster'
184
+ args.zone = 'test-zone'
185
+ args.sub_slicing_topology = None
186
+
187
+ # Mock dependencies to avoid external calls and simulate state
188
+ mocker.patch('xpk.utils.execution_context.dry_run', True)
189
+ mocks = {
190
+ 'get_system_characteristics': (SYSTEM_CHARACTERISTICS, 0),
191
+ 'get_user_workload_container': ('container_yaml', None),
192
+ 'write_tmp_file': 'tmp_file',
193
+ 'parse_env_config': None,
194
+ }
195
+ for name, return_value in mocks.items():
196
+ mocker.patch(f'xpk.commands.workload.{name}', return_value=return_value)
197
+
198
+ mock_open = mocker.patch('builtins.open', mocker.mock_open())
117
199
 
118
-
119
- def test_validate_sub_slicing_availability_exits_when_kueue_version_does_not_meet_minimum_requirements(
120
- xpk_print: MagicMock, mocker
121
- ):
122
- mocker.patch(
123
- 'xpk.commands.workload.has_sub_slicing_enabled',
124
- return_value=(0, True),
125
- )
126
- mocker.patch(
127
- 'xpk.commands.workload.get_installed_kueue_version',
128
- return_value=(0, Version('0.0.0')),
129
- )
130
200
  with pytest.raises(SystemExit):
131
- _validate_sub_slicing_availability()
132
-
133
- assert 'The minimal required version is' in xpk_print.mock_calls[0].args[0]
134
-
201
+ workload_create(args)
135
202
 
136
- def test_validate_sub_slicing_availability_does_nothing_when_cluster_is_correctly_configured_for_subslicing(
137
- mocker,
138
- ):
139
- mocker.patch(
140
- 'xpk.commands.workload.has_sub_slicing_enabled',
141
- return_value=(0, True),
142
- )
143
- mocker.patch(
144
- 'xpk.commands.workload.get_installed_kueue_version',
145
- return_value=(0, Version('0.13.0')),
146
- )
147
- _validate_sub_slicing_availability()
148
-
149
-
150
- @patch('xpk.commands.common.xpk_print')
151
- def test_validate_sub_slicing_topology_fails_for_unsupported_system(
152
- common_xpk_print: MagicMock,
153
- ):
154
- unsupported_system = dataclasses.replace(
155
- SYSTEM_CHARACTERISTICS, supports_sub_slicing=False
156
- )
157
-
158
- with pytest.raises(SystemExit):
159
- _validate_sub_slicing_topology(unsupported_system, '4x4')
160
-
161
- assert (
162
- 'l4-1 does not support Sub-slicing.'
163
- in common_xpk_print.mock_calls[0].args[0]
164
- )
203
+ mock_open.assert_called_once_with('manifest.yaml', 'w', encoding='utf-8')
204
+ written_content = mock_open.return_value.write.call_args[0][0]
205
+ assert 'test-workload' in written_content
206
+ assert 'cloud.google.com/gke-tpu-topology: 8x8' in written_content
@@ -31,6 +31,8 @@ from ..capacity import (
31
31
  B200_DEVICE_TYPE,
32
32
  H100_MEGA_DEVICE_TYPE,
33
33
  H200_DEVICE_TYPE,
34
+ GB200_DEVICE_TYPE,
35
+ GB200_DEVICE_TYPE_NOLSSD,
34
36
  CapacityType,
35
37
  )
36
38
  from ..system_characteristics import get_system_characteristics_by_device_type
@@ -42,6 +44,7 @@ a3high_device_type = H100_DEVICE_TYPE
42
44
  a3mega_device_type = H100_MEGA_DEVICE_TYPE
43
45
  a3ultra_device_type = H200_DEVICE_TYPE
44
46
  a4_device_type = B200_DEVICE_TYPE
47
+ a4x_device_types = (GB200_DEVICE_TYPE, GB200_DEVICE_TYPE_NOLSSD)
45
48
  supported_device_types = {
46
49
  a3mega_device_type,
47
50
  a3ultra_device_type,
@@ -0,0 +1,129 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ ---
16
+ !Blueprint
17
+ blueprint_name: xpk-gke-a3-megagpu
18
+ toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
19
+ toolkit_modules_version: v1.62.2
20
+
21
+ vars:
22
+ project_id: "foo"
23
+ deployment_name: xpk-gke-a3-megagpu
24
+ region: us-central1
25
+ zone: us-central1-c
26
+ labels: {gke_product_type: xpk}
27
+
28
+ deployment_groups:
29
+ - !DeploymentGroup
30
+ group: primary
31
+ modules:
32
+ - !DeploymentModule
33
+ id: network1
34
+ source: modules/network/vpc
35
+ settings:
36
+ subnetwork_name: bar-xpk-gke-a3-megagpu-subnet
37
+ secondary_ranges:
38
+ bar-xpk-gke-a3-megagpu-subnet:
39
+ - range_name: pods
40
+ ip_cidr_range: 10.4.0.0/14
41
+ - range_name: services
42
+ ip_cidr_range: 10.0.32.0/20
43
+ - !DeploymentModule
44
+ id: gpunets
45
+ source: modules/network/multivpc
46
+ settings:
47
+ network_name_prefix: bar-gpunet
48
+ global_ip_address_range: 192.169.0.0/16
49
+ network_count: 8
50
+ subnetwork_cidr_suffix: 24
51
+ - !DeploymentModule
52
+ id: gke_cluster
53
+ source: modules/scheduler/gke-cluster
54
+ use: [network1, gpunets]
55
+ settings:
56
+ release_channel: RAPID
57
+ version_prefix: '1.2'
58
+ min_master_version: 1.2.3
59
+ prefix_with_deployment_name: false
60
+ name_suffix: bar
61
+ enable_private_endpoint: false
62
+ enable_gcsfuse_csi: true
63
+ enable_filestore_csi: true
64
+ master_authorized_networks:
65
+ - cidr_block: 10.0.0.0/32 # Allows your machine to run kubectl commands. It's required for the multi-network setup.
66
+ display_name: "kubectl-access-network"
67
+ system_node_pool_machine_type: "e2-standard-32"
68
+ system_node_pool_node_count:
69
+ total_min_nodes: 5
70
+ total_max_nodes: 1000
71
+ k8s_network_names:
72
+ gvnic_prefix: "bar-gpunet-"
73
+ gvnic_postfix: "-subnet"
74
+ gvnic_start_index: 0
75
+ outputs: [instructions]
76
+
77
+ - !DeploymentModule
78
+ id: a3_megagpu_pool_0
79
+ source: modules/compute/gke-node-pool
80
+ use: [gke_cluster, gpunets]
81
+ settings:
82
+ name: bar-a3-megagpu-pool-0
83
+ machine_type: a3-megagpu-8g
84
+ zones: [us-central1-c]
85
+ host_maintenance_interval: 'PERIODIC'
86
+ reservation_affinity:
87
+ consume_reservation_type: SPECIFIC_RESERVATION
88
+ specific_reservations:
89
+ - name: test-reservation
90
+ run_workload_script: false
91
+ spot: false
92
+ max_pods_per_node: 32
93
+ guest_accelerator:
94
+ - type: nvidia-h100-mega-80gb
95
+ count: 8
96
+ gpu_driver_installation_config:
97
+ gpu_driver_version: "LATEST"
98
+ auto_upgrade: true
99
+ static_node_count: 2
100
+ placement_policy:
101
+ type: COMPACT
102
+ name: test-reservation-placement
103
+ outputs: [instructions]
104
+
105
+ - !DeploymentModule
106
+ id: workload_component_install
107
+ source: modules/management/kubectl-apply
108
+ use: [gke_cluster]
109
+ settings:
110
+ jobset:
111
+ install: true
112
+ version: v0.7.2
113
+ apply_manifests:
114
+ - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/storage_crd.yaml
115
+
116
+ - !DeploymentModule
117
+ id: workload_configmap
118
+ source: modules/management/kubectl-apply
119
+ use: [gke_cluster]
120
+ settings:
121
+ apply_manifests:
122
+ - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/config-map.yaml.tftpl
123
+ template_vars: {
124
+ resource_config_name: "bar-resources-configmap",
125
+ num_nodes: "2",
126
+ cluster_config_name: "bar-metadata-configmap",
127
+ capacity_type: "reservation",
128
+ reservation: "test-reservation",
129
+ }
@@ -0,0 +1,125 @@
1
+ # Copyright 2024 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ ---
16
+ !Blueprint
17
+ blueprint_name: xpk-gke-a3-megagpu
18
+ toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
19
+ toolkit_modules_version: v1.62.2
20
+
21
+ vars:
22
+ project_id: "foo"
23
+ deployment_name: xpk-gke-a3-megagpu
24
+ region: us-central1
25
+ zone: us-central1-c
26
+ labels: {gke_product_type: xpk}
27
+
28
+ deployment_groups:
29
+ - !DeploymentGroup
30
+ group: primary
31
+ modules:
32
+ - !DeploymentModule
33
+ id: network1
34
+ source: modules/network/vpc
35
+ settings:
36
+ subnetwork_name: bar-xpk-gke-a3-megagpu-subnet
37
+ secondary_ranges:
38
+ bar-xpk-gke-a3-megagpu-subnet:
39
+ - range_name: pods
40
+ ip_cidr_range: 10.4.0.0/14
41
+ - range_name: services
42
+ ip_cidr_range: 10.0.32.0/20
43
+ - !DeploymentModule
44
+ id: gpunets
45
+ source: modules/network/multivpc
46
+ settings:
47
+ network_name_prefix: bar-gpunet
48
+ global_ip_address_range: 192.169.0.0/16
49
+ network_count: 8
50
+ subnetwork_cidr_suffix: 24
51
+ - !DeploymentModule
52
+ id: gke_cluster
53
+ source: modules/scheduler/gke-cluster
54
+ use: [network1, gpunets]
55
+ settings:
56
+ release_channel: RAPID
57
+ version_prefix: '1.2'
58
+ min_master_version: 1.2.3
59
+ prefix_with_deployment_name: false
60
+ name_suffix: bar
61
+ enable_private_endpoint: false
62
+ enable_gcsfuse_csi: true
63
+ enable_filestore_csi: true
64
+ master_authorized_networks:
65
+ - cidr_block: 10.0.0.0/32 # Allows your machine to run kubectl commands. It's required for the multi-network setup.
66
+ display_name: "kubectl-access-network"
67
+ system_node_pool_machine_type: "e2-standard-32"
68
+ system_node_pool_node_count:
69
+ total_min_nodes: 5
70
+ total_max_nodes: 1000
71
+ k8s_network_names:
72
+ gvnic_prefix: "bar-gpunet-"
73
+ gvnic_postfix: "-subnet"
74
+ gvnic_start_index: 0
75
+ outputs: [instructions]
76
+
77
+ - !DeploymentModule
78
+ id: a3_megagpu_pool_0
79
+ source: modules/compute/gke-node-pool
80
+ use: [gke_cluster, gpunets]
81
+ settings:
82
+ name: bar-a3-megagpu-pool-0
83
+ machine_type: a3-megagpu-8g
84
+ zones: [us-central1-c]
85
+ host_maintenance_interval: PERIODIC
86
+ reservation_affinity:
87
+ consume_reservation_type: NO_RESERVATION
88
+ specific_reservations: []
89
+ run_workload_script: false
90
+ max_pods_per_node: 32
91
+ spot: true
92
+ guest_accelerator:
93
+ - type: nvidia-h100-mega-80gb
94
+ count: 8
95
+ gpu_driver_installation_config:
96
+ gpu_driver_version: "LATEST"
97
+ auto_upgrade: true
98
+ static_node_count: 2
99
+ outputs: [instructions]
100
+
101
+ - !DeploymentModule
102
+ id: workload_component_install
103
+ source: modules/management/kubectl-apply
104
+ use: [gke_cluster]
105
+ settings:
106
+ jobset:
107
+ install: true
108
+ version: v0.7.2
109
+ apply_manifests:
110
+ - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/storage_crd.yaml
111
+
112
+ - !DeploymentModule
113
+ id: workload_configmap
114
+ source: modules/management/kubectl-apply
115
+ use: [gke_cluster]
116
+ settings:
117
+ apply_manifests:
118
+ - source: $(ghpc_stage("xpk-gke-a3-megagpu"))/config-map.yaml.tftpl
119
+ template_vars: {
120
+ resource_config_name: "bar-resources-configmap",
121
+ num_nodes: "2",
122
+ cluster_config_name: "bar-metadata-configmap",
123
+ capacity_type: "spot",
124
+ reservation: "None",
125
+ }