xpk 0.14.4__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +89 -32
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +16 -3
- xpk/commands/cluster_test.py +353 -7
- xpk/commands/config.py +3 -5
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +3 -1
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +143 -142
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +73 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +48 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +55 -104
- xpk/core/cluster_test.py +170 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +88 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +17 -19
- xpk/core/kueue_manager.py +205 -51
- xpk/core/kueue_manager_test.py +158 -4
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +37 -43
- xpk/core/nodepool_test.py +42 -19
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +144 -133
- xpk/core/scheduling_test.py +298 -6
- xpk/core/system_characteristics.py +256 -19
- xpk/core/system_characteristics_test.py +128 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +33 -13
- xpk/parser/cluster.py +40 -67
- xpk/parser/cluster_test.py +83 -3
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -29
- xpk/parser/workload_test.py +3 -49
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +10 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +2 -13
- xpk/utils/versions.py +31 -0
- xpk-0.16.0.dist-info/METADATA +127 -0
- xpk-0.16.0.dist-info/RECORD +168 -0
- xpk-0.14.4.dist-info/METADATA +0 -1645
- xpk-0.14.4.dist-info/RECORD +0 -139
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/core/nodepool.py
CHANGED

```diff
@@ -15,8 +15,8 @@ limitations under the License.
 """
 
 from typing import List
-from ..utils.console import …
-from …
+from ..utils.console import ask_for_user_consent, xpk_print
+from .scheduling import get_placement_policy_name, is_placement_policy_supported
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,
     H100_MEGA_DEVICE_TYPE,
@@ -28,10 +28,9 @@ from .capacity import (
 from .commands import run_command_for_value, run_commands
 from .gcloud_context import GkeServerConfig, get_cluster_location, zone_to_region
 from .resources import (
-    …
-    CLUSTER_RESOURCES_CONFIGMAP,
+    ConfigMapType,
     check_cluster_resources,
-    …
+    update_cluster_configmap,
 )
 from .system_characteristics import AcceleratorType
 
@@ -110,6 +109,7 @@ def run_gke_node_pool_create_command(
       existing_node_pool_names, args.cluster, desired_node_pool_count
   )
 
+  node_pools_to_delete = []
   node_pools_to_remain = []
   delete_commands = []
   delete_task_names = []
@@ -186,14 +186,10 @@ def run_gke_node_pool_create_command(
   # when cluster is getting updated from 'x' device_type/gke_accelerator to 'y' device_type/gke_accelerator.
   # In that case, '{args.cluster}-np-i' nodepool will be re-created for 'y' device_type/gke_accelerator.
   if delete_commands:
-    …
-    …
-    …
-    …
-        f' {node_pools_to_delete}. \nDo you wish to delete: y (yes) / n'
-        ' (no):\n'
-    )
-    if not will_delete:
+    if node_pools_to_delete and not ask_for_user_consent(
+        f'Planning to delete {len(node_pools_to_delete)} node pools including'
+        f' {node_pools_to_delete}. \nDo you wish to delete?'
+    ):
       xpk_print(
           'You have requested to not delete the existing nodepools in the'
           ' cluster. There will be no change to the cluster.'
@@ -215,18 +211,15 @@ def run_gke_node_pool_create_command(
 
   # Enable Workload Identity on existing Nodepools
   if update_WI_commands:
-    will_update_WI = …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-        ' (yes) / n (no):\n'
-    )
-    if not will_update_WI:
+    will_update_WI = not node_pools_to_update_WI or ask_for_user_consent(
+        'Planning to enable Workload Identity Federation on'
+        f' {len(node_pools_to_update_WI)} existing node pools including'
+        f' {node_pools_to_update_WI}. This immediately enables Workload'
+        ' Identity Federation for GKE for any workloads running in the node'
+        ' pool. Also, xpk does not support disabling Workload Identity on'
+        ' clusters that have it enabled already \nDo you wish to update?'
+    )
+    if will_update_WI:
       for i, command in enumerate(update_WI_commands):
         xpk_print(
             f'To complete {update_WI_task_names[i]} we are executing {command}'
@@ -253,22 +246,23 @@ def run_gke_node_pool_create_command(
     )
   else:
     resources_data = f'{device_type}: "0"'
-  …
-  …
-  …
+  return_code = update_cluster_configmap(
+      cluster_name=args.cluster,
+      config_map_type=ConfigMapType.RESOURCES,
+      data=resources_data,
   )
-  configmap_yml = {}
-  configmap_yml[resources_configmap_name] = resources_yml
-  return_code = create_or_update_cluster_configmap(configmap_yml)
   if return_code != 0:
     return 1
 
   placement_args = ''
-  if …
-    placement_policy = (
-        …
+  if is_placement_policy_supported(system):
+    placement_policy = get_placement_policy_name(system)
+    ensure_resource_policy_exists(
+        resource_policy_name=placement_policy,
+        project=args.project,
+        zone=args.zone,
+        topology=system.topology,
     )
-    ensure_resource_policy_exists(placement_policy, args, system.topology)
     placement_args = f' --placement-policy={placement_policy}'
 
   create_commands = []
@@ -290,16 +284,16 @@ def run_gke_node_pool_create_command(
     )
     if system.accelerator_type == AcceleratorType.TPU:
       command += f' --node-version={gke_node_pool_version}'
-      topology_product = get_topology_product(system.topology)
       if capacity_type == CapacityType.FLEX_START:
         command += ' --num-nodes=0'
-      …
+      else:
         command += f' --num-nodes={system.vms_per_slice}'
       command += (
          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
       )
 
-      …
+      # --tpu-topology should not be set for single-host node pools
+      if system.vms_per_slice > 1:
         # --placement-type=COMPACT enables group placement policy which
         # is mutually exclusive with workload policy, --tpu-topology should
         # also not be passed when workload policy is used
@@ -319,7 +313,7 @@ def run_gke_node_pool_create_command(
       command += (
           ' --accelerator'
          f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
-          f' --…
+          f' --scopes={CLOUD_PLATFORM_AUTH_SCOPE_URL}'
       )
       if device_type == H100_MEGA_DEVICE_TYPE:
         for i in range(1, 9):
@@ -595,14 +589,14 @@ def get_desired_node_pool_names(
 
 
 def ensure_resource_policy_exists(
-    resource_policy_name: str, …
+    resource_policy_name: str, project: str, zone: str, topology: str
 ) -> None:
   return_code, _ = run_command_for_value(
       (
           'gcloud compute resource-policies describe'
           f' {resource_policy_name} '
-          f'--project={…
-          f'--region={zone_to_region(…
+          f'--project={project} '
+          f'--region={zone_to_region(zone)}'
       ),
       'Retrieve resource policy',
   )
@@ -613,7 +607,7 @@ def ensure_resource_policy_exists(
   return_code, _ = run_command_for_value(
       (
           'gcloud compute resource-policies create workload-policy'
-          f' {resource_policy_name} --project={…
+          f' {resource_policy_name} --project={project} --region={zone_to_region(zone)} --type=HIGH_THROUGHPUT'
          f' --accelerator-topology={topology}'
       ),
       'Create resource policy',
```
xpk/core/nodepool_test.py
CHANGED

```diff
@@ -20,7 +20,7 @@ from xpk.core.nodepool import (
     get_desired_node_pool_names,
     run_gke_node_pool_create_command,
 )
-from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
+from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
 
 CLUSTER_NAME = "running-cucumber"
 
@@ -96,7 +96,12 @@ def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_p…
   mock = mocker.patch(
       "xpk.core.nodepool.run_command_for_value", return_value=(0, "")
   )
-  ensure_resource_policy_exists(…
+  ensure_resource_policy_exists(
+      resource_policy_name="resource-policy",
+      project="test-project",
+      zone="us-central1-a",
+      topology="2x2x1",
+  )
   mock.assert_called_once()
 
 
@@ -108,7 +113,12 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
   mock = mocker.patch(
       "xpk.core.nodepool.run_command_for_value", side_effect=[(1, ""), (0, "")]
   )
-  ensure_resource_policy_exists(…
+  ensure_resource_policy_exists(
+      resource_policy_name="resource-policy",
+      project="test-project",
+      zone="us-central1-a",
+      topology="2x2x1",
+  )
   assert mock.call_count == 2
   assert mock.call_args_list[0].args[1] == "Retrieve resource policy"
 
@@ -125,7 +135,12 @@ def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creati…
       "xpk.core.nodepool.run_command_for_value",
       side_effect=[(1, ""), (1, "")],
   )
-  ensure_resource_policy_exists(…
+  ensure_resource_policy_exists(
+      resource_policy_name="resource-policy",
+      project="test-project",
+      zone="us-central1-a",
+      topology="2x2x1",
+  )
 
 
 @pytest.fixture
@@ -145,22 +160,24 @@ def mock_nodepool_dependencies(mocker):
       "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
   )
   mocker.patch("xpk.core.nodepool.run_commands", return_value=0)
-  mocker.patch("xpk.core.nodepool.…
-  …
+  mocker.patch("xpk.core.nodepool.ask_for_user_consent", return_value=True)
+  mock_is_placement_policy_supported = mocker.patch(
+      "xpk.core.nodepool.is_placement_policy_supported"
+  )
   mock_ensure_resource_policy = mocker.patch(
       "xpk.core.nodepool.ensure_resource_policy_exists"
   )
-  return …
+  return mock_is_placement_policy_supported, mock_ensure_resource_policy
 
 
 def test_placement_policy_created_for_gpu_with_valid_topology(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is created for GPUs with a valid topology."""
-  …
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  …
+  mock_is_placement_policy_supported.return_value = True
   args = mocker.Mock(
       tpu_type=None,
       device_type="h100-80gb-8",
@@ -170,13 +187,15 @@ def test_placement_policy_created_for_gpu_with_valid_topology(
   )
   system = SystemCharacteristics(
       topology="N/A",
-      vms_per_slice=…
+      vms_per_slice=2,
       gke_accelerator="nvidia-h100-80gb",
       gce_machine_type="a3-highgpu-8g",
       chips_per_vm=8,
       accelerator_type=AcceleratorType.GPU,
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+      gpu_config=GpuConfig(requires_topology=True),
   )
 
   run_gke_node_pool_create_command(args, system, "1.2.3")
@@ -188,10 +207,10 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is not created for GPUs with an invalid topology."""
-  …
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  …
+  mock_is_placement_policy_supported.return_value = False
   args = mocker.Mock(
       tpu_type=None,
       device_type="h100-80gb-8",
@@ -200,13 +219,15 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
   )
   system = SystemCharacteristics(
       topology="N/A",
-      vms_per_slice=…
+      vms_per_slice=2,
       gke_accelerator="nvidia-h100-80gb",
       gce_machine_type="a3-highgpu-8g",
       chips_per_vm=8,
       accelerator_type=AcceleratorType.GPU,
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+      gpu_config=GpuConfig(requires_topology=True),
   )
 
   run_gke_node_pool_create_command(args, system, "1.2.3")
@@ -218,10 +239,10 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is created for tpu7x with a valid topology."""
-  …
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  …
+  mock_is_placement_policy_supported.return_value = True
   args = mocker.Mock(
       tpu_type="tpu7x-8",
       device_type=None,
@@ -232,7 +253,7 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
   )
   system = SystemCharacteristics(
       topology="2x2x1",
-      vms_per_slice=…
+      vms_per_slice=2,
       gke_accelerator="tpu7x",
       gce_machine_type="tpu7x-standard-4t",
       chips_per_vm=4,
@@ -240,6 +261,7 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
       device_type="tpu7x-8",
       requires_workload_policy=True,
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
   )
 
   run_gke_node_pool_create_command(args, system, "1.2.3")
@@ -251,14 +273,14 @@ def test_placement_policy_not_created_for_non7x_tpu(
     mocker, mock_nodepool_dependencies
 ):
   """Tests that placement policy is not created for non-tpu7x TPUs."""
-  …
+  mock_is_placement_policy_supported, mock_ensure_resource_policy = (
       mock_nodepool_dependencies
   )
-  …
+  mock_is_placement_policy_supported.return_value = False
   args = mocker.Mock(
       tpu_type="v6e",
       device_type=None,
-      num_slices=…
+      num_slices=2,
       cluster="test-cluster",
       project="test-project",
       zone="us-central1-a",
@@ -272,6 +294,7 @@ def test_placement_policy_not_created_for_non7x_tpu(
       accelerator_type=AcceleratorType.TPU,
       device_type="v6e-4",
       supports_sub_slicing=True,
+      docker_platform=DockerPlatform.ARM,
   )
 
   run_gke_node_pool_create_command(args, system, "1.2.3")
```
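
The updated fixtures also surface the `SystemCharacteristics` additions in 0.16.0: every instance now carries a `docker_platform`, and GPU systems additionally carry a `gpu_config`. A minimal construction mirroring the tests above; whether these fields have defaults, and the full meaning of `GpuConfig.requires_topology`, are not visible in this diff:

```python
from xpk.core.system_characteristics import (
    AcceleratorType,
    DockerPlatform,
    GpuConfig,
    SystemCharacteristics,
)

# Mirrors the GPU fixture above; values are illustrative only.
system = SystemCharacteristics(
    topology='N/A',
    vms_per_slice=2,
    gke_accelerator='nvidia-h100-80gb',
    gce_machine_type='a3-highgpu-8g',
    chips_per_vm=8,
    accelerator_type=AcceleratorType.GPU,
    device_type='h100-80gb-8',
    supports_sub_slicing=False,
    docker_platform=DockerPlatform.ARM,   # new in 0.16.0
    gpu_config=GpuConfig(requires_topology=True),  # new in 0.16.0, GPUs only
)
```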
xpk/core/pathways.py
CHANGED

```diff
@@ -333,3 +333,26 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
     xpk_print(f'Delete Workload request returned ERROR {return_code}')
     return False
   return True
+
+
+def get_pathways_machine_types(
+    project: str, zone: str
+) -> tuple[int, list[str]]:
+  # Identify machine types with sufficient allocatable capacity to
+  # schedule the Pathways pod. This filter ensures the selected node
+  # is large enough to handle the control plane workload plus GKE
+  # system overhead.
+  min_memory_mb = 233 * 1024
+  command = (
+      'gcloud compute machine-types list --filter "guestCpus >= 49 AND memoryMb'
+      f' >= {min_memory_mb} AND zone = \'{zone}\'" --format="value(name)"'
+      f' --project={project}'
+  )
+  return_code, result = run_command_for_value(
+      command=command,
+      task='Retrieve available pathways machine types',
+      dry_run_return_val='n2-standard-64',
+  )
+  if return_code != 0:
+    return return_code, []
+  return 0, result.strip().splitlines()
```
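
`get_pathways_machine_types` follows the usual xpk `(return_code, value)` convention. A hedged usage sketch; the project and zone are placeholders, and how callers choose among the returned names is outside this diff:

```python
from xpk.core.pathways import get_pathways_machine_types

return_code, machine_types = get_pathways_machine_types(
    project='demo-project', zone='us-central1-a'
)
if return_code != 0:
  raise SystemExit(return_code)  # surface the gcloud failure

# Per the filter above: machine types in the zone with at least 49 vCPUs
# and roughly 233 GiB of memory, e.g. ['n2-standard-64', ...].
print(machine_types)
```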
xpk/core/pathways_test.py
ADDED

```diff
@@ -0,0 +1,57 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import pytest
+from unittest.mock import MagicMock
+from xpk.core.testing.commands_tester import CommandsTester
+from .pathways import get_pathways_machine_types
+
+
+@pytest.fixture(autouse=True)
+def commands_tester(mocker: MagicMock):
+  return CommandsTester(
+      mocker,
+      run_command_with_updates_path=(
+          "xpk.core.pathways.run_command_with_updates"
+      ),
+      run_command_for_value_path="xpk.core.pathways.run_command_for_value",
+  )
+
+
+def test_get_pathways_machine_types_when_command_fails_returns_failed_exit_code(
+    commands_tester: CommandsTester,
+):
+  commands_tester.set_result_for_command(
+      (1, ""), "gcloud compute machine-types list"
+  )
+  return_code, machine_types = get_pathways_machine_types(
+      project="gke-project", zone="us-central1-a"
+  )
+  assert return_code == 1
+  assert machine_types == []
+
+
+def test_get_pathways_machine_types_when_command_suceeds_returns_machine_types(
+    commands_tester: CommandsTester,
+):
+  commands_tester.set_result_for_command(
+      (0, "abc\ncba"), "gcloud compute machine-types list"
+  )
+  return_code, machine_types = get_pathways_machine_types(
+      project="gke-project", zone="us-central1-a"
+  )
+  assert return_code == 0
+  assert machine_types == ["abc", "cba"]
```
xpk/core/resources.py
CHANGED

```diff
@@ -15,6 +15,7 @@ limitations under the License.
 """
 
 from dataclasses import dataclass
+import os
 
 from ..utils.console import xpk_print
 from ..utils.file import write_tmp_file
@@ -30,9 +31,13 @@ from .capacity import (
 from .commands import run_command_for_value, run_commands
 from .config import XPK_CURRENT_VERSION
 from .system_characteristics import AcceleratorType, get_system_characteristics_by_device_type, SystemCharacteristics
+from enum import Enum
+
+
+class ConfigMapType(Enum):
+  RESOURCES = 'resources-configmap'
+  METADATA = 'metadata-configmap'
 
-CLUSTER_RESOURCES_CONFIGMAP = 'resources-configmap'
-CLUSTER_METADATA_CONFIGMAP = 'metadata-configmap'
 
 CLUSTER_CONFIGMAP_YAML = """kind: ConfigMap
 apiVersion: v1
@@ -50,7 +55,15 @@ class AutoprovisioningConfig:
   maximum_chips: int
 
 
-def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
+def get_config_map_name(
+    cluster_name: str, config_map_type: ConfigMapType
+) -> str:
+  return f'{cluster_name}-{config_map_type.value}'
+
+
+def get_cluster_configmap(
+    cluster_name: str, config_map_type: ConfigMapType
+) -> dict[str, str] | None:
   """Run the Get GKE Cluster ConfigMap request.
 
   Args:
@@ -59,15 +72,17 @@ def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
   Returns:
     key:value pairs stored in cluster ConfigMap.
   """
+  config_map_name = get_config_map_name(cluster_name, config_map_type)
   command = (
       'kubectl get configmap'
-      f' {…
+      f' {config_map_name} -o=custom-columns="ConfigData:data"'
+      ' --no-headers=true'
   )
 
   return_code, return_value = run_command_for_value(
       command,
       'GKE Cluster Get ConfigMap',
-      dry_run_return_val=…
+      dry_run_return_val=_get_dry_run_config_map_value(config_map_type),
   )
   if return_code != 0:
     xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
@@ -89,9 +104,18 @@ def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
   return config_map
 
 
+def _get_dry_run_config_map_value(config_map_type: ConfigMapType) -> str:
+  default_value = 'map[]'
+
+  if config_map_type == ConfigMapType.RESOURCES:
+    return os.getenv('DRY_RUN_RESOURCES_CONFIG_MAP', default_value)
+
+  return default_value
+
+
 def create_cluster_configmaps(
     args,
-    system,
+    system: SystemCharacteristics,
     tensorboard_config: dict,
     autoprovisioning_config: AutoprovisioningConfig | None,
 ) -> int:
@@ -127,9 +151,11 @@ def create_cluster_configmaps(
     resources_data = (
         f'{device_type}: "{int(args.num_slices) * system.vms_per_slice}"'
     )
-    resources_configmap_name = …
+    resources_configmap_name = get_config_map_name(
+        args.cluster, ConfigMapType.RESOURCES
+    )
     resources_yml = CLUSTER_CONFIGMAP_YAML.format(
-        …
+        name=resources_configmap_name, data=resources_data
     )
     configmap_yml[resources_configmap_name] = resources_yml
 
@@ -148,15 +174,17 @@ def create_cluster_configmaps(
   # Reservation ID if applicable.
   if capacity_type == CapacityType.RESERVATION:
     metadata += f'\n {RESERVATION_CONFIG_KEY}: {args.reservation}'
-  metadata_configmap_name = …
+  metadata_configmap_name = get_config_map_name(
+      args.cluster, ConfigMapType.METADATA
+  )
   metadata_yml = CLUSTER_CONFIGMAP_YAML.format(
-      …
+      name=metadata_configmap_name, data=metadata
   )
   configmap_yml[metadata_configmap_name] = metadata_yml
-  return …
+  return _create_or_update_cluster_configmap(configmap_yml)
 
 
-def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
+def _create_or_update_cluster_configmap(configmap_yml: dict[str, str]) -> int:
   """
   Args:
     configmap_yml: dict containing ConfigMap name and yml string.
@@ -187,7 +215,18 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
   return 0
 
 
-def check_cluster_resources(args, system) -> tuple[bool, bool]:
+def update_cluster_configmap(
+    cluster_name: str, config_map_type: ConfigMapType, data: str
+) -> int:
+  config_map_name = get_config_map_name(cluster_name, config_map_type)
+  yaml = CLUSTER_CONFIGMAP_YAML.format(name=config_map_name, data=data)
+  config_map_dict = {config_map_name: yaml}
+  return _create_or_update_cluster_configmap(config_map_dict)
+
+
+def check_cluster_resources(
+    args, system: SystemCharacteristics
+) -> tuple[bool, bool]:
   """Check if cluster has resources of a specified device_type/gke_accelerator.
   This check will be skipped if <args.cluster>-<_CLUSTER_RESOURCES_CONFIGMAP> ConfigMap doesn't exist for the cluster.
 
@@ -200,8 +239,9 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
     True if resources in the cluster should be checked, False otherwise.
     True if device_type/gke_accelerator exists in the cluster, False otherwise.
   """
-  …
-  …
+  resources_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
   if resources_config_map is None:
     xpk_print(
         f'No ConfigMap exist for cluster with the name {resources_config_map}.'
@@ -216,20 +256,35 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
 
 
 def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
-  """Get …
+  """Get SystemCharcteristics based on the cluster resources configMap.
+
   Args:
     args: user provided arguments for running the command.
 
   Returns:
-    returns system characteristics
+    returns system characteristics, or None if not found.
+  """
+  resources_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
+  return get_cluster_system_characteristics_from_config_map(
+      resources_config_map
+  )
+
+
+def get_cluster_system_characteristics_from_config_map(
+    resources_config_map: dict[str, str] | None,
+) -> SystemCharacteristics | None:
+  """Get SystemCharcteristics based on the cluster resources configMap.
+
+  Returns:
+    returns system characteristics, or None if not found.
   """
-  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(resources_configmap_name)
 
-  if …
+  if resources_config_map is None:
    return None
 
-  for key in …
+  for key in resources_config_map:
    system, result_code = get_system_characteristics_by_device_type(key)
    if result_code == 0:
      return system
@@ -238,20 +293,22 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
 
 
 def get_cluster_capacity_type(args) -> CapacityType | None:
-  """Get …
+  """Get CapacityType based on the cluster metadata configMap.
+
   Args:
     args: user provided arguments for running the command.
 
   Returns:
-    returns …
+    returns CapacityType, or None if not found.
   """
-  metadata_configmap_name = …
-  …
+  metadata_configmap_name = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
 
-  if …
+  if metadata_configmap_name is None:
    return None
 
-  capacityValue = …
+  capacityValue = metadata_configmap_name.get('capacity_type')
   if capacityValue is not None:
    return CapacityType[capacityValue.upper()]
 
```
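
Taken together, the resources refactor replaces the `CLUSTER_RESOURCES_CONFIGMAP`/`CLUSTER_METADATA_CONFIGMAP` string constants with a `ConfigMapType` enum, derives ConfigMap names via `get_config_map_name`, and funnels writes through the new `update_cluster_configmap`. A short sketch of the new surface, with a hypothetical cluster name:

```python
from xpk.core.resources import (
    ConfigMapType,
    get_cluster_configmap,
    get_config_map_name,
    update_cluster_configmap,
)

# Name derivation: '<cluster>-<type.value>'.
assert get_config_map_name('demo', ConfigMapType.RESOURCES) == (
    'demo-resources-configmap'
)

# Read: key/value data from the ConfigMap, or None if the kubectl call fails.
resources = get_cluster_configmap('demo', ConfigMapType.RESOURCES)

# Write: renders CLUSTER_CONFIGMAP_YAML and applies it; returns 0 on success.
return_code = update_cluster_configmap(
    cluster_name='demo',
    config_map_type=ConfigMapType.RESOURCES,
    data='h100-80gb-8: "0"',
)
```

In dry-run mode, the resources ConfigMap read can be seeded through the `DRY_RUN_RESOURCES_CONFIG_MAP` environment variable, per `_get_dry_run_config_map_value` in the diff above.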