xpk 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +270 -8
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +3 -3
- xpk/commands/info.py +12 -12
- xpk/commands/job.py +12 -10
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +1 -1
- xpk/commands/workload.py +12 -6
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +5 -3
- xpk/core/cluster.py +9 -7
- xpk/core/cluster_private.py +5 -1
- xpk/core/commands.py +3 -3
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +1 -1
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +2 -1
- xpk/core/kueue.py +12 -4
- xpk/core/nap.py +20 -6
- xpk/core/nodepool.py +52 -19
- xpk/core/nodepool_test.py +82 -0
- xpk/core/resources.py +1 -7
- xpk/core/scheduling.py +1 -1
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +267 -1081
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +9 -10
- xpk/parser/cluster.py +67 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/RECORD +49 -44
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
xpk/core/nodepool.py
CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from typing import List
 from ..utils.console import get_user_input, xpk_print
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,
@@ -32,6 +33,8 @@ from .resources import (
     create_or_update_cluster_configmap,
 )
 from .system_characteristics import AcceleratorType
+from functools import reduce
+from operator import mul
 
 CLOUD_PLATFORM_AUTH_SCOPE_URL = (
     '"https://www.googleapis.com/auth/cloud-platform"'
@@ -88,20 +91,26 @@ def run_gke_node_pool_create_command(
     xpk_print('Parsing capacity arguments failed!')
     return return_code
 
-  [14 removed lines; their content is not captured in this diff view]
+  desired_node_pool_count = (
+      1
+      if system.accelerator_type == AcceleratorType['GPU']
+      else args.num_slices
+  )
+  message = (
+      (
+          f'Creating 1 node pool with {args.num_nodes} nodes of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+      if system.accelerator_type == AcceleratorType['GPU']
+      else (
+          f'Creating {args.num_slices} node pool or pools of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+  )
+  xpk_print(message)
+  desired_node_pool_names = get_desired_node_pool_names(
+      existing_node_pool_names, args.cluster, desired_node_pool_count
+  )
 
   node_pools_to_remain = []
   delete_commands = []
@@ -275,20 +284,24 @@ def run_gke_node_pool_create_command(
         f' --host-maintenance-interval={args.host_maintenance_interval}'
         f' {capacity_args}'
         ' --enable-gvnic'
-        f' {args.custom_nodepool_arguments}'
     )
     if system.accelerator_type == AcceleratorType['TPU']:
       command += f' --node-version={gke_node_pool_version}'
+      topology_product = reduce(
+          mul, (int(x) for x in system.topology.split('x')), 1
+      )
      if capacity_type == CapacityType.FLEX_START:
         command += ' --num-nodes=0'
-      else:
+      elif topology_product > 1:
         command += f' --num-nodes={system.vms_per_slice}'
-        command += ' --placement-type=COMPACT --max-pods-per-node 15'
       command += (
          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
      )
-  [2 removed lines; their content is not captured in this diff view]
+
+      if topology_product > 1:
+        command += ' --placement-type=COMPACT --max-pods-per-node 15'
+        command += f' --tpu-topology={system.topology}'
+      command += f' {args.custom_tpu_nodepool_arguments}'
     elif system.accelerator_type == AcceleratorType['GPU']:
       subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
       if capacity_type == CapacityType.FLEX_START:
@@ -319,6 +332,8 @@ def run_gke_node_pool_create_command(
     if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
       command += ' --workload-metadata=GKE_METADATA'
 
+    command += f' {args.custom_nodepool_arguments}'
+
     task = f'NodepoolCreate-{node_pool_name}'
     create_commands.append(command)
     create_task_names.append(task)
@@ -594,3 +609,21 @@ def get_nodepool_workload_metadata_mode(
     return 1, None
 
   return 0, nodepool_WI_mode.strip()
+
+
+def get_desired_node_pool_names(
+    existing_node_pool_names: List[str],
+    cluster_name: str,
+    desired_node_pool_count: int,
+) -> List[str]:
+  cluster_node_pools = [
+      np
+      for np in existing_node_pool_names
+      if np.startswith(f'{cluster_name}-np-')
+  ]
+  result = set(cluster_node_pools[:desired_node_pool_count])
+  i = 0
+  while len(result) < desired_node_pool_count:
+    result.add(f'{cluster_name}-np-{i}')
+    i += 1
+  return list(result)
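For readers unfamiliar with `reduce(mul, ...)`: the TPU branch above multiplies the axes of the topology string to decide whether a node pool spans more than one host, and only then applies compact placement and `--tpu-topology`. A minimal standalone sketch of that computation, with made-up topology values:

```python
# Sketch of the topology-product check added above. The topology strings
# here ('2x2x4', '1x1') are illustrative examples, not xpk defaults.
from functools import reduce
from operator import mul


def topology_product(topology: str) -> int:
  # '2x2x4' -> 2 * 2 * 4 = 16; a single-host '1x1' topology yields 1.
  return reduce(mul, (int(x) for x in topology.split('x')), 1)


assert topology_product('2x2x4') == 16
assert topology_product('1x1') == 1  # no COMPACT placement, no --tpu-topology
```

Seeding `reduce` with `1` keeps the product well defined even for a single-axis topology.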
xpk/core/nodepool_test.py
ADDED
@@ -0,0 +1,82 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from xpk.core.nodepool import get_desired_node_pool_names
+
+CLUSTER_NAME = "running-cucumber"
+
+
+def node_pool_name(number: int) -> str:
+  return f"{CLUSTER_NAME}-np-{number}"
+
+
+def test_compute_desired_node_pool_names_with_desired_larger_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_desired_smaller_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(1)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=1,
+  )
+
+  expected_result = [node_pool_name(0)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=3,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_unknown_node_pools():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[
+          "unknown-node-pool",
+          node_pool_name(0),
+          node_pool_name(3),
+      ],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
xpk/core/resources.py
CHANGED
@@ -108,13 +108,7 @@ def create_cluster_configmaps(
   device_type = system.device_type
   if system.accelerator_type == AcceleratorType['GPU']:
     resources_data = f'{device_type}: "{int(args.num_nodes)}"'
-  elif (
-      not args.enable_pathways
-      and args.enable_autoprovisioning
-      and autoprovisioning_config
-  ):
-    # Currently autoprovisioning is not supported with Pathways.
-    # Auto provisioning will have variable topologies for a gke accelerator type.
+  elif args.enable_autoprovisioning and autoprovisioning_config:
     resources_data = (
         f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
     )
xpk/core/scheduling.py
CHANGED
@@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
     xpk_print(
-        f'
+        f'GKE Accelerator Type Check: {args.workload} is requesting'
         f' {system.gke_accelerator} but cluster only contains'
         f' {cluster_config_map.keys()}. '
     )
xpk/core/storage.py
CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
 import os
 from argparse import Namespace
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, cast
 
 import ruamel.yaml
 from google.cloud import storage as gcp_storage
@@ -95,17 +95,17 @@ class Storage:
     Args:
       data: A dictionary containing the Storage resource definition.
     """
-    metadata
+    metadata = data.get("metadata", {})
     self.name = metadata.get("name")
     spec = data.get("spec", {})
-    self.type
-    self.auto_mount
-    self.mount_point
-    self.readonly
-    self.manifest
-    self.pvc
-    self.pv
-    self.bucket
+    self.type = spec.get("type")
+    self.auto_mount = spec.get("auto_mount")
+    self.mount_point = spec.get("mount_point")
+    self.readonly = spec.get("readonly")
+    self.manifest = spec.get("manifest")
+    self.pvc = spec.get("pvc")
+    self.pv = spec.get("pv")
+    self.bucket = self._get_bucket()
@@ -117,9 +117,9 @@ class Storage:
     return [
         self.name,
         self.type,
-        self.auto_mount,
+        str(self.auto_mount),
         self.mount_point,
-        self.readonly,
+        str(self.readonly),
         self.manifest,
     ]
 
@@ -133,7 +133,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.csi.volume_handle
+      return cast(str, pv.spec.csi.volume_handle)
     except ApiException as e:
       xpk_print(
           f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
@@ -150,7 +150,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.mount_options
+      return cast(list[str], pv.spec.mount_options)
     except ApiException as e:
       xpk_print(
          f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
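The new `cast()` calls change nothing at runtime: `typing.cast` simply returns its second argument while telling a static type checker to treat the value as the stated type, which is useful here because the Kubernetes client models fields like `volume_handle` and `mount_options` as optional. A minimal sketch of the idiom (the handle value below is invented):

```python
# Why cast() is used above: it narrows the declared type for static
# checkers without performing any runtime check or conversion.
from typing import Optional, cast


def volume_handle_from(handle: Optional[str]) -> str:
  # The client types such fields as Optional; cast() asserts to the
  # checker that a concrete str is expected here. No validation occurs.
  return cast(str, handle)


print(volume_handle_from('projects/_/buckets/demo-bucket'))  # made-up handle
```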