xpk 0.12.0-py3-none-any.whl → 0.14.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +17 -10
- xpk/commands/cluster.py +137 -123
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +13 -27
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +22 -11
- xpk/commands/job.py +53 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +26 -2
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +58 -30
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +96 -195
- xpk/core/cluster_private.py +9 -12
- xpk/core/commands.py +21 -25
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +17 -9
- xpk/core/docker_resources.py +9 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +5 -8
- xpk/core/kjob.py +19 -29
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +11 -16
- xpk/core/network.py +18 -19
- xpk/core/nodepool.py +65 -71
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +9 -5
- xpk/core/ray.py +11 -15
- xpk/core/resources.py +15 -10
- xpk/core/scheduling.py +23 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +3 -2
- xpk/parser/cluster.py +50 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/execution_context.py +28 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/kueue.py +20 -0
- xpk/utils/network.py +4 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -545
- xpk-0.12.0.dist-info/RECORD +0 -100
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/ray.py
CHANGED
@@ -102,16 +102,16 @@ def install_ray_cluster(args, system) -> int:
     0 if successful and 1 otherwise.
   """

-  delete_ray_cluster(
+  delete_ray_cluster()

   label = 'cloud.google.com/gke-nodepool=default-pool'
   available_head_cpu, available_head_mem = generate_available_resources(
-      label,
+      label, HEAD_CPU
   )

   label = f'cloud.google.com/gke-tpu-accelerator={system.gke_accelerator}'
   available_worker_cpu, available_worker_mem = generate_available_resources(
-      label,
+      label, WORKER_CPU
   )

   yml_string = ray_cluster_crd_yaml.format(
@@ -132,11 +132,11 @@ def install_ray_cluster(args, system) -> int:
   )

   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'
   task = 'Applying RayCluster'
   retry_attempts = 1
   return_code = run_command_with_updates_retry(
-      command, task,
+      command, task, num_retry_attempts=retry_attempts
   )
   if return_code != 0:
     xpk_print(f'{task} not successful.')
@@ -144,12 +144,9 @@ def install_ray_cluster(args, system) -> int:
   return return_code


-def delete_ray_cluster(
+def delete_ray_cluster() -> None:
   """Delete all RayClusters on the cluster

-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     None
   """
@@ -158,7 +155,7 @@ def delete_ray_cluster(args) -> None:
   task = 'Deleting old RayCluster'
   retry_attempts = 1
   return_code = run_command_with_updates_retry(
-      command, task,
+      command, task, num_retry_attempts=retry_attempts
   )

   if return_code != 0:
@@ -168,12 +165,11 @@ def delete_ray_cluster(args) -> None:
   return


-def generate_available_resources(label,
+def generate_available_resources(label, percent) -> tuple:
   """Generate the available resources for the nodes that match the given label

   Args:
     label: the label used to match the appropriate nodes
-    args: user provided arguments for running the command
     percent: the percent of the available resources to use

   Returns:
@@ -184,13 +180,13 @@ def generate_available_resources(label, args, percent) -> tuple:
       f"kubectl get nodes -l {label} -o jsonpath='{{.items[0].metadata.name}}'"
   )
   task = f'Getting nodes with label {label}'
-  _, node_name = run_command_for_value(command, task
+  _, node_name = run_command_for_value(command, task)

   command = (
       f"kubectl get node {node_name} -o jsonpath='{{.status.allocatable.cpu}}'"
   )
   task = 'Fetching available CPU on node'
-  _, available_cpu = run_command_for_value(command, task
+  _, available_cpu = run_command_for_value(command, task)
   match = re.match(r'(\d+)([a-zA-Z]+)', available_cpu)
   if not match:
     xpk_print(
@@ -207,7 +203,7 @@ def generate_available_resources(label, args, percent) -> tuple:
       " jsonpath='{.status.allocatable.memory}'"
   )
   task = 'Fetching available memory on node'
-  _, available_memory = run_command_for_value(command, task
+  _, available_memory = run_command_for_value(command, task)
   match = re.match(r'(\d+)([a-zA-Z]+)', available_memory)
   if not match:
     xpk_print(
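
Note: `generate_available_resources` relies on `re.match(r'(\d+)([a-zA-Z]+)', ...)` to split kubectl's allocatable quantities (for example `7910m` millicores of CPU or `114519860Ki` of memory) into a numeric value and a unit before applying the percentage now passed in as `HEAD_CPU` or `WORKER_CPU`. A minimal standalone sketch of that parsing step (the sample values and the final multiplication are illustrative assumptions, not xpk's exact arithmetic):

    import re

    def parse_allocatable(quantity: str, percent: float) -> tuple[int, str]:
      # Split '7910m' into (7910, 'm'); a unitless value like '8' does not
      # match, which is why the real code keeps an error path for that case.
      match = re.match(r'(\d+)([a-zA-Z]+)', quantity)
      if not match:
        raise ValueError(f'unexpected allocatable quantity: {quantity}')
      value, unit = int(match.group(1)), match.group(2)
      # Keep only the requested share of the allocatable amount.
      return int(value * percent), unit

    print(parse_allocatable('7910m', 0.8))        # (6328, 'm')
    print(parse_allocatable('114519860Ki', 0.5))  # (57259930, 'Ki')
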
xpk/core/resources.py
CHANGED
@@ -50,11 +50,10 @@ class AutoprovisioningConfig:
   maximum_chips: int


-def get_cluster_configmap(
+def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
   """Run the Get GKE Cluster ConfigMap request.

   Args:
-    args: user provided arguments for running the command.
     configmap_name: name of the configmap.

   Returns:
@@ -66,7 +65,9 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
   )

   return_code, return_value = run_command_for_value(
-      command,
+      command,
+      'GKE Cluster Get ConfigMap',
+      dry_run_return_val='map[]',
   )
   if return_code != 0:
     xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
@@ -81,8 +82,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
   configs = return_value[4:-1].split(' ')

   for config in configs:
-
-
+    parts = config.strip().split(':')
+    if len(parts) != 2:
+      continue
+    config_map[parts[0]] = parts[1]
   return config_map


@@ -165,13 +168,15 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
   task_names = []
   for configmap_name, yml_string in configmap_yml.items():
     tmp = write_tmp_file(yml_string)
-    command = f'kubectl apply -f {str(tmp
+    command = f'kubectl apply -f {str(tmp)}'
     commands.append(command)
     task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
     task_names.append(task_name)

   return_code = run_commands(
-      commands,
+      commands,
+      'GKE Cluster CreateOrUpdate ConfigMap(s)',
+      task_names,
   )
   if return_code != 0:
     xpk_print(
@@ -196,7 +201,7 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
     True if device_type/gke_accelerator exists in the cluster, False otherwise.
   """
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  resources_config_map = get_cluster_configmap(
+  resources_config_map = get_cluster_configmap(resources_configmap_name)
   if resources_config_map is None:
     xpk_print(
         f'No ConfigMap exist for cluster with the name {resources_config_map}.'
@@ -219,7 +224,7 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
     returns system characteristics
   """
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(
+  cluster_config_map = get_cluster_configmap(resources_configmap_name)

   if cluster_config_map is None:
     return None
@@ -241,7 +246,7 @@ def get_cluster_capacity_type(args) -> CapacityType | None:
     returns system characteristics
   """
   metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(
+  cluster_config_map = get_cluster_configmap(metadata_configmap_name)

   if cluster_config_map is None:
     return None
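
Note: the rewritten loop in `get_cluster_configmap` parses kubectl's Go-style map rendering (`map[key1:value1 key2:value2]`); the `return_value[4:-1]` slice strips the `map[` prefix and trailing `]`, and the new `dry_run_return_val='map[]'` default therefore parses cleanly to an empty dict. A standalone sketch of the same loop (the sample keys below are made up):

    def parse_configmap_value(return_value: str) -> dict[str, str]:
      config_map: dict[str, str] = {}
      # Strip the 'map[' prefix and ']' suffix, then split entries on spaces.
      configs = return_value[4:-1].split(' ')
      for config in configs:
        parts = config.strip().split(':')
        if len(parts) != 2:
          continue  # skip malformed entries instead of raising
        config_map[parts[0]] = parts[1]
      return config_map

    assert parse_configmap_value('map[capacity_type:on_demand tpu-v5p:8]') == {
        'capacity_type': 'on_demand',
        'tpu-v5p': '8',
    }
    assert parse_configmap_value('map[]') == {}  # the dry-run default
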
xpk/core/scheduling.py
CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
 """

 from ..utils.console import xpk_print
+from ..utils.execution_context import is_dry_run
 from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
 from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
 from .system_characteristics import (
@@ -35,7 +36,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
     returns true if workload can schedule, otherwise returns false.
   """
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(
+  cluster_config_map = get_cluster_configmap(resources_configmap_name)

   # Prevents workload creation failure for existing clusters with no ConfigMap
   if cluster_config_map is None:
@@ -45,6 +46,9 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
     )
     return True

+  if is_dry_run():
+    return True
+
   # Check for gke accelerator type:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
@@ -287,3 +291,21 @@ def create_tpu_topology(
   ):
     return f'{system.topology}'
   return ''
+
+
+def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
+  """Generates subslicing annotations.
+
+  Args:
+    sub_slicing_topology: subslice topology.
+
+  Returns:
+    Annotations to be rendered in deployment yaml.
+  """
+  return [
+      (
+          'kueue.x-k8s.io/podset-required-topology:'
+          f' "google.com/gke-tpu-slice-{sub_slicing_topology}-id"'
+      ),
+      f'cloud.google.com/gke-tpu-slice-topology: {sub_slicing_topology}',
+  ]
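
Note: the new `is_dry_run()` guard makes `check_if_workload_can_schedule` optimistic under a dry run, where there is no live cluster ConfigMap to validate against. The contents of the new `xpk/utils/execution_context.py` are not shown in this diff; one plausible minimal shape for such a module is a process-wide flag set once at CLI startup (only the `is_dry_run` name is confirmed by the import above, everything else here is an assumption):

    # Hypothetical sketch of xpk/utils/execution_context.py; the real
    # module's implementation may differ.
    _dry_run: bool = False

    def set_dry_run(dry_run: bool) -> None:
      # Record once, at argument-parsing time, whether this is a dry run.
      global _dry_run
      _dry_run = dry_run

    def is_dry_run() -> bool:
      # Commands should be printed rather than executed when this is True.
      return _dry_run
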
xpk/core/scheduling_test.py
ADDED
@@ -0,0 +1,31 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from .scheduling import create_sub_slicing_annotations
+
+
+def test_create_sub_slicing_annotations_returns_valid_annotations():
+  subslicing_topology = '2x2'
+
+  result = create_sub_slicing_annotations(subslicing_topology)
+
+  assert result == [
+      (
+          'kueue.x-k8s.io/podset-required-topology:'
+          ' "google.com/gke-tpu-slice-2x2-id"'
+      ),
+      'cloud.google.com/gke-tpu-slice-topology: 2x2',
+  ]