xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/__init__.py +15 -0
- integration/docker_manager_test.py +102 -0
- integration/gcluster_a3mega_test.py +204 -0
- integration/gcluster_a3ultra_test.py +176 -0
- integration/gcluster_a4_test.py +176 -0
- integration/gcluster_test.py +107 -0
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +143 -117
- xpk/commands/cluster_gcluster.py +81 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/cluster_test.py +92 -0
- xpk/commands/common.py +14 -26
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +39 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +84 -29
- xpk/commands/workload_test.py +81 -0
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/blueprint/testing/__init__.py +15 -0
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +423 -0
- xpk/core/kueue_manager_test.py +574 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +350 -232
- xpk/core/system_characteristics_test.py +73 -0
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/templates/cluster_preheat.yaml.j2 +31 -0
- xpk/templates/filestore-pv.yaml +17 -0
- xpk/templates/filestore-pvc.yaml +11 -0
- xpk/templates/filestore-sc.yaml +10 -0
- xpk/templates/fuse-pv.yaml +17 -0
- xpk/templates/fuse-pvc.yaml +13 -0
- xpk/templates/kueue_config.yaml.j2 +95 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
- xpk/templates/mtc-cpc.yaml +15 -0
- xpk/templates/volume_bundle.yaml +7 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +15 -0
- xpk/utils/topology.py +46 -0
- xpk/utils/topology_test.py +63 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
- xpk-0.14.1.dist-info/RECORD +133 -0
- xpk-0.14.1.dist-info/top_level.txt +2 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- xpk-0.13.0.dist-info/top_level.txt +0 -1
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
xpk/core/ray.py
CHANGED
|
@@ -102,16 +102,16 @@ def install_ray_cluster(args, system) -> int:
|
|
|
102
102
|
0 if successful and 1 otherwise.
|
|
103
103
|
"""
|
|
104
104
|
|
|
105
|
-
delete_ray_cluster(
|
|
105
|
+
delete_ray_cluster()
|
|
106
106
|
|
|
107
107
|
label = 'cloud.google.com/gke-nodepool=default-pool'
|
|
108
108
|
available_head_cpu, available_head_mem = generate_available_resources(
|
|
109
|
-
label,
|
|
109
|
+
label, HEAD_CPU
|
|
110
110
|
)
|
|
111
111
|
|
|
112
112
|
label = f'cloud.google.com/gke-tpu-accelerator={system.gke_accelerator}'
|
|
113
113
|
available_worker_cpu, available_worker_mem = generate_available_resources(
|
|
114
|
-
label,
|
|
114
|
+
label, WORKER_CPU
|
|
115
115
|
)
|
|
116
116
|
|
|
117
117
|
yml_string = ray_cluster_crd_yaml.format(
|
|
@@ -136,7 +136,7 @@ def install_ray_cluster(args, system) -> int:
|
|
|
136
136
|
task = 'Applying RayCluster'
|
|
137
137
|
retry_attempts = 1
|
|
138
138
|
return_code = run_command_with_updates_retry(
|
|
139
|
-
command, task,
|
|
139
|
+
command, task, num_retry_attempts=retry_attempts
|
|
140
140
|
)
|
|
141
141
|
if return_code != 0:
|
|
142
142
|
xpk_print(f'{task} not successful.')
|
|
@@ -144,12 +144,9 @@ def install_ray_cluster(args, system) -> int:
|
|
|
144
144
|
return return_code
|
|
145
145
|
|
|
146
146
|
|
|
147
|
-
def delete_ray_cluster(
|
|
147
|
+
def delete_ray_cluster() -> None:
|
|
148
148
|
"""Delete all RayClusters on the cluster
|
|
149
149
|
|
|
150
|
-
Args:
|
|
151
|
-
args: user provided arguments for running the command.
|
|
152
|
-
|
|
153
150
|
Returns:
|
|
154
151
|
None
|
|
155
152
|
"""
|
|
@@ -158,7 +155,7 @@ def delete_ray_cluster(args) -> None:
|
|
|
158
155
|
task = 'Deleting old RayCluster'
|
|
159
156
|
retry_attempts = 1
|
|
160
157
|
return_code = run_command_with_updates_retry(
|
|
161
|
-
command, task,
|
|
158
|
+
command, task, num_retry_attempts=retry_attempts
|
|
162
159
|
)
|
|
163
160
|
|
|
164
161
|
if return_code != 0:
|
|
@@ -168,12 +165,11 @@ def delete_ray_cluster(args) -> None:
|
|
|
168
165
|
return
|
|
169
166
|
|
|
170
167
|
|
|
171
|
-
def generate_available_resources(label,
|
|
168
|
+
def generate_available_resources(label, percent) -> tuple:
|
|
172
169
|
"""Generate the available resources for the nodes that match the given label
|
|
173
170
|
|
|
174
171
|
Args:
|
|
175
172
|
label: the label used to match the appropriate nodes
|
|
176
|
-
args: user provided arguments for running the command
|
|
177
173
|
percent: the percent of the available resources to use
|
|
178
174
|
|
|
179
175
|
Returns:
|
|
@@ -184,13 +180,13 @@ def generate_available_resources(label, args, percent) -> tuple:
|
|
|
184
180
|
f"kubectl get nodes -l {label} -o jsonpath='{{.items[0].metadata.name}}'"
|
|
185
181
|
)
|
|
186
182
|
task = f'Getting nodes with label {label}'
|
|
187
|
-
_, node_name = run_command_for_value(command, task
|
|
183
|
+
_, node_name = run_command_for_value(command, task)
|
|
188
184
|
|
|
189
185
|
command = (
|
|
190
186
|
f"kubectl get node {node_name} -o jsonpath='{{.status.allocatable.cpu}}'"
|
|
191
187
|
)
|
|
192
188
|
task = 'Fetching available CPU on node'
|
|
193
|
-
_, available_cpu = run_command_for_value(command, task
|
|
189
|
+
_, available_cpu = run_command_for_value(command, task)
|
|
194
190
|
match = re.match(r'(\d+)([a-zA-Z]+)', available_cpu)
|
|
195
191
|
if not match:
|
|
196
192
|
xpk_print(
|
|
@@ -207,7 +203,7 @@ def generate_available_resources(label, args, percent) -> tuple:
|
|
|
207
203
|
" jsonpath='{.status.allocatable.memory}'"
|
|
208
204
|
)
|
|
209
205
|
task = 'Fetching available memory on node'
|
|
210
|
-
_, available_memory = run_command_for_value(command, task
|
|
206
|
+
_, available_memory = run_command_for_value(command, task)
|
|
211
207
|
match = re.match(r'(\d+)([a-zA-Z]+)', available_memory)
|
|
212
208
|
if not match:
|
|
213
209
|
xpk_print(
|
xpk/core/resources.py
CHANGED
|
@@ -50,11 +50,10 @@ class AutoprovisioningConfig:
|
|
|
50
50
|
maximum_chips: int
|
|
51
51
|
|
|
52
52
|
|
|
53
|
-
def get_cluster_configmap(
|
|
53
|
+
def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
|
|
54
54
|
"""Run the Get GKE Cluster ConfigMap request.
|
|
55
55
|
|
|
56
56
|
Args:
|
|
57
|
-
args: user provided arguments for running the command.
|
|
58
57
|
configmap_name: name of the configmap.
|
|
59
58
|
|
|
60
59
|
Returns:
|
|
@@ -68,7 +67,6 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
|
|
|
68
67
|
return_code, return_value = run_command_for_value(
|
|
69
68
|
command,
|
|
70
69
|
'GKE Cluster Get ConfigMap',
|
|
71
|
-
args,
|
|
72
70
|
dry_run_return_val='map[]',
|
|
73
71
|
)
|
|
74
72
|
if return_code != 0:
|
|
@@ -155,12 +153,10 @@ def create_cluster_configmaps(
|
|
|
155
153
|
args=args, name=metadata_configmap_name, data=metadata
|
|
156
154
|
)
|
|
157
155
|
configmap_yml[metadata_configmap_name] = metadata_yml
|
|
158
|
-
return create_or_update_cluster_configmap(configmap_yml
|
|
156
|
+
return create_or_update_cluster_configmap(configmap_yml)
|
|
159
157
|
|
|
160
158
|
|
|
161
|
-
def create_or_update_cluster_configmap(
|
|
162
|
-
configmap_yml: dict, dry_run: bool
|
|
163
|
-
) -> int:
|
|
159
|
+
def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
|
|
164
160
|
"""
|
|
165
161
|
Args:
|
|
166
162
|
configmap_yml: dict containing ConfigMap name and yml string.
|
|
@@ -181,7 +177,6 @@ def create_or_update_cluster_configmap(
|
|
|
181
177
|
commands,
|
|
182
178
|
'GKE Cluster CreateOrUpdate ConfigMap(s)',
|
|
183
179
|
task_names,
|
|
184
|
-
dry_run=dry_run,
|
|
185
180
|
)
|
|
186
181
|
if return_code != 0:
|
|
187
182
|
xpk_print(
|
|
@@ -206,7 +201,7 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
|
|
|
206
201
|
True if device_type/gke_accelerator exists in the cluster, False otherwise.
|
|
207
202
|
"""
|
|
208
203
|
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
|
|
209
|
-
resources_config_map = get_cluster_configmap(
|
|
204
|
+
resources_config_map = get_cluster_configmap(resources_configmap_name)
|
|
210
205
|
if resources_config_map is None:
|
|
211
206
|
xpk_print(
|
|
212
207
|
f'No ConfigMap exist for cluster with the name {resources_config_map}.'
|
|
@@ -229,7 +224,7 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
|
|
|
229
224
|
returns system characteristics
|
|
230
225
|
"""
|
|
231
226
|
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
|
|
232
|
-
cluster_config_map = get_cluster_configmap(
|
|
227
|
+
cluster_config_map = get_cluster_configmap(resources_configmap_name)
|
|
233
228
|
|
|
234
229
|
if cluster_config_map is None:
|
|
235
230
|
return None
|
|
@@ -251,7 +246,7 @@ def get_cluster_capacity_type(args) -> CapacityType | None:
|
|
|
251
246
|
returns system characteristics
|
|
252
247
|
"""
|
|
253
248
|
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
|
|
254
|
-
cluster_config_map = get_cluster_configmap(
|
|
249
|
+
cluster_config_map = get_cluster_configmap(metadata_configmap_name)
|
|
255
250
|
|
|
256
251
|
if cluster_config_map is None:
|
|
257
252
|
return None
|
xpk/core/scheduling.py
CHANGED
|
@@ -36,7 +36,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
|
|
|
36
36
|
returns true if workload can schedule, otherwise returns false.
|
|
37
37
|
"""
|
|
38
38
|
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
|
|
39
|
-
cluster_config_map = get_cluster_configmap(
|
|
39
|
+
cluster_config_map = get_cluster_configmap(resources_configmap_name)
|
|
40
40
|
|
|
41
41
|
# Prevents workload creation failure for existing clusters with no ConfigMap
|
|
42
42
|
if cluster_config_map is None:
|
|
@@ -291,3 +291,21 @@ def create_tpu_topology(
|
|
|
291
291
|
):
|
|
292
292
|
return f'{system.topology}'
|
|
293
293
|
return ''
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
|
|
297
|
+
"""Generates subslicing annotations.
|
|
298
|
+
|
|
299
|
+
Args:
|
|
300
|
+
sub_slicing_topology: subslice topology.
|
|
301
|
+
|
|
302
|
+
Returns:
|
|
303
|
+
Annotations to be rendered in deployment yaml.
|
|
304
|
+
"""
|
|
305
|
+
return [
|
|
306
|
+
(
|
|
307
|
+
'kueue.x-k8s.io/podset-required-topology:'
|
|
308
|
+
f' "google.com/gke-tpu-slice-{sub_slicing_topology}-id"'
|
|
309
|
+
),
|
|
310
|
+
f'cloud.google.com/gke-tpu-slice-topology: {sub_slicing_topology}',
|
|
311
|
+
]
|
|
xpk/core/scheduling_test.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .scheduling import create_sub_slicing_annotations
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_create_sub_slicing_annotations_returns_valid_annotations():
|
|
21
|
+
subslicing_topology = '2x2'
|
|
22
|
+
|
|
23
|
+
result = create_sub_slicing_annotations(subslicing_topology)
|
|
24
|
+
|
|
25
|
+
assert result == [
|
|
26
|
+
(
|
|
27
|
+
'kueue.x-k8s.io/podset-required-topology:'
|
|
28
|
+
' "google.com/gke-tpu-slice-2x2-id"'
|
|
29
|
+
),
|
|
30
|
+
'cloud.google.com/gke-tpu-slice-topology: 2x2',
|
|
31
|
+
]
|