xpk 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +131 -0
- xpk/commands/cluster.py +808 -0
- xpk/commands/cluster_gcluster.py +269 -0
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +243 -0
- xpk/commands/inspector.py +357 -0
- xpk/commands/job.py +199 -0
- xpk/commands/kind.py +283 -0
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +140 -0
- xpk/commands/storage.py +267 -0
- xpk/commands/version.py +27 -0
- xpk/commands/workload.py +889 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +62 -0
- xpk/core/blueprint/blueprint_generator.py +708 -0
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +200 -0
- xpk/core/commands.py +356 -0
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +176 -0
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +444 -0
- xpk/core/kueue.py +358 -0
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +361 -0
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +377 -0
- xpk/core/ray.py +222 -0
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +1432 -0
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +341 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +129 -0
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
- xpk/main.py +75 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +43 -0
- xpk/parser/cluster.py +662 -0
- xpk/parser/common.py +259 -0
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +135 -0
- xpk/parser/info.py +64 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +147 -0
- xpk/parser/kind.py +95 -0
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +59 -0
- xpk/parser/storage.py +316 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +726 -0
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +88 -0
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- xpk/utils/yaml.py +30 -0
- {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/METADATA +456 -32
- xpk-0.7.0.dist-info/RECORD +92 -0
- {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/WHEEL +1 -1
- xpk-0.7.0.dist-info/entry_points.txt +2 -0
- xpk-0.5.0.dist-info/RECORD +0 -7
- xpk-0.5.0.dist-info/entry_points.txt +0 -2
- xpk.py +0 -7282
- {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/LICENSE +0 -0
- {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class RemoteStateClient(ABC):
  """Abstract interface for clients that manage remote cluster state.

  Every method below is abstract, so this class cannot be instantiated
  directly; a concrete subclass has to override all three.
  """

  @abstractmethod
  def upload_state(self) -> None:
    """Upload state to remote storage.

    The base implementation does nothing and returns None.
    """

  @abstractmethod
  def download_state(self) -> None:
    """Download state from remote storage.

    The base implementation does nothing and returns None.
    """

  @abstractmethod
  def check_remote_state_exists(self) -> bool:
    """Report whether remote state exists (presumably in remote storage).

    Returns:
      The base implementation always returns False.
    """
    return False
|
xpk/core/resources.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
|
|
19
|
+
from ..utils.console import xpk_print
|
|
20
|
+
from ..utils.file import write_tmp_file
|
|
21
|
+
from .capacity import (
|
|
22
|
+
AUTOPROVISIONING_CONFIG_MAXIMUM_KEY,
|
|
23
|
+
AUTOPROVISIONING_CONFIG_MINIMUM_KEY,
|
|
24
|
+
AUTOPROVISIONING_CONFIG_VALUE,
|
|
25
|
+
CAPACITY_TYPE_CONFIG_KEY,
|
|
26
|
+
RESERVATION_CONFIG_KEY,
|
|
27
|
+
CapacityType,
|
|
28
|
+
get_capacity_type,
|
|
29
|
+
)
|
|
30
|
+
from .commands import run_command_for_value, run_commands
|
|
31
|
+
from .config import XPK_CURRENT_VERSION
|
|
32
|
+
from .system_characteristics import AcceleratorType, get_system_characteristics_by_device_type, SystemCharacteristics
|
|
33
|
+
|
|
34
|
+
# Suffixes joined to '<cluster name>-' to form the names of the two ConfigMaps
# handled in this module (cluster resources and cluster metadata).
CLUSTER_RESOURCES_CONFIGMAP = 'resources-configmap'
CLUSTER_METADATA_CONFIGMAP = 'metadata-configmap'

# ConfigMap manifest template, filled in with str.format(name=..., data=...).
# NOTE(review): the leading whitespace of the lines inside this literal is not
# visible in this view; `name:` and `{data}` presumably need to be indented
# under `metadata:` / `data:` for the manifest to be valid YAML — confirm
# against the packaged file before relying on the text below.
CLUSTER_CONFIGMAP_YAML = """kind: ConfigMap
apiVersion: v1
metadata:
name: {name}
data:
{data}
"""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class AutoprovisioningConfig:
  """Config used in autoprovisioning."""

  # NOTE(review): presumably the path of the autoprovisioning config file;
  # confirm against the code that constructs this object.
  config_filename: str
  # Chip bounds; create_cluster_configmaps stores both values in the
  # cluster resources ConfigMap.
  minimum_chips: int
  maximum_chips: int
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
  """Run the Get GKE Cluster ConfigMap request and parse its data.

  Args:
    args: user provided arguments for running the command.
    configmap_name: name of the configmap.

  Returns:
    key:value pairs stored in cluster ConfigMap. An empty dict is returned
    when the command succeeds but prints no `map[...]` payload or an empty
    one. None is returned when the kubectl command itself fails.
  """
  command = (
      'kubectl get configmap'
      f' {configmap_name} -o=custom-columns="ConfigData:data" --no-headers=true'
  )

  return_code, return_value = run_command_for_value(
      command, 'GKE Cluster Get ConfigMap', args
  )
  if return_code != 0:
    xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
    return None

  config_map = {}
  return_value = return_value.strip()

  # Format of ConfigMap: map[key1:value1 key2:value2]
  prefix = 'map['
  start = return_value.find(prefix)
  if start == -1:
    # Nothing that looks like a data map was printed (e.g. a ConfigMap
    # without data); treat it as empty instead of raising ValueError.
    return config_map

  payload = return_value[start + len(prefix) :]
  if payload.endswith(']'):
    payload = payload[:-1]

  # split() without arguments ignores repeated blanks and yields no tokens for
  # an empty `map[]`, which previously broke the two-value unpacking.
  for config in payload.split():
    # Split on the first ':' only so that values containing ':' stay intact.
    key, separator, value = config.partition(':')
    if not separator:
      # Not a key:value token (e.g. a fragment of a value containing blanks).
      continue
    config_map[key] = value
  return config_map
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def create_cluster_configmaps(
    args,
    system,
    tensorboard_config: dict,
    autoprovisioning_config: AutoprovisioningConfig | None,
) -> int:
  """Build the resources and metadata ConfigMaps and apply them to the cluster.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.
    tensorboard_config: map that contains Vertex Tensorboard name, id and
      location
    autoprovisioning_config: Config used in autoprovisioning.

  Returns:
    0 if successful. If the capacity type cannot be determined, the non-zero
    code reported by get_capacity_type is returned; otherwise the result of
    create_or_update_cluster_configmap is returned.
  """
  # NOTE(review): every entry after the first is prefixed with a newline plus
  # indentation so it lines up under `data:` in CLUSTER_CONFIGMAP_YAML —
  # confirm the indent width agrees with the template.

  # ConfigMap to store resources available in the cluster.
  device_type = system.device_type
  if system.accelerator_type == AcceleratorType['GPU']:
    resources_data = f'{device_type}: "{int(args.num_nodes)}"'
  elif args.enable_pathways or not (
      args.enable_autoprovisioning and autoprovisioning_config
  ):
    # Fixed-size cluster (autoprovisioning is not supported with Pathways):
    # record the total number of VMs for the device type.
    resources_data = (
        f'{device_type}: "{int(args.num_slices) * system.vms_per_slice}"'
    )
  else:
    # Auto provisioning will have variable topologies for a gke accelerator
    # type, so the accelerator is recorded together with the chip bounds.
    resources_data = (
        f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
        f'\n {AUTOPROVISIONING_CONFIG_MINIMUM_KEY}:'
        f' "{autoprovisioning_config.minimum_chips}"'
        f'\n {AUTOPROVISIONING_CONFIG_MAXIMUM_KEY}:'
        f' "{autoprovisioning_config.maximum_chips}"'
    )
  resources_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  resources_yml = CLUSTER_CONFIGMAP_YAML.format(
      args=args, name=resources_name, data=resources_data
  )

  # ConfigMap to store cluster metadata: XPK version first, then the Vertex
  # Tensorboard information.
  metadata_entries = [f'xpk_version: {XPK_CURRENT_VERSION}']
  metadata_entries.extend(
      f'\n {key}: "{value}"' for key, value in tensorboard_config.items()
  )

  # Capacity type, plus the reservation ID if applicable.
  capacity_type, return_code = get_capacity_type(args)
  if return_code != 0:
    xpk_print('Unable to determine capacity type.')
    return return_code
  metadata_entries.append(
      f'\n {CAPACITY_TYPE_CONFIG_KEY}: {capacity_type.name}'
  )
  if capacity_type == CapacityType.RESERVATION:
    metadata_entries.append(f'\n {RESERVATION_CONFIG_KEY}: {args.reservation}')

  metadata_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
  metadata_yml = CLUSTER_CONFIGMAP_YAML.format(
      args=args, name=metadata_name, data=''.join(metadata_entries)
  )

  return create_or_update_cluster_configmap(
      {resources_name: resources_yml, metadata_name: metadata_yml}
  )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
  """Apply each ConfigMap manifest to the cluster with `kubectl apply`.

  Every manifest is first written to a temporary file; the resulting apply
  commands are then handed to run_commands as one batch.

  Args:
    configmap_yml: dict containing ConfigMap name and yml string.

  Returns:
    0 if successful, 1 otherwise.
  """
  # One temporary manifest file per ConfigMap, in dict order.
  manifest_files = [
      write_tmp_file(yml_string) for yml_string in configmap_yml.values()
  ]
  commands = [
      f'kubectl apply -f {str(manifest.file.name)}'
      for manifest in manifest_files
  ]
  task_names = [
      f'ConfigMap CreateOrUpdate-{configmap_name}'
      for configmap_name in configmap_yml
  ]

  return_code = run_commands(
      commands, 'GKE Cluster CreateOrUpdate ConfigMap(s)', task_names
  )
  if return_code == 0:
    return 0

  xpk_print(
      'GKE Cluster Create/Update ConfigMap(s) request returned ERROR'
      f' {return_code}'
  )
  return 1
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def check_cluster_resources(args, system) -> tuple[bool, bool]:
  """Check if cluster has resources of a specified device_type/gke_accelerator.

  This check will be skipped if the
  <args.cluster>-<CLUSTER_RESOURCES_CONFIGMAP> ConfigMap cannot be read for
  the cluster.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    Tuple of bool, bool
    True if resources in the cluster should be checked, False otherwise.
    True if device_type/gke_accelerator exists in the cluster, False otherwise.
  """
  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  resources_config_map = get_cluster_configmap(args, resources_configmap_name)
  if resources_config_map is None:
    # Report the ConfigMap *name*: the map itself is None on this path, so
    # interpolating it would only ever print "None".
    xpk_print(
        'No ConfigMap exist for cluster with the name'
        f' {resources_configmap_name}.'
        ' Cluster resources check will be skipped.'
    )
    return False, False

  # Either identifier being a key of the ConfigMap counts as a match.
  resource_found = (
      system.device_type in resources_config_map
      or system.gke_accelerator in resources_config_map
  )
  return True, resource_found
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
  """Derive system characteristics from the cluster resources ConfigMap.

  Each key of the ConfigMap is tried as a device type, in order; the first
  one that resolves successfully wins.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The system characteristics of the first key that resolves, or None when
    the ConfigMap cannot be read or no key resolves.
  """
  configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  resources = get_cluster_configmap(args, configmap_name)

  # A missing ConfigMap behaves like an empty one: nothing to resolve.
  lookups = (
      get_system_characteristics_by_device_type(candidate)
      for candidate in (resources or {})
  )
  # The generator is lazy, so lookups stop at the first success (code 0).
  return next(
      (system for system, result_code in lookups if result_code == 0), None
  )
|
xpk/core/scheduling.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..utils.console import xpk_print
|
|
18
|
+
from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
|
|
19
|
+
from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
|
|
20
|
+
from .system_characteristics import (
|
|
21
|
+
AcceleratorType,
|
|
22
|
+
AcceleratorTypeToAcceleratorCharacteristics,
|
|
23
|
+
SystemCharacteristics,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
  """Decide whether the cluster resources ConfigMap allows this workload.

  The check passes when the ConfigMap cannot be read. In autoprovisioning
  mode the requested chip count is compared with the configured maximum;
  otherwise the requested VM count is compared with the VM count recorded
  for the device type.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics

  Returns:
    returns true if workload can schedule, otherwise returns false.
  """
  configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  resources = get_cluster_configmap(args, configmap_name)

  # Prevents workload creation failure for existing clusters with no ConfigMap
  if resources is None:
    xpk_print(
        'No ConfigMap exist for cluster with the name'
        f' {configmap_name}.'
    )
    return True

  # Check for gke accelerator type. An absent key and an empty value are
  # both treated as "missing".
  accelerator_entry = resources.get(system.gke_accelerator)
  accelerator_missing = not accelerator_entry
  if accelerator_missing:
    xpk_print(
        f'Gke Accelerator Type Check: {args.workload} is requesting'
        f' {system.gke_accelerator} but cluster only contains'
        f' {resources.keys()}. '
    )
  elif accelerator_entry == AUTOPROVISIONING_CONFIG_VALUE:
    # Run total chip check when in autoprovisioning mode.
    chip_limit = int(resources[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY])
    chips_requested = get_total_chips_requested_from_args(args, system)
    if chips_requested <= chip_limit:
      return True
    xpk_print(
        f'{args.workload} is requesting {chips_requested} chips but'
        f' the cluster {args.cluster} supports up to {chip_limit}.'
        ' Resize the cluster to support more chips with'
        ' `xpk cluster create --autoprovisioning-max-chips=X ...`'
    )
    return False

  # Check for device type
  device_type = system.device_type
  device_missing = device_type not in resources
  if device_missing:
    xpk_print(
        f'Device Type Check: {args.workload} is requesting {device_type} but '
        f'cluster only contains {resources.keys()}. '
    )

  if device_missing and accelerator_missing:
    xpk_print(
        'Both Device Type and GKE Accelerator Type checks failed.'
        f' XPK will not create the workload {args.workload}.'
    )
    return False

  # Check if the size of the workload will fit in the cluster.
  # NOTE(review): if only the device type is missing (the accelerator key is
  # present with a non-autoprovisioning value) this lookup raises KeyError.
  vm_limit = int(resources[device_type])
  if system.accelerator_type == AcceleratorType['GPU']:
    vms_requested = args.num_nodes
  else:
    vms_requested = args.num_slices * system.vms_per_slice

  if vms_requested > vm_limit:
    xpk_print(
        f'{args.workload} is requesting {args.num_slices} slice/slices of'
        f' {device_type}, which is {vms_requested} VMs, but the'
        f' cluster only contains {vm_limit} VMs of {device_type}.'
        ' XPK will not create this workload.'
    )
    return False

  return True
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def get_total_chips_requested_from_args(
    args, system: SystemCharacteristics
) -> int:
  """Compute how many chips the current request asks for.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    num of chips for the current request.
  """
  # GPU requests are sized by node count, everything else by slice count.
  is_gpu = system.accelerator_type == AcceleratorType['GPU']
  multiplier = args.num_nodes if is_gpu else args.num_slices
  return int(system.vms_per_slice * system.chips_per_vm * multiplier)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def get_cpu_affinity(accelerator_type) -> str:
  """Return node-affinity rules that keep CPU workloads off the default pool.

  Args:
    accelerator_type: TPU / GPU / CPU

  Returns:
    str: yaml containing affinity constraints for CPU accelerators, or an
      empty string for any other accelerator type.
  """
  if accelerator_type != AcceleratorType['CPU']:
    return ''

  # NOTE(review): the leading whitespace of the lines inside this literal is
  # not visible in this view; the nesting must match the template the
  # fragment is inserted into — confirm against the packaged file.
  return """affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: NotIn
values:
- default-pool
"""
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_gpu_scheduler(
    args, system: SystemCharacteristics, autoprovisioning_args: str
) -> tuple[str, int]:
  """Build the scheduler section of a GPU workload for `args.scheduler`.

  Two schedulers are accepted: `gke.io/topology-aware-auto` yields a
  scheduling gate, `default-scheduler` yields a scheduler name together with
  node affinity and node selector settings. Anything else is reported as an
  error.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.
    autoprovisioning_args: a string of arguments for Autoprovisioning.

  Returns:
    str: yaml containing gpu scheduler configuration (empty on error)
    int of 0 if successful and 1 otherwise.
  """
  # NOTE(review): the leading whitespace of the lines inside the YAML literals
  # below is not visible in this view; confirm the nesting against the
  # packaged file.
  if args.scheduler == 'gke.io/topology-aware-auto':
    scheduling_gates = f"""schedulingGates:
- name: "{args.scheduler}-{args.workload}"
"""
    return scheduling_gates, 0

  if args.scheduler == 'default-scheduler':
    template = """schedulerName: {scheduler_name}
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-accelerator
operator: Exists
- key: cloud.google.com/gke-nodepool
operator: In
values: [{node_pool_name}]
nodeSelector:
{accelerator_label}
{machine_label}
{autoprovisioning_args}
"""
    accelerator_label = create_accelerator_label(
        system.accelerator_type, system
    )
    machine_label = create_machine_label(system.accelerator_type, system)
    rendered = template.format(
        scheduler_name=args.scheduler,
        accelerator_label=accelerator_label,
        machine_label=machine_label,
        # Workloads are pinned to the cluster's first node pool.
        node_pool_name=f'{args.cluster}-np-0',
        autoprovisioning_args=autoprovisioning_args,
    )
    return rendered, 0

  xpk_print(
      '--scheduler needs to be set as either `default-scheduler`'
      ' or `gke.io/topology-aware-auto` in order to schedule the'
      ' workloads on GPUs.'
  )
  return '', 1
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def create_accelerator_label(accelerator_type, system) -> str:
  """Build the `<accelerator label>: <gke accelerator>` entry.

  Args:
    accelerator_type: type of accelerator.
    system: system characteristics.

  Returns:
    The accelerator label, or an empty string for CPU accelerators.
  """
  if accelerator_type == AcceleratorType['CPU']:
    return ''

  characteristics = AcceleratorTypeToAcceleratorCharacteristics[
      accelerator_type
  ]
  return f'{characteristics.accelerator_label}: {system.gke_accelerator}'
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def create_machine_label(
    accelerator_type, system, autoprovisioning_enabled: bool = False
) -> str:
  """Build the `<machine label>: <topology>` entry for TPU systems.

  Args:
    accelerator_type: type of accelerator.
    system: system characteristics.
    autoprovisioning_enabled: describes autoprovisioning enablement.

  Returns:
    The machine label for TPUs when autoprovisioning is disabled, otherwise
    an empty string.
  """
  # Only TPUs without autoprovisioning get a machine label.
  if accelerator_type != AcceleratorType['TPU'] or autoprovisioning_enabled:
    return ''

  characteristics = AcceleratorTypeToAcceleratorCharacteristics[
      accelerator_type
  ]
  return f'{characteristics.machine_label}: {system.topology}'
|