xpk 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +109 -0
- xpk/commands/cluster.py +784 -0
- xpk/commands/cluster_gcluster.py +185 -0
- xpk/commands/info.py +245 -0
- xpk/commands/inspector.py +363 -0
- xpk/commands/job.py +197 -0
- xpk/commands/kind.py +253 -0
- xpk/commands/shell.py +120 -0
- xpk/commands/version.py +39 -0
- xpk/commands/workload.py +692 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +61 -0
- xpk/core/blueprint/blueprint_generator.py +652 -0
- xpk/core/cluster_private.py +197 -0
- xpk/core/commands.py +352 -0
- xpk/core/core.py +2824 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/gcluster_manager.py +158 -0
- xpk/core/kjob.py +205 -0
- xpk/core/kueue.py +352 -0
- xpk/core/nap.py +349 -0
- xpk/core/pathways.py +298 -0
- xpk/core/ray.py +222 -0
- xpk/core/system_characteristics.py +1395 -0
- xpk/core/workload.py +133 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +109 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
- xpk/main.py +73 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +184 -0
- xpk/parser/cluster.py +621 -0
- xpk/parser/common.py +71 -0
- xpk/parser/core.py +109 -0
- xpk/parser/info.py +63 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +126 -0
- xpk/parser/kind.py +94 -0
- xpk/parser/shell.py +50 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +684 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +85 -0
- xpk/utils/yaml.py +30 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/METADATA +307 -38
- xpk-0.6.0.dist-info/RECORD +57 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
- xpk-0.6.0.dist-info/entry_points.txt +2 -0
- xpk-0.4.0.dist-info/RECORD +0 -7
- xpk-0.4.0.dist-info/entry_points.txt +0 -2
- xpk.py +0 -7218
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
- {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
xpk/core/pathways.py
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
18
|
+
from .core import (
|
|
19
|
+
AcceleratorType,
|
|
20
|
+
get_all_nodepools_programmatic,
|
|
21
|
+
get_user_workload_container,
|
|
22
|
+
zone_to_region,
|
|
23
|
+
)
|
|
24
|
+
from .system_characteristics import SystemCharacteristics
|
|
25
|
+
|
|
26
|
+
# Maps the TPU generation prefix of a device_type (e.g. 'v5p' from 'v5p-8')
# to the instance-type name Pathways expects; consumed by
# get_pathways_expected_tpu_type.
PathwaysExpectedInstancesMap = {
    'v6e': 'tpuv6e',
    'v5p': 'tpuv5',
    'v5litepod': 'tpuv5e',
    'v4': 'tpuv4',
    'v3': 'tpuv3',
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_pathways_worker_args(args) -> str:
  """Arguments for the Pathways workers.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: yaml containing arguments for the Pathways workers, or '' when
    Pathways is not in use.
  """
  if not args.use_pathways:
    return ''
  # NOTE(review): the continuation-line indentation of this literal is
  # embedded verbatim into the workload YAML — confirm it matches the
  # surrounding template.
  arg_template = """- --server_port=29001
- --resource_manager_address={rm_address}
- --gcs_scratch_location={args.pathways_gcs_location}"""
  return arg_template.format(args=args, rm_address=get_rm_address(args))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_pathways_proxy_args(args) -> str:
  """Arguments for the Pathways proxy.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: yaml containing arguments for the Pathways proxy, or '' when
    Pathways is not in use.
  """
  if not args.use_pathways:
    return ''
  # NOTE(review): the continuation-line indentation of this literal is
  # embedded verbatim into the workload YAML — confirm it matches the
  # surrounding template.
  arg_template = """- --server_port=29000
- --resource_manager_address={rm_address}
- --gcs_scratch_location={args.pathways_gcs_location}"""
  return arg_template.format(args=args, rm_address=get_rm_address(args))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def add_pw_resource_flavors(args):
  """Add resource flavors required for Pathways enabled clusters.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: Kueue ResourceFlavor manifests for the three Pathways CPU
    nodepools, or '' when Pathways is not enabled.
  """
  if not args.enable_pathways:
    return ''
  return """apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: cpu-rm
spec:
  nodeLabels:
    cloud.google.com/gke-nodepool: cpu-rm-np
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: cpu-proxy
spec:
  nodeLabels:
    cloud.google.com/gke-nodepool: cpu-proxy-np
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: cpu-user
spec:
  nodeLabels:
    cloud.google.com/gke-nodepool: cpu-user-np
---"""
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def add_pw_resources_to_kueue(args):
  """Add resource flavors required for Pathways, to the cluster queue.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: a resource group covering the Pathways CPU flavors, or '' when
    Pathways is not enabled.
  """
  if not args.enable_pathways:
    return ''
  # NOTE(review): the indentation inside this literal is embedded verbatim
  # into the ClusterQueue manifest — confirm it matches the surrounding
  # template.
  return """- coveredResources: ["cpu", "memory"]
  flavors:
  - name: cpu-rm
    resources:
    - name: "cpu"
      nominalQuota: 80
    - name: "memory"
      nominalQuota: 160G
  - name: cpu-proxy
    resources:
    - name: "cpu"
      nominalQuota: 480
    - name: "memory"
      nominalQuota: 2000G
  - name: cpu-user
    resources:
    - name: "cpu"
      nominalQuota: 480
    - name: "memory"
      nominalQuota: 2000G"""
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def ensure_pathways_workload_prerequisites(args, system) -> bool:
  """Check all Pathways workload prerequisites and set necessary args.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    True once conditions satisfy and variables are set. Exits otherwise.
  """
  # A docker command is mandatory unless the workload runs headless.
  if not args.headless and args.command is None:
    xpk_print(
        'Please provide a command using "--command" for the docker container to'
        ' execute. Command is not required if you wish to run Pathways'
        ' workloads in headless mode (`xpk workload create-pathways'
        ' --headless`).'
    )
    xpk_exit(1)

  # The Pathways CPU nodepools only exist when the cluster was made with
  # create-pathways.
  node_pools = get_all_nodepools_programmatic(args)
  required_cpu_pools = {'cpu-user-np', 'cpu-rm-np', 'cpu-proxy-np'}
  if not required_cpu_pools <= set(node_pools[0]):
    xpk_print(
        'Cluster needs to be created with `xpk create-pathways` to run'
        ' Pathways workloads.'
    )
    xpk_exit(1)

  # Pathways currently supports TPU accelerators only.
  if system.accelerator_type != AcceleratorType['TPU']:
    xpk_print('Currently, Pathways workloads can only be run on TPUs.')
    xpk_exit(1)

  # Surface the proxy address for helper methods and user display.
  args.pathways_proxy_address = get_proxy_address(args)

  # Headless runs are anchored on the proxy job; otherwise on the user job.
  args.targetReplicatedJob = 'proxy' if args.headless else 'main'

  # Always report user code failures back to JobSet.
  args.restart_on_user_code_failure = True

  return True
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def get_pathways_unified_query_link(args) -> str:
  """Get the unified query link for the pathways workload."""
  # Pods of interest: the three Pathways jobs plus the first worker pod.
  pod_name_terms = [
      f'"{args.workload}-{suffix}-0"' for suffix in ('main', 'rm', 'proxy')
  ]
  pod_name_terms.append('worker-0-0')
  pod_filter = '%20OR%20'.join(pod_name_terms)
  # %3D is a url-encoded '=', %0A a url-encoded newline.
  query = (
      'resource.type%3D"k8s_container"%0A'
      f'resource.labels.project_id%3D"{args.project}"%0A'
      f'resource.labels.location%3D"{zone_to_region(args.zone)}"%0A'
      f'resource.labels.cluster_name%3D"{args.cluster}"%0A'
      f'resource.labels.pod_name:{pod_filter}%0A'
      'severity>%3DDEFAULT'
  )
  return f'https://console.cloud.google.com/logs/query;query={query}'
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
  """Arguments for the Pathways resource manager.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics of the target TPU slice.

  Returns:
    str: yaml containing arguments for the Pathways resource manager, or ''
    when Pathways is not in use.
  """
  if not args.use_pathways:
    return ''
  # NOTE(review): the continuation-line indentation of this literal is
  # embedded verbatim into the workload YAML — confirm it matches the
  # surrounding template.
  arg_template = """- --server_port=29001
- --gcs_scratch_location={args.pathways_gcs_location}
- --node_type=resource_manager
- --instance_count={instance_count}
- --instance_type={instance_type}"""
  tpu_type = get_pathways_expected_tpu_type(system.device_type)
  return arg_template.format(
      args=args,
      instance_count=args.num_slices,
      instance_type=f'{tpu_type}:{system.topology}',
  )
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
  """Create a user workload container for Pathways.

  Don't create one for Pathways headless mode.

  Args:
    args: user provided args.
    system: system characteristics.

  Returns:
    str: the user workload ReplicatedJob as a YAML string, or '' when
    running in headless mode.
  """
  if args.headless:
    return ''
  # NOTE(review): the indentation inside this literal is embedded verbatim
  # into the JobSet manifest — confirm it matches the surrounding template.
  job_template = """- name: main
  replicas: 1
  template:
    metadata:
      labels:
        xpk.google.com/workload: {args.workload}
    spec:
      backoffLimit: 0
      completions: 1
      parallelism: 1
      template:
        spec:
          containers:
          {container}
          nodeSelector:
            cloud.google.com/gke-nodepool: cpu-user-np
          restartPolicy: OnFailure
          volumes:
          - hostPath:
              path: /tmp
              type: DirectoryOrCreate
            name: shared-tmp"""
  container, _ = get_user_workload_container(args, system)
  return job_template.format(args=args, container=container)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def get_rm_address(args) -> str:
  """Generates the Pathways resource manager address.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: Fully qualified RM address (pod DNS name plus the RM port).
  """
  return f'{args.workload}-rm-0-0.{args.workload}:29001'
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def get_proxy_address(args) -> str:
  """Generates the Pathways proxy address.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str: Fully qualified proxy address (grpc scheme, pod DNS name, port).
  """
  return f'grpc://{args.workload}-proxy-0-0.{args.workload}:29000'
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def get_pathways_expected_tpu_type(device_type: str) -> str:
  """Returns the device type expected by Pathways.

  Args:
    device_type: the system characteristic device type (e.g. 'v5p-8').

  Returns:
    str: the device type expected by pathways. Exits with an error message
    when the device type is unknown.
  """
  raw_type = device_type.split('-')[0].lower()
  # Use .get() so an unknown generation reaches the user-friendly error
  # below; a plain [] lookup raised an uncaught KeyError and made the
  # error branch unreachable.
  pathways_expected_instance = PathwaysExpectedInstancesMap.get(raw_type)
  if not pathways_expected_instance:
    xpk_print(
        f'Passed in device_type {device_type} is incorrect. Please pass in a'
        ' valid device type'
    )
    xpk_exit(1)
  return pathways_expected_instance
|
xpk/core/ray.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
19
|
+
from ..utils.file import write_tmp_file
|
|
20
|
+
from .commands import run_command_for_value, run_command_with_updates_retry
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Fraction of a node's allocatable CPU/memory requested by the Ray head pod.
HEAD_CPU = 0.5
# Fraction of a node's allocatable CPU/memory requested by each Ray worker.
WORKER_CPU = 0.9
# Container ports exposed on the Ray head pod.
GCS_SERVER = 6379
DASHBOARD = 8265
CLIENT = 10001
# presumably the multislice coordination port — confirm against Ray docs
MULTISLICE = 8081

# RayCluster manifest template, filled in by install_ray_cluster via
# str.format. '{{}}' escapes a literal YAML '{}'.
# NOTE(review): the YAML indentation below was reconstructed — confirm it
# matches the applied manifest.
ray_cluster_crd_yaml = """apiVersion: v1
kind: Namespace
metadata:
  name: ray
---
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster
  namespace: ray
spec:
  rayVersion: '{version}'
  headGroupSpec:
    rayStartParams: {{}}
    #pod template
    template:
      spec:
        containers:
        - name: ray-head
          image: rayproject/ray:{version}
          resources:
            limits:
              cpu: {head_cpu}
              memory: {head_mem}
            requests:
              cpu: {head_cpu}
              memory: {head_mem}
          ports:
          - containerPort: {gcs_server}
            name: gcs-server
          - containerPort: {dashboard} # Ray dashboard
            name: dashboard
          - containerPort: {client}
            name: client
          - containerPort: {multislice}
            name: multislice
  workerGroupSpecs:
  - replicas: {replicas} # TODO: Set min and max replicas
    numOfHosts: {num_hosts}
    minReplicas: {replicas}
    maxReplicas: {replicas}
    groupName: workergroup0
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: ray-worker
          image: rayproject/ray:{version}
          resources:
            limits:
              cpu: {worker_cpu}
              google.com/tpu: {chips_per_vm}
              memory: {worker_mem}
            requests:
              cpu: {worker_cpu}
              google.com/tpu: {chips_per_vm}
              memory: {worker_mem}
        nodeSelector:
          cloud.google.com/gke-tpu-accelerator: {accelerator}
          cloud.google.com/gke-tpu-topology: {topology}
"""
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def install_ray_cluster(args, system) -> int:
  """Install a RayCluster on the cluster.

  Any pre-existing RayClusters are removed first, then a manifest sized to
  the available node resources is applied.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    0 if successful and 1 otherwise.
  """
  # Start from a clean slate: remove any RayCluster left by a prior run.
  delete_ray_cluster(args)

  # Size the head pod from the default (CPU) nodepool ...
  head_cpu, head_mem = generate_available_resources(
      'cloud.google.com/gke-nodepool=default-pool', args, HEAD_CPU
  )
  # ... and the workers from the TPU nodepool.
  worker_cpu, worker_mem = generate_available_resources(
      f'cloud.google.com/gke-tpu-accelerator={system.gke_accelerator}',
      args,
      WORKER_CPU,
  )

  manifest = ray_cluster_crd_yaml.format(
      accelerator=system.gke_accelerator,
      topology=system.topology,
      chips_per_vm=system.chips_per_vm,
      num_hosts=system.vms_per_slice,
      replicas=args.num_slices,
      version=args.ray_version,
      worker_cpu=worker_cpu,
      worker_mem=worker_mem,
      head_cpu=head_cpu,
      head_mem=head_mem,
      gcs_server=GCS_SERVER,
      dashboard=DASHBOARD,
      client=CLIENT,
      multislice=MULTISLICE,
  )

  manifest_file = write_tmp_file(manifest)
  task = 'Applying RayCluster'
  return_code = run_command_with_updates_retry(
      f'kubectl apply -f {str(manifest_file.file.name)}',
      task,
      args,
      num_retry_attempts=1,
  )
  if return_code != 0:
    xpk_print(f'{task} not successful.')
    xpk_exit(return_code)
  return return_code
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def delete_ray_cluster(args) -> None:
  """Delete all RayClusters on the cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None
  """
  task = 'Deleting old RayCluster'
  return_code = run_command_with_updates_retry(
      'kubectl delete rayclusters -n ray --all',
      task,
      args,
      num_retry_attempts=1,
  )
  if return_code != 0:
    xpk_print(f'{task} not successful.')
    xpk_exit(return_code)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def generate_available_resources(label, args, percent) -> tuple:
  """Generate the available resources for the nodes that match the given label.

  Reads the allocatable cpu/memory of the first matching node via kubectl
  and scales each quantity by `percent`.

  Args:
    label: the label used to match the appropriate nodes
    args: user provided arguments for running the command
    percent: the percent of the available resources to use

  Returns:
    A tuple with the available cpu and memory (as Kubernetes quantity
    strings, e.g. '4' or '100Gi'). Exits on a kubectl output parse failure.
  """

  command = (
      f"kubectl get nodes -l {label} -o jsonpath='{{.items[0].metadata.name}}'"
  )
  task = f'Getting nodes with label {label}'
  _, node_name = run_command_for_value(command, task, args)

  command = (
      f"kubectl get node {node_name} -o jsonpath='{{.status.allocatable.cpu}}'"
  )
  task = 'Fetching available CPU on node'
  _, available_cpu = run_command_for_value(command, task, args)
  # The unit suffix is optional: allocatable CPU is commonly a bare core
  # count such as '8' (a mandatory '[a-zA-Z]+' suffix would wrongly reject
  # it), while millicore values like '7910m' carry one.
  match = re.match(r'(\d+)([a-zA-Z]*)', available_cpu)
  if not match:
    xpk_print(
        'Could not find a regex match for allocatable cpu on TPU node'
        f' {node_name}'
    )
    xpk_exit(1)
  value, units = match.group(1), match.group(2)
  # Scale down and truncate to an integer quantity, keeping the unit.
  cpu_value = int(int(value) * percent)
  adjusted_available_cpu = str(cpu_value) + units

  command = (
      f'kubectl get node {node_name} -o'
      " jsonpath='{.status.allocatable.memory}'"
  )
  task = 'Fetching available memory on node'
  _, available_memory = run_command_for_value(command, task, args)
  # Memory is normally suffixed (e.g. '197Gi'), but accept a bare byte
  # count too.
  match = re.match(r'(\d+)([a-zA-Z]*)', available_memory)
  if not match:
    xpk_print(
        'Could not find a regex match for allocatable memory on TPU node'
        f' {node_name}'
    )
    xpk_exit(1)
  value, units = match.group(1), match.group(2)
  memory_value = int(int(value) * percent)
  adjusted_available_memory = str(memory_value) + units

  return adjusted_available_cpu, adjusted_available_memory
|