kalavai-client 0.5.0 (py2.py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kalavai_client/__init__.py +2 -0
- kalavai_client/__main__.py +5 -0
- kalavai_client/assets/apps.yaml +201 -0
- kalavai_client/assets/apps_values.yaml +83 -0
- kalavai_client/assets/docker-compose-template.yaml +55 -0
- kalavai_client/assets/pool_config_template.yaml +19 -0
- kalavai_client/assets/pool_config_values.yaml +12 -0
- kalavai_client/assets/user_workspace.yaml +19 -0
- kalavai_client/assets/user_workspace_values.yaml +29 -0
- kalavai_client/assets/vpn-template.yaml +13 -0
- kalavai_client/auth.py +68 -0
- kalavai_client/cli.py +1989 -0
- kalavai_client/cluster.py +308 -0
- kalavai_client/utils.py +456 -0
- kalavai_client-0.5.0.dist-info/LICENSE +201 -0
- kalavai_client-0.5.0.dist-info/METADATA +279 -0
- kalavai_client-0.5.0.dist-info/RECORD +19 -0
- kalavai_client-0.5.0.dist-info/WHEEL +4 -0
- kalavai_client-0.5.0.dist-info/entry_points.txt +3 -0
kalavai_client/cli.py
ADDED
@@ -0,0 +1,1989 @@
|
|
1
|
+
from collections import defaultdict
|
2
|
+
import math
|
3
|
+
import os
|
4
|
+
import json
|
5
|
+
import uuid
|
6
|
+
import time
|
7
|
+
import socket
|
8
|
+
from pathlib import Path
|
9
|
+
from getpass import getpass
|
10
|
+
import ipaddress
|
11
|
+
from sys import exit
|
12
|
+
|
13
|
+
import yaml
|
14
|
+
import netifaces as ni
|
15
|
+
import arguably
|
16
|
+
from rich.console import Console
|
17
|
+
|
18
|
+
from kalavai_client.utils import (
|
19
|
+
check_gpu_drivers,
|
20
|
+
run_cmd,
|
21
|
+
user_path,
|
22
|
+
decode_dict,
|
23
|
+
generate_join_token,
|
24
|
+
user_confirm,
|
25
|
+
load_template,
|
26
|
+
store_server_info,
|
27
|
+
generate_table,
|
28
|
+
request_to_server,
|
29
|
+
resource_path,
|
30
|
+
user_path,
|
31
|
+
safe_remove,
|
32
|
+
leave_vpn,
|
33
|
+
join_vpn,
|
34
|
+
load_server_info,
|
35
|
+
user_login,
|
36
|
+
user_logout,
|
37
|
+
get_public_vpns,
|
38
|
+
register_cluster,
|
39
|
+
unregister_cluster,
|
40
|
+
get_public_seeds,
|
41
|
+
validate_join_public_seed,
|
42
|
+
is_storage_compatible,
|
43
|
+
is_watcher_alive,
|
44
|
+
load_user_session,
|
45
|
+
SERVER_IP_KEY,
|
46
|
+
AUTH_KEY,
|
47
|
+
WATCHER_SERVICE_KEY,
|
48
|
+
READONLY_AUTH_KEY,
|
49
|
+
WRITE_AUTH_KEY,
|
50
|
+
PUBLIC_LOCATION_KEY,
|
51
|
+
NODE_NAME_KEY,
|
52
|
+
CLUSTER_NAME_KEY,
|
53
|
+
CLUSTER_IP_KEY,
|
54
|
+
CLUSTER_TOKEN_KEY,
|
55
|
+
WATCHER_PORT_KEY,
|
56
|
+
MANDATORY_TOKEN_FIELDS,
|
57
|
+
USER_NODE_LABEL_KEY,
|
58
|
+
ALLOW_UNREGISTERED_USER_KEY
|
59
|
+
)
|
60
|
+
from kalavai_client.cluster import (
|
61
|
+
dockerCluster
|
62
|
+
)
|
63
|
+
|
64
|
+
|
65
|
+
KALAVAI_PLATFORM_URL = os.getenv("KALAVAI_PLATFORM_URL", "https://platform.kalavai.net")
|
66
|
+
LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
|
67
|
+
VERSION = 1
|
68
|
+
RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
|
69
|
+
CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
|
70
|
+
TEMPLATE_LABEL = "kalavai.job.name"
|
71
|
+
RAY_LABEL = "kalavai.ray.name"
|
72
|
+
PVC_NAME_LABEL = "kalavai.storage.name"
|
73
|
+
DOCKER_COMPOSE_TEMPLATE = resource_path("assets/docker-compose-template.yaml")
|
74
|
+
VPN_COMPOSE_TEMPLATE = resource_path("assets/vpn-template.yaml")
|
75
|
+
POOL_CONFIG_TEMPLATE = resource_path("assets/pool_config_template.yaml")
|
76
|
+
POOL_CONFIG_DEFAULT_VALUES = resource_path("assets/pool_config_values.yaml")
|
77
|
+
USER_WORKSPACE_TEMPLATE = resource_path("assets/user_workspace.yaml")
|
78
|
+
DEFAULT_USER_WORKSPACE_VALUES = resource_path("assets/user_workspace_values.yaml")
|
79
|
+
STORAGE_CLASS_NAME = "local-path"
|
80
|
+
STORAGE_ACCESS_MODE = ["ReadWriteOnce"]
|
81
|
+
STORAGE_CLASS_LABEL = "kalavai.storage.enabled"
|
82
|
+
DEFAULT_STORAGE_NAME = "pool-cache"
|
83
|
+
DEFAULT_STORAGE_SIZE = 20
|
84
|
+
USER_NODE_LABEL = "kalavai.cluster.user"
|
85
|
+
KUBE_VERSION = os.getenv("KALAVAI_KUBE_VERSION", "v1.31.1+k3s1")
|
86
|
+
DEFAULT_FLANNEL_IFACE = os.getenv("KALAVAI_FLANNEL_IFACE", "netmaker")
|
87
|
+
FORBIDEDEN_IPS = ["127.0.0.1"]
|
88
|
+
# kalavai templates
|
89
|
+
HELM_APPS_FILE = resource_path("assets/apps.yaml")
|
90
|
+
HELM_APPS_VALUES = resource_path("assets/apps_values.yaml")
|
91
|
+
# user specific config files
|
92
|
+
DEFAULT_CONTAINER_NAME = "kalavai-seed"
|
93
|
+
CONTAINER_HOST_PATH = user_path("pool/", create_path=True)
|
94
|
+
USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
|
95
|
+
USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
|
96
|
+
USER_HELM_APPS_FILE = user_path("apps.yaml")
|
97
|
+
USER_KUBECONFIG_FILE = user_path("kubeconfig")
|
98
|
+
USER_LOCAL_SERVER_FILE = user_path(".server")
|
99
|
+
USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
|
100
|
+
USER_COOKIE = user_path(".user_cookie.pkl")
|
101
|
+
|
102
|
+
|
103
|
+
console = Console()
|
104
|
+
CLUSTER = dockerCluster(
|
105
|
+
container_name=DEFAULT_CONTAINER_NAME,
|
106
|
+
kube_version=KUBE_VERSION,
|
107
|
+
flannel_iface=DEFAULT_FLANNEL_IFACE,
|
108
|
+
compose_file=USER_COMPOSE_FILE,
|
109
|
+
kubeconfig_file=USER_KUBECONFIG_FILE,
|
110
|
+
poolconfig_file=USER_LOCAL_SERVER_FILE,
|
111
|
+
dependencies_file=USER_HELM_APPS_FILE
|
112
|
+
)
|
113
|
+
|
114
|
+
|
115
|
+
######################
|
116
|
+
## HELPER FUNCTIONS ##
|
117
|
+
######################
|
118
|
+
|
119
|
+
def check_vpn_compatibility():
|
120
|
+
"""Check required packages to join VPN"""
|
121
|
+
logs = []
|
122
|
+
console.log("[white]Checking system requirements...")
|
123
|
+
# netclient
|
124
|
+
try:
|
125
|
+
run_cmd("sudo netclient version >/dev/null 2>&1")
|
126
|
+
except:
|
127
|
+
logs.append("[red]Netmaker not installed. Install instructions:\n")
|
128
|
+
logs.append(" Linux: https://docs.netmaker.io/docs/netclient#linux\n")
|
129
|
+
logs.append(" Windows: https://docs.netmaker.io/docs/netclient#windows\n")
|
130
|
+
logs.append(" MacOS: https://docs.netmaker.io/docs/netclient#mac\n")
|
131
|
+
|
132
|
+
if len(logs) == 0:
|
133
|
+
console.log("[green]System is ready to join a pool")
|
134
|
+
return True
|
135
|
+
else:
|
136
|
+
for log in logs:
|
137
|
+
console.log(log)
|
138
|
+
return False
|
139
|
+
|
140
|
+
def check_seed_compatibility():
|
141
|
+
"""Check required packages to start pools"""
|
142
|
+
logs = []
|
143
|
+
console.log("[white]Checking system requirements...")
|
144
|
+
# docker
|
145
|
+
try:
|
146
|
+
run_cmd("docker version >/dev/null 2>&1")
|
147
|
+
except:
|
148
|
+
logs.append("[red]Docker not installed. Install instructions:\n")
|
149
|
+
logs.append(" Linux: https://docs.docker.com/engine/install/\n")
|
150
|
+
logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
|
151
|
+
|
152
|
+
if len(logs) == 0:
|
153
|
+
console.log("[green]System is ready to start a pool")
|
154
|
+
return True
|
155
|
+
else:
|
156
|
+
for log in logs:
|
157
|
+
console.log(log)
|
158
|
+
return False
|
159
|
+
|
160
|
+
def check_worker_compatibility():
|
161
|
+
"""Check required packages to join pools"""
|
162
|
+
logs = []
|
163
|
+
console.log("[white]Checking system requirements...")
|
164
|
+
# docker
|
165
|
+
try:
|
166
|
+
run_cmd("docker version >/dev/null 2>&1")
|
167
|
+
except:
|
168
|
+
logs.append("[red]Docker not installed. Install instructions:\n")
|
169
|
+
logs.append(" Linux: https://docs.docker.com/engine/install/\n")
|
170
|
+
logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
|
171
|
+
|
172
|
+
if len(logs) == 0:
|
173
|
+
console.log("[green]System is ready to join a pool")
|
174
|
+
return True
|
175
|
+
else:
|
176
|
+
for log in logs:
|
177
|
+
console.log(log)
|
178
|
+
return False
|
179
|
+
|
180
|
+
|
181
|
+
def cleanup_local():
|
182
|
+
# disconnect from private network
|
183
|
+
console.log("Disconnecting from VPN...")
|
184
|
+
try:
|
185
|
+
vpns = leave_vpn()
|
186
|
+
if vpns is not None:
|
187
|
+
for vpn in vpns:
|
188
|
+
console.log(f"You have left {vpn} VPN")
|
189
|
+
except:
|
190
|
+
# no vpn
|
191
|
+
pass
|
192
|
+
console.log("Removing local cache files...")
|
193
|
+
safe_remove(CONTAINER_HOST_PATH)
|
194
|
+
safe_remove(USER_COMPOSE_FILE)
|
195
|
+
safe_remove(USER_VPN_COMPOSE_FILE)
|
196
|
+
safe_remove(USER_HELM_APPS_FILE)
|
197
|
+
safe_remove(USER_KUBECONFIG_FILE)
|
198
|
+
safe_remove(USER_LOCAL_SERVER_FILE)
|
199
|
+
safe_remove(USER_TEMPLATES_FOLDER)
|
200
|
+
|
201
|
+
def pre_join_check(node_name, server_url, server_key):
|
202
|
+
# check with the server that we can connect
|
203
|
+
try:
|
204
|
+
nodes = request_to_server(
|
205
|
+
force_url=server_url,
|
206
|
+
force_key=server_key,
|
207
|
+
method="get",
|
208
|
+
endpoint="/v1/get_nodes",
|
209
|
+
data={"node_names": [node_name]},
|
210
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
211
|
+
user_cookie=USER_COOKIE
|
212
|
+
)
|
213
|
+
return node_name not in nodes.keys()
|
214
|
+
except Exception as e:
|
215
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
216
|
+
return False
|
217
|
+
|
218
|
+
def set_schedulable(schedulable, node_name=load_server_info(data_key=NODE_NAME_KEY, file=USER_LOCAL_SERVER_FILE)):
|
219
|
+
"""
|
220
|
+
Delete job in the cluster
|
221
|
+
"""
|
222
|
+
# deploy template with kube-watcher
|
223
|
+
data = {
|
224
|
+
"schedulable": str(schedulable),
|
225
|
+
"node_names": [node_name]
|
226
|
+
}
|
227
|
+
try:
|
228
|
+
res = request_to_server(
|
229
|
+
method="post",
|
230
|
+
endpoint="/v1/set_node_schedulable",
|
231
|
+
data=data,
|
232
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
233
|
+
user_cookie=USER_COOKIE
|
234
|
+
)
|
235
|
+
console.log(f"{res}")
|
236
|
+
except Exception as e:
|
237
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
238
|
+
|
239
|
+
|
240
|
+
def init_user_workspace(force_namespace=None):
|
241
|
+
|
242
|
+
# load template config and populate with values
|
243
|
+
sidecar_template_yaml = load_template(
|
244
|
+
template_path=USER_WORKSPACE_TEMPLATE,
|
245
|
+
values={},
|
246
|
+
default_values_path=DEFAULT_USER_WORKSPACE_VALUES)
|
247
|
+
|
248
|
+
try:
|
249
|
+
data = {"config": sidecar_template_yaml}
|
250
|
+
if force_namespace is not None:
|
251
|
+
data["force_namespace"] = force_namespace
|
252
|
+
result = request_to_server(
|
253
|
+
method="post",
|
254
|
+
endpoint="/v1/create_user_space",
|
255
|
+
data=data,
|
256
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
257
|
+
user_cookie=USER_COOKIE
|
258
|
+
)
|
259
|
+
console.log(f"Workspace creation (ignore already created warnings): {result}" )
|
260
|
+
except Exception as e:
|
261
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
262
|
+
|
263
|
+
def pool_init(pool_config_values_path=None):
|
264
|
+
"""Deploy configured objects to initialise pool"""
|
265
|
+
if pool_config_values_path is None:
|
266
|
+
return
|
267
|
+
|
268
|
+
# load template config and populate with values
|
269
|
+
sidecar_template_yaml = load_template(
|
270
|
+
template_path=POOL_CONFIG_TEMPLATE,
|
271
|
+
values={},
|
272
|
+
default_values_path=pool_config_values_path)
|
273
|
+
|
274
|
+
try:
|
275
|
+
result = request_to_server(
|
276
|
+
method="post",
|
277
|
+
endpoint="/v1/deploy_generic_model",
|
278
|
+
data={"config": sidecar_template_yaml},
|
279
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
280
|
+
user_cookie=USER_COOKIE
|
281
|
+
)
|
282
|
+
if 'failed' in result and len(result['failed']) > 0:
|
283
|
+
console.log(f"[red]Error when deploying pool config\n\n{result['failed']}")
|
284
|
+
if len(result['successful']) > 0:
|
285
|
+
console.log(f"[green]Deployed pool config!")
|
286
|
+
except Exception as e:
|
287
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
288
|
+
|
289
|
+
def select_ip_address(subnet=None):
|
290
|
+
ips = []
|
291
|
+
retry = 3
|
292
|
+
while len(ips) == 0:
|
293
|
+
for iface in ni.interfaces():
|
294
|
+
try:
|
295
|
+
ip = ni.ifaddresses(iface)[ni.AF_INET][0]['addr']
|
296
|
+
if ip in FORBIDEDEN_IPS:
|
297
|
+
continue
|
298
|
+
if subnet is None or ipaddress.ip_address(ip) in ipaddress.ip_network(subnet):
|
299
|
+
ips.append(ip)
|
300
|
+
except:
|
301
|
+
pass
|
302
|
+
if len(ips) == 1:
|
303
|
+
return ips[0]
|
304
|
+
time.sleep(2)
|
305
|
+
retry -= 1
|
306
|
+
if retry < 0:
|
307
|
+
raise ValueError(f"No IPs available on subnet {subnet}")
|
308
|
+
while True:
|
309
|
+
option = user_confirm(
|
310
|
+
question="Select IP to advertise the node (needs to be visible to other nodes)",
|
311
|
+
options=ips
|
312
|
+
)
|
313
|
+
if option is not None:
|
314
|
+
break
|
315
|
+
else:
|
316
|
+
console.log("[red] Input error")
|
317
|
+
return ips[option]
|
318
|
+
|
319
|
+
def fetch_gpus():
|
320
|
+
data = request_to_server(
|
321
|
+
method="post",
|
322
|
+
endpoint="/v1/get_node_gpus",
|
323
|
+
data={},
|
324
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
325
|
+
user_cookie=USER_COOKIE
|
326
|
+
)
|
327
|
+
return data.items()
|
328
|
+
|
329
|
+
def select_gpus(message):
|
330
|
+
console.log(f"[yellow]{message}")
|
331
|
+
gpu_models = ["Any/None"]
|
332
|
+
gpu_models_full = ["Any/None"]
|
333
|
+
available_gpus = fetch_gpus()
|
334
|
+
for _, gpus in available_gpus:
|
335
|
+
for gpu in gpus["gpus"]:
|
336
|
+
#status = "free" if "ready" in gpu else "busy"
|
337
|
+
memory = math.floor(int(gpu['memory'])/1000)
|
338
|
+
gpu_models.append(gpu['model'])
|
339
|
+
gpu_models_full.append(f"{gpu['model']} ({memory}GB) (in use: {gpus['available'] == 0})" )
|
340
|
+
|
341
|
+
while True:
|
342
|
+
options = user_confirm(
|
343
|
+
question=" ",
|
344
|
+
options=gpu_models_full,
|
345
|
+
multiple=True
|
346
|
+
)
|
347
|
+
if options is not None:
|
348
|
+
if 0 in options:
|
349
|
+
ids = None
|
350
|
+
else:
|
351
|
+
ids = ",".join([gpu_models[i] for i in options])
|
352
|
+
break
|
353
|
+
return ids
|
354
|
+
|
355
|
+
def select_token_type():
|
356
|
+
options = ["Admin", "User (deploy jobs)", "Worker (read only)"]
|
357
|
+
|
358
|
+
while True:
|
359
|
+
choice = user_confirm(
|
360
|
+
question="What type of access are you granting?",
|
361
|
+
options=options,
|
362
|
+
multiple=False
|
363
|
+
)
|
364
|
+
if choice is not None:
|
365
|
+
break
|
366
|
+
return {"admin": choice == 0, "user": choice == 1, "worker": choice == 2}
|
367
|
+
|
368
|
+
def generate_compose_config(role, node_name, ip_address, node_labels, is_public, server=None, token=None):
|
369
|
+
num_gpus = 0
|
370
|
+
try:
|
371
|
+
has_gpus = check_gpu_drivers()
|
372
|
+
if has_gpus:
|
373
|
+
max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
|
374
|
+
num_gpus = user_confirm(
|
375
|
+
question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
|
376
|
+
options=range(max_gpus+1)
|
377
|
+
)
|
378
|
+
except:
|
379
|
+
console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
|
380
|
+
compose_values = {
|
381
|
+
"user_path": user_path(""),
|
382
|
+
"service_name": DEFAULT_CONTAINER_NAME,
|
383
|
+
"pool_ip": server,
|
384
|
+
"token": token,
|
385
|
+
"hostname": node_name,
|
386
|
+
"command": role,
|
387
|
+
"storage_enabled": "True",
|
388
|
+
"ip_address": ip_address,
|
389
|
+
"num_gpus": num_gpus,
|
390
|
+
"k3s_path": f"{CONTAINER_HOST_PATH}/k3s",
|
391
|
+
"etc_path": f"{CONTAINER_HOST_PATH}/etc",
|
392
|
+
"node_labels": " ".join([f"--node-label {key}={value}" for key, value in node_labels.items()]),
|
393
|
+
"flannel_iface": DEFAULT_FLANNEL_IFACE if is_public else None
|
394
|
+
}
|
395
|
+
# generate local config files
|
396
|
+
compose_yaml = load_template(
|
397
|
+
template_path=DOCKER_COMPOSE_TEMPLATE,
|
398
|
+
values=compose_values)
|
399
|
+
with open(USER_COMPOSE_FILE, "w") as f:
|
400
|
+
f.write(compose_yaml)
|
401
|
+
return compose_yaml
|
402
|
+
|
403
|
+
##################
|
404
|
+
## CLI COMMANDS ##
|
405
|
+
##################
|
406
|
+
|
407
|
+
@arguably.command
|
408
|
+
def login(*others, username: str=None):
|
409
|
+
"""
|
410
|
+
[AUTH] (For public clusters only) Log in to Kalavai server.
|
411
|
+
|
412
|
+
Args:
|
413
|
+
*others: all the other positional arguments go here
|
414
|
+
"""
|
415
|
+
console.log(f"Kalavai account details. If you don't have an account, create one at [yellow]{KALAVAI_PLATFORM_URL}")
|
416
|
+
if username is None:
|
417
|
+
username = input("User email: ")
|
418
|
+
password = getpass()
|
419
|
+
user = user_login(
|
420
|
+
user_cookie=USER_COOKIE,
|
421
|
+
username=username,
|
422
|
+
password=password
|
423
|
+
)
|
424
|
+
|
425
|
+
if user is not None:
|
426
|
+
console.log(f"[green]{username} logged in successfully")
|
427
|
+
else:
|
428
|
+
console.log(f"[red]Invalid credentials for {username}")
|
429
|
+
|
430
|
+
return user is not None
|
431
|
+
|
432
|
+
@arguably.command
|
433
|
+
def logout(*others):
|
434
|
+
"""
|
435
|
+
[AUTH] (For public clusters only) Log out of Kalavai server.
|
436
|
+
|
437
|
+
Args:
|
438
|
+
*others: all the other positional arguments go here
|
439
|
+
"""
|
440
|
+
user_logout(
|
441
|
+
user_cookie=USER_COOKIE
|
442
|
+
)
|
443
|
+
console.log("[green]Log out successfull")
|
444
|
+
|
445
|
+
@arguably.command
|
446
|
+
def location__list(*others):
|
447
|
+
"""
|
448
|
+
[AUTH] List public locations on Kalavai
|
449
|
+
"""
|
450
|
+
try:
|
451
|
+
seeds = get_public_vpns(user_cookie=USER_COOKIE)
|
452
|
+
except Exception as e:
|
453
|
+
console.log(f"[red]Error: {str(e)}")
|
454
|
+
console.log("Are you authenticated? Try [yellow]kalavai login")
|
455
|
+
return
|
456
|
+
columns, rows = [], []
|
457
|
+
for idx, seed in enumerate(seeds):
|
458
|
+
columns = seed.keys()
|
459
|
+
rows.append([str(idx)] + list(seed.values()))
|
460
|
+
columns = ["VPN"] + list(columns)
|
461
|
+
table = generate_table(columns=columns, rows=rows)
|
462
|
+
console.log(table)
|
463
|
+
|
464
|
+
@arguably.command
|
465
|
+
def pool__publish(*others, description=None):
|
466
|
+
"""
|
467
|
+
[AUTH] Publish pool to Kalavai platform, where other users may be able to join
|
468
|
+
"""
|
469
|
+
# Check for:
|
470
|
+
# - cluster is up and running
|
471
|
+
# - cluster is connected to vpn (has net token)
|
472
|
+
# - user is authenticated
|
473
|
+
try:
|
474
|
+
CLUSTER.is_seed_node()
|
475
|
+
except Exception as e:
|
476
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
477
|
+
return
|
478
|
+
choices = select_token_type()
|
479
|
+
token = pool__token(**choices)
|
480
|
+
|
481
|
+
if description is None:
|
482
|
+
console.log("[yellow] [Markdown] In a few words (max 500 chars), describe your goals with this cluster. Remember, this is what other users will see to decide whether to share their resources with you, [blue]so inspire them!")
|
483
|
+
description = input(f"(You can edit this later in {KALAVAI_PLATFORM_URL}\n")
|
484
|
+
|
485
|
+
description = description
|
486
|
+
|
487
|
+
try:
|
488
|
+
if not pool__check_token(token=token, public=True):
|
489
|
+
raise ValueError("[red]Cluster must be started with a valid vpn_location to publish")
|
490
|
+
cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
|
491
|
+
|
492
|
+
register_cluster(
|
493
|
+
name=cluster_name,
|
494
|
+
token=token,
|
495
|
+
description=description,
|
496
|
+
user_cookie=USER_COOKIE)
|
497
|
+
console.log(f"[green]Your cluster is now public on {KALAVAI_PLATFORM_URL}")
|
498
|
+
except Exception as e:
|
499
|
+
console.log(f"[red]Error when publishing cluster. {str(e)}")
|
500
|
+
|
501
|
+
@arguably.command
|
502
|
+
def pool__unpublish(cluster_name=None, *others):
|
503
|
+
"""
|
504
|
+
[AUTH] Unpublish pool to Kalavai platform. Cluster and all its workers will still work
|
505
|
+
"""
|
506
|
+
# Check for:
|
507
|
+
# - cluster is up and running
|
508
|
+
# - user is authenticated
|
509
|
+
try:
|
510
|
+
CLUSTER.is_seed_node()
|
511
|
+
except Exception as e:
|
512
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
513
|
+
return
|
514
|
+
|
515
|
+
try:
|
516
|
+
if cluster_name is None:
|
517
|
+
cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
|
518
|
+
unregister_cluster(
|
519
|
+
name=cluster_name,
|
520
|
+
user_cookie=USER_COOKIE)
|
521
|
+
console.log(f"[green]Your cluster has been removed from {KALAVAI_PLATFORM_URL}")
|
522
|
+
except Exception as e:
|
523
|
+
console.log(f"[red]Error when unpublishing cluster. {str(e)}")
|
524
|
+
|
525
|
+
@arguably.command
|
526
|
+
def pool__list(*others, user_only=False):
|
527
|
+
"""
|
528
|
+
[AUTH] List public pools in to Kalavai platform.
|
529
|
+
"""
|
530
|
+
try:
|
531
|
+
seeds = get_public_seeds(
|
532
|
+
user_only=user_only,
|
533
|
+
user_cookie=USER_COOKIE)
|
534
|
+
except Exception as e:
|
535
|
+
console.log(f"[red]Error when loading pools. {str(e)}")
|
536
|
+
return
|
537
|
+
|
538
|
+
for seed in seeds:
|
539
|
+
console.log("[yellow]************************************")
|
540
|
+
for key, value in seed.items():
|
541
|
+
if key == "join_key":
|
542
|
+
continue
|
543
|
+
console.log(f"[yellow]{key}: [green]{value}")
|
544
|
+
print(f"Join key: {seed['join_key']}")
|
545
|
+
console.log("[yellow]************************************")
|
546
|
+
console.log("[white]Use [yellow]kalavai pool join <join key> [white]to join a public pool")
|
547
|
+
|
548
|
+
|
549
|
+
@arguably.command
|
550
|
+
def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=HELM_APPS_VALUES, pool_config_values: str=POOL_CONFIG_DEFAULT_VALUES):
|
551
|
+
"""
|
552
|
+
Start Kalavai pool and start/resume sharing resources.
|
553
|
+
|
554
|
+
Args:
|
555
|
+
*others: all the other positional arguments go here
|
556
|
+
"""
|
557
|
+
|
558
|
+
if not check_seed_compatibility():
|
559
|
+
return
|
560
|
+
|
561
|
+
if CLUSTER.is_cluster_init():
|
562
|
+
console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
|
563
|
+
return
|
564
|
+
|
565
|
+
# User acknowledgement
|
566
|
+
option = user_confirm(
|
567
|
+
question="Kalavai will now create a pool and a local worker using docker. This won't modify your system. Are you happy to proceed?",
|
568
|
+
options=["no", "yes"]
|
569
|
+
)
|
570
|
+
if option == 0:
|
571
|
+
console.log("Installation was cancelled and did not complete.")
|
572
|
+
return
|
573
|
+
|
574
|
+
# if only registered users are allowed, check user has logged in
|
575
|
+
user = defaultdict(lambda: None)
|
576
|
+
if only_registered_users or location is not None:
|
577
|
+
user = user_login(user_cookie=USER_COOKIE)
|
578
|
+
if user is None:
|
579
|
+
console.log("[white]--only-registered-users [red]or [white]--location[red] can only be used if the host is authenticated. Run [yellow]kalavai login[red] to authenticate")
|
580
|
+
exit()
|
581
|
+
|
582
|
+
# join private network if provided
|
583
|
+
vpn = defaultdict(lambda: None)
|
584
|
+
node_labels = {
|
585
|
+
STORAGE_CLASS_LABEL: is_storage_compatible()
|
586
|
+
}
|
587
|
+
if location is not None:
|
588
|
+
console.log("Joining private network")
|
589
|
+
try:
|
590
|
+
if not check_vpn_compatibility():
|
591
|
+
return
|
592
|
+
vpn = join_vpn(
|
593
|
+
location=location,
|
594
|
+
user_cookie=USER_COOKIE)
|
595
|
+
node_labels[USER_NODE_LABEL] = user["username"]
|
596
|
+
except Exception as e:
|
597
|
+
console.log(f"[red]Error when joining network: {str(e)}")
|
598
|
+
return
|
599
|
+
|
600
|
+
if ip_address is None:
|
601
|
+
console.log(f"Scanning for valid IPs (subnet {vpn['subnet']})...")
|
602
|
+
ip_address = select_ip_address(subnet=vpn["subnet"])
|
603
|
+
console.log(f"Using {ip_address} address for server")
|
604
|
+
|
605
|
+
auth_key = str(uuid.uuid4())
|
606
|
+
write_auth_key = str(uuid.uuid4())
|
607
|
+
readonly_auth_key = str(uuid.uuid4())
|
608
|
+
watcher_port = 30001
|
609
|
+
watcher_service = f"{ip_address}:{watcher_port}"
|
610
|
+
values = {
|
611
|
+
CLUSTER_NAME_KEY: cluster_name,
|
612
|
+
CLUSTER_IP_KEY: ip_address,
|
613
|
+
AUTH_KEY: auth_key,
|
614
|
+
READONLY_AUTH_KEY: readonly_auth_key,
|
615
|
+
WRITE_AUTH_KEY: write_auth_key,
|
616
|
+
WATCHER_PORT_KEY: watcher_port,
|
617
|
+
WATCHER_SERVICE_KEY: watcher_service,
|
618
|
+
USER_NODE_LABEL_KEY: USER_NODE_LABEL,
|
619
|
+
ALLOW_UNREGISTERED_USER_KEY: not only_registered_users
|
620
|
+
}
|
621
|
+
|
622
|
+
store_server_info(
|
623
|
+
server_ip=ip_address,
|
624
|
+
auth_key=auth_key,
|
625
|
+
readonly_auth_key=readonly_auth_key,
|
626
|
+
write_auth_key=write_auth_key,
|
627
|
+
file=USER_LOCAL_SERVER_FILE,
|
628
|
+
watcher_service=watcher_service,
|
629
|
+
node_name=socket.gethostname(),
|
630
|
+
cluster_name=cluster_name,
|
631
|
+
public_location=location,
|
632
|
+
user_api_key=user["api_key"])
|
633
|
+
|
634
|
+
# 1. Generate docker compose recipe
|
635
|
+
compose_yaml = generate_compose_config(
|
636
|
+
role="server",
|
637
|
+
node_name=socket.gethostname(),
|
638
|
+
ip_address=ip_address,
|
639
|
+
node_labels=node_labels,
|
640
|
+
is_public=location is not None
|
641
|
+
)
|
642
|
+
|
643
|
+
# Generate helmfile recipe
|
644
|
+
helm_yaml = load_template(
|
645
|
+
template_path=HELM_APPS_FILE,
|
646
|
+
values=values,
|
647
|
+
default_values_path=app_values,
|
648
|
+
force_defaults=True)
|
649
|
+
with open(USER_HELM_APPS_FILE, "w") as f:
|
650
|
+
f.write(helm_yaml)
|
651
|
+
|
652
|
+
console.log("[green]Config files have been generated in your local machine\n")
|
653
|
+
|
654
|
+
# # 1. start server
|
655
|
+
console.log("Deploying seed...")
|
656
|
+
CLUSTER.start_seed_node()
|
657
|
+
|
658
|
+
while not CLUSTER.is_agent_running():
|
659
|
+
console.log("Waiting for seed to start...")
|
660
|
+
time.sleep(10)
|
661
|
+
|
662
|
+
console.log("Setting pool dependencies...")
|
663
|
+
# set template values in helmfile
|
664
|
+
try:
|
665
|
+
CLUSTER.update_dependencies(
|
666
|
+
dependencies_file=USER_HELM_APPS_FILE
|
667
|
+
)
|
668
|
+
except Exception as e:
|
669
|
+
console.log(f"Error: {str(e)}")
|
670
|
+
exit()
|
671
|
+
console.log("[green]Your pool is ready! Grow it by sharing your joining token with others. Run [yellow]kalavai pool token[green] to generate one.")
|
672
|
+
|
673
|
+
if location is not None:
|
674
|
+
# register with kalavai if it's a public cluster
|
675
|
+
console.log("Registering public cluster with Kalavai...")
|
676
|
+
pool__publish()
|
677
|
+
|
678
|
+
# wait until the server is ready to create objects
|
679
|
+
while True:
|
680
|
+
console.log("Waiting for core services to be ready, may take a few minutes...")
|
681
|
+
time.sleep(30)
|
682
|
+
if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
|
683
|
+
break
|
684
|
+
console.log("Initialise user workspace...")
|
685
|
+
pool_init(pool_config_values_path=pool_config_values)
|
686
|
+
# init default namespace
|
687
|
+
init_user_workspace(force_namespace="default")
|
688
|
+
if only_registered_users:
|
689
|
+
# init user namespace
|
690
|
+
init_user_workspace()
|
691
|
+
|
692
|
+
return None
|
693
|
+
|
694
|
+
|
695
|
+
@arguably.command
|
696
|
+
def pool__token(*others, admin=False, user=False, worker=False):
|
697
|
+
"""
|
698
|
+
Generate a join token for others to connect to your pool
|
699
|
+
"""
|
700
|
+
try:
|
701
|
+
CLUSTER.validate_cluster()
|
702
|
+
except Exception as e:
|
703
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
704
|
+
return
|
705
|
+
|
706
|
+
if not admin and not user and not worker:
|
707
|
+
console.log(f"[red]Select at least one mode (--admin, --user or --worker)")
|
708
|
+
return
|
709
|
+
|
710
|
+
if admin:
|
711
|
+
auth_key = load_server_info(data_key=AUTH_KEY, file=USER_LOCAL_SERVER_FILE)
|
712
|
+
elif user:
|
713
|
+
auth_key = load_server_info(data_key=WRITE_AUTH_KEY, file=USER_LOCAL_SERVER_FILE)
|
714
|
+
else:
|
715
|
+
auth_key = load_server_info(data_key=READONLY_AUTH_KEY, file=USER_LOCAL_SERVER_FILE)
|
716
|
+
|
717
|
+
watcher_service = load_server_info(data_key=WATCHER_SERVICE_KEY, file=USER_LOCAL_SERVER_FILE)
|
718
|
+
public_location = load_server_info(data_key=PUBLIC_LOCATION_KEY, file=USER_LOCAL_SERVER_FILE)
|
719
|
+
|
720
|
+
cluster_token = CLUSTER.get_cluster_token()
|
721
|
+
|
722
|
+
ip_address = load_server_info(SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)
|
723
|
+
cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
|
724
|
+
|
725
|
+
join_token = generate_join_token(
|
726
|
+
cluster_ip=ip_address,
|
727
|
+
cluster_name=cluster_name,
|
728
|
+
cluster_token=cluster_token,
|
729
|
+
auth_key=auth_key,
|
730
|
+
watcher_service=watcher_service,
|
731
|
+
public_location=public_location
|
732
|
+
)
|
733
|
+
|
734
|
+
console.log("[green]Join token:")
|
735
|
+
print(join_token)
|
736
|
+
|
737
|
+
return join_token
|
738
|
+
|
739
|
+
@arguably.command
|
740
|
+
def pool__check_token(token, *others, public=False):
|
741
|
+
"""
|
742
|
+
Utility to check the validity of a join token
|
743
|
+
"""
|
744
|
+
try:
|
745
|
+
data = decode_dict(token)
|
746
|
+
for field in MANDATORY_TOKEN_FIELDS:
|
747
|
+
assert field in data
|
748
|
+
if public:
|
749
|
+
if data[PUBLIC_LOCATION_KEY] is None:
|
750
|
+
raise ValueError("Token is not valid for public pools. Did you start the cluster with a public_location?")
|
751
|
+
console.log("[green]Token format is correct")
|
752
|
+
return True
|
753
|
+
except Exception as e:
|
754
|
+
console.log(f"[white]{str(e)}")
|
755
|
+
console.log("[red]Token is invalid.")
|
756
|
+
return False
|
757
|
+
|
758
|
+
|
759
|
+
@arguably.command
|
760
|
+
def pool__join(token, *others, node_name=None, ip_address: str=None):
|
761
|
+
"""
|
762
|
+
Join Kalavai pool and start/resume sharing resources.
|
763
|
+
|
764
|
+
Args:
|
765
|
+
*others: all the other positional arguments go here
|
766
|
+
"""
|
767
|
+
|
768
|
+
if not check_worker_compatibility():
|
769
|
+
return
|
770
|
+
|
771
|
+
# check that k3s is not running already in the host
|
772
|
+
# k3s service running or preinstalled
|
773
|
+
if CLUSTER.is_agent_running():
|
774
|
+
console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
|
775
|
+
return
|
776
|
+
# check that is not attached to another instance
|
777
|
+
if os.path.exists(USER_LOCAL_SERVER_FILE):
|
778
|
+
option = user_confirm(
|
779
|
+
question="You seem to be connected to an instance already. Are you sure you want to join a new one?",
|
780
|
+
options=["no", "yes"]
|
781
|
+
)
|
782
|
+
if option == 0:
|
783
|
+
console.log("[green]Nothing happened.")
|
784
|
+
return
|
785
|
+
|
786
|
+
if node_name is None:
|
787
|
+
node_name = socket.gethostname()
|
788
|
+
|
789
|
+
# check token
|
790
|
+
if not pool__check_token(token):
|
791
|
+
return
|
792
|
+
|
793
|
+
try:
|
794
|
+
data = decode_dict(token)
|
795
|
+
kalavai_seed_ip = data[CLUSTER_IP_KEY]
|
796
|
+
kalavai_token = data[CLUSTER_TOKEN_KEY]
|
797
|
+
cluster_name = data[CLUSTER_NAME_KEY]
|
798
|
+
auth_key = data[AUTH_KEY]
|
799
|
+
watcher_service = data[WATCHER_SERVICE_KEY]
|
800
|
+
public_location = data[PUBLIC_LOCATION_KEY]
|
801
|
+
vpn = defaultdict(lambda: None)
|
802
|
+
except Exception as e:
|
803
|
+
console.log(str(e))
|
804
|
+
console.log("[red] Invalid token")
|
805
|
+
return
|
806
|
+
|
807
|
+
# join private network if provided
|
808
|
+
node_labels = {
|
809
|
+
STORAGE_CLASS_LABEL: is_storage_compatible()
|
810
|
+
}
|
811
|
+
user = defaultdict(lambda: None)
|
812
|
+
if public_location is not None:
|
813
|
+
console.log("Joining private network")
|
814
|
+
try:
|
815
|
+
if not check_vpn_compatibility():
|
816
|
+
return
|
817
|
+
vpn = join_vpn(
|
818
|
+
location=public_location,
|
819
|
+
user_cookie=USER_COOKIE)
|
820
|
+
user = user_login(user_cookie=USER_COOKIE)
|
821
|
+
node_labels[USER_NODE_LABEL] = user["username"]
|
822
|
+
except Exception as e:
|
823
|
+
console.log(f"[red]Error when joining network: {str(e)}")
|
824
|
+
console.log("Are you authenticated? Try [yellow]kalavai login")
|
825
|
+
return
|
826
|
+
# validate public seed
|
827
|
+
try:
|
828
|
+
validate_join_public_seed(
|
829
|
+
cluster_name=cluster_name,
|
830
|
+
join_key=token,
|
831
|
+
user_cookie=USER_COOKIE
|
832
|
+
)
|
833
|
+
except Exception as e:
|
834
|
+
console.log(f"[red]Error when joining network: {str(e)}")
|
835
|
+
leave_vpn(vpn_file=USER_VPN_COMPOSE_FILE)
|
836
|
+
return
|
837
|
+
|
838
|
+
# send note to server to let them know the node is coming online
|
839
|
+
if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
|
840
|
+
console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with '--node-name'")
|
841
|
+
leave_vpn(vpn_file=USER_VPN_COMPOSE_FILE)
|
842
|
+
return
|
843
|
+
|
844
|
+
if ip_address is None:
|
845
|
+
console.log(f"Scanning for valid IPs (subnet {vpn['subnet']})...")
|
846
|
+
ip_address = select_ip_address(subnet=vpn["subnet"])
|
847
|
+
console.log(f"Using {ip_address} address for worker")
|
848
|
+
|
849
|
+
# local agent join
|
850
|
+
# 1. Generate local cache files
|
851
|
+
console.log("Generating config files...")
|
852
|
+
compose_yaml = generate_compose_config(
|
853
|
+
role="agent",
|
854
|
+
server=f"https://{kalavai_seed_ip}:6443",
|
855
|
+
token=kalavai_token,
|
856
|
+
node_name=socket.gethostname(),
|
857
|
+
ip_address=ip_address,
|
858
|
+
node_labels=node_labels,
|
859
|
+
is_public=public_location is not None)
|
860
|
+
store_server_info(
|
861
|
+
server_ip=kalavai_seed_ip,
|
862
|
+
auth_key=auth_key,
|
863
|
+
file=USER_LOCAL_SERVER_FILE,
|
864
|
+
watcher_service=watcher_service,
|
865
|
+
node_name=node_name,
|
866
|
+
cluster_name=cluster_name,
|
867
|
+
public_location=public_location,
|
868
|
+
user_api_key=user["api_key"])
|
869
|
+
|
870
|
+
init_user_workspace()
|
871
|
+
|
872
|
+
option = user_confirm(
|
873
|
+
question="Docker compose ready. Would you like Kalavai to deploy it?",
|
874
|
+
options=["no", "yes"]
|
875
|
+
)
|
876
|
+
if option == 0:
|
877
|
+
console.log("Manually deploy the worker with the following command:\n")
|
878
|
+
print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
|
879
|
+
return
|
880
|
+
|
881
|
+
console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
|
882
|
+
try:
|
883
|
+
CLUSTER.start_worker_node()
|
884
|
+
except Exception as e:
|
885
|
+
console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
|
886
|
+
leave_vpn(vpn_file=USER_VPN_COMPOSE_FILE)
|
887
|
+
exit()
|
888
|
+
|
889
|
+
while not CLUSTER.is_agent_running():
|
890
|
+
console.log("Waiting for worker to start...")
|
891
|
+
time.sleep(10)
|
892
|
+
|
893
|
+
# set status to schedulable
|
894
|
+
console.log(f"[green] You are connected to {cluster_name}")
|
895
|
+
|
896
|
+
@arguably.command
|
897
|
+
def pool__stop(*others):
|
898
|
+
"""
|
899
|
+
Stop sharing your device and clean up. DO THIS ONLY IF YOU WANT TO REMOVE KALAVAI-CLIENT from your device.
|
900
|
+
|
901
|
+
Args:
|
902
|
+
*others: all the other positional arguments go here
|
903
|
+
"""
|
904
|
+
console.log("[white] Stopping kalavai app...")
|
905
|
+
# delete local node from server
|
906
|
+
node__delete(load_server_info(data_key=NODE_NAME_KEY, file=USER_LOCAL_SERVER_FILE))
|
907
|
+
# unpublish event (only if seed node)
|
908
|
+
# TODO: no, this should be done via the platform!!!
|
909
|
+
# try:
|
910
|
+
# if CLUSTER.is_seed_node():
|
911
|
+
# console.log("Unregistering pool...")
|
912
|
+
# unregister_cluster(
|
913
|
+
# name=load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE),
|
914
|
+
# user_cookie=USER_COOKIE)
|
915
|
+
# except Exception as e:
|
916
|
+
# console.log(f"[red][WARNING]: (ignore if not a public pool) Error when unpublishing cluster. {str(e)}")
|
917
|
+
# remove local node agent
|
918
|
+
console.log("Removing agent and local cache")
|
919
|
+
CLUSTER.remove_agent()
|
920
|
+
# clean local files
|
921
|
+
cleanup_local()
|
922
|
+
console.log("[white] Kalavai has stopped sharing your resources. Use [yellow]kalavai pool start[white] or [yellow]kalavai pool join[white] to start again!")
|
923
|
+
|
924
|
+
@arguably.command
|
925
|
+
def pool__pause(*others):
|
926
|
+
"""
|
927
|
+
Pause sharing your device and make your device unavailable for kalavai scheduling.
|
928
|
+
|
929
|
+
Args:
|
930
|
+
*others: all the other positional arguments go here
|
931
|
+
"""
|
932
|
+
# k3s stop locally
|
933
|
+
console.log("[white] Pausing kalavai app...")
|
934
|
+
success = CLUSTER.pause_agent()
|
935
|
+
if success:
|
936
|
+
console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
|
937
|
+
else:
|
938
|
+
console.log("[red] Error when stopping. Please run [yellow]kalavai pool pause[red] again.")
|
939
|
+
|
940
|
+
@arguably.command
|
941
|
+
def pool__resume(*others):
|
942
|
+
"""
|
943
|
+
Resume sharing your device and make device available for kalavai scheduling.
|
944
|
+
|
945
|
+
Args:
|
946
|
+
*others: all the other positional arguments go here
|
947
|
+
"""
|
948
|
+
# k3s stop locally
|
949
|
+
if not CLUSTER.is_cluster_init():
|
950
|
+
console.log("[red] Kalavai app was not started before, please run [yellow]kalavai pool start[red] to start a pool or [yellow]kalavai pool join[red] to join one first")
|
951
|
+
return
|
952
|
+
console.log("[white] Restarting sharing (may take a few minutes)...")
|
953
|
+
if CLUSTER.restart_agent():
|
954
|
+
console.log("[white] Kalava sharing resumed")
|
955
|
+
else:
|
956
|
+
console.log("[red] Error when restarting. Please run [yellow]kalavai pool resume[white] again.")
|
957
|
+
|
958
|
+
|
959
|
+
@arguably.command
|
960
|
+
def pool__gpus(*others, available=False):
|
961
|
+
"""
|
962
|
+
Display GPU information from all connected nodes
|
963
|
+
"""
|
964
|
+
try:
|
965
|
+
CLUSTER.validate_cluster()
|
966
|
+
except Exception as e:
|
967
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
968
|
+
return
|
969
|
+
|
970
|
+
try:
|
971
|
+
data = fetch_gpus()
|
972
|
+
columns, rows = [], []
|
973
|
+
for node, gpus in data:
|
974
|
+
row_gpus = []
|
975
|
+
for gpu in gpus["gpus"]:
|
976
|
+
status = gpu["ready"] if "ready" in gpu else True
|
977
|
+
if available and not status:
|
978
|
+
continue
|
979
|
+
row_gpus.append( (f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
|
980
|
+
if len(row_gpus) > 0:
|
981
|
+
models, statuses = zip(*row_gpus)
|
982
|
+
rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
|
983
|
+
|
984
|
+
columns = ["Ready", "GPU(s)", "Available", "Total"]
|
985
|
+
columns = ["Node"] + columns
|
986
|
+
console.print(
|
987
|
+
generate_table(columns=columns, rows=rows,end_sections=[n for n in range(len(rows))])
|
988
|
+
)
|
989
|
+
|
990
|
+
except Exception as e:
|
991
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
992
|
+
|
993
|
+
|
994
|
+
@arguably.command
|
995
|
+
def pool__resources(*others):
|
996
|
+
"""
|
997
|
+
Display information about resources on the pool
|
998
|
+
"""
|
999
|
+
try:
|
1000
|
+
CLUSTER.validate_cluster()
|
1001
|
+
except Exception as e:
|
1002
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1003
|
+
return
|
1004
|
+
|
1005
|
+
try:
|
1006
|
+
total = request_to_server(
|
1007
|
+
method="get",
|
1008
|
+
endpoint="/v1/get_cluster_total_resources",
|
1009
|
+
data={},
|
1010
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1011
|
+
user_cookie=USER_COOKIE
|
1012
|
+
)
|
1013
|
+
available = request_to_server(
|
1014
|
+
method="get",
|
1015
|
+
endpoint="/v1/get_cluster_available_resources",
|
1016
|
+
data={},
|
1017
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1018
|
+
user_cookie=USER_COOKIE
|
1019
|
+
)
|
1020
|
+
columns = []
|
1021
|
+
total_values = []
|
1022
|
+
available_values = []
|
1023
|
+
for col in total.keys():
|
1024
|
+
if col in RESOURCE_EXCLUDE:
|
1025
|
+
continue
|
1026
|
+
columns.append(col)
|
1027
|
+
total_values.append(str(total[col]))
|
1028
|
+
available_values.append(str(available[col]))
|
1029
|
+
|
1030
|
+
columns = [""] + columns
|
1031
|
+
total_values = ["Total"] + total_values
|
1032
|
+
available_values = ["Available"] + available_values
|
1033
|
+
|
1034
|
+
rows = [
|
1035
|
+
tuple(available_values),
|
1036
|
+
tuple(total_values)
|
1037
|
+
]
|
1038
|
+
console.print(
|
1039
|
+
generate_table(columns=columns, rows=rows, end_sections=[0, 1])
|
1040
|
+
)
|
1041
|
+
|
1042
|
+
except Exception as e:
|
1043
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1044
|
+
|
1045
|
+
@arguably.command
|
1046
|
+
def pool__update(*others):
|
1047
|
+
"""
|
1048
|
+
Update kalavai pool
|
1049
|
+
"""
|
1050
|
+
try:
|
1051
|
+
CLUSTER.validate_cluster()
|
1052
|
+
except Exception as e:
|
1053
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1054
|
+
return
|
1055
|
+
|
1056
|
+
if not CLUSTER.is_seed_node():
|
1057
|
+
console.log("You can only update a pool from the seed node.")
|
1058
|
+
return
|
1059
|
+
|
1060
|
+
# update dependencies
|
1061
|
+
try:
|
1062
|
+
CLUSTER.update_dependencies(debug=True)
|
1063
|
+
console.log("Pool updating. Expect some downtime on core services")
|
1064
|
+
except Exception as e:
|
1065
|
+
console.log(f"[red]Error when updating pool: {str(e)}")
|
1066
|
+
return
|
1067
|
+
|
1068
|
+
|
1069
|
+
@arguably.command
|
1070
|
+
def pool__status(*others, log_file=None):
|
1071
|
+
"""
|
1072
|
+
Run diagnostics on a local installation of kalavai
|
1073
|
+
* is pool installed
|
1074
|
+
* is agent running
|
1075
|
+
* is kube-watcher running
|
1076
|
+
* is lws running
|
1077
|
+
"""
|
1078
|
+
logs = []
|
1079
|
+
|
1080
|
+
logs.append("Getting deployment status...")
|
1081
|
+
|
1082
|
+
if CLUSTER.is_seed_node():
|
1083
|
+
# seed node
|
1084
|
+
data = CLUSTER.diagnostics()
|
1085
|
+
logs.append(data)
|
1086
|
+
else:
|
1087
|
+
# worker node
|
1088
|
+
logs.append("Could not access node info. This info is only available to seed nodes. Ignore if you are on a worker node.")
|
1089
|
+
logs.append(f"Worker installed: {CLUSTER.is_cluster_init()}")
|
1090
|
+
|
1091
|
+
logs.append(f"Worker running: {CLUSTER.is_agent_running()}")
|
1092
|
+
|
1093
|
+
logs.append(f"Pool credentials present: {CLUSTER.validate_cluster()}")
|
1094
|
+
|
1095
|
+
if log_file is not None:
|
1096
|
+
with open(log_file, "w") as f:
|
1097
|
+
for log in logs:
|
1098
|
+
f.write(log)
|
1099
|
+
f.write("\n")
|
1100
|
+
console.log(f"[green]Logs written to {log_file}")
|
1101
|
+
else:
|
1102
|
+
for log in logs:
|
1103
|
+
console.log(f"{log}\n")
|
1104
|
+
|
1105
|
+
@arguably.command
|
1106
|
+
def pool__attach(token, *others, node_name=None):
|
1107
|
+
"""
|
1108
|
+
Set creds in token on the local instance
|
1109
|
+
"""
|
1110
|
+
if os.path.exists(USER_LOCAL_SERVER_FILE):
|
1111
|
+
option = user_confirm(
|
1112
|
+
question="You seem to be connected to an instance already. Are you sure you want to join a new one?",
|
1113
|
+
options=["no", "yes"]
|
1114
|
+
)
|
1115
|
+
if option == 0:
|
1116
|
+
console.log("[green]Nothing happened.")
|
1117
|
+
return
|
1118
|
+
try:
|
1119
|
+
data = decode_dict(token)
|
1120
|
+
kalavai_seed_ip = data[CLUSTER_IP_KEY]
|
1121
|
+
kalavai_token = data[CLUSTER_TOKEN_KEY]
|
1122
|
+
cluster_name = data[CLUSTER_NAME_KEY]
|
1123
|
+
auth_key = data[AUTH_KEY]
|
1124
|
+
watcher_service = data[WATCHER_SERVICE_KEY]
|
1125
|
+
public_location = data[PUBLIC_LOCATION_KEY]
|
1126
|
+
except:
|
1127
|
+
console.log("[red]Error when parsing token. Invalid token")
|
1128
|
+
return
|
1129
|
+
|
1130
|
+
user = defaultdict(lambda: None)
|
1131
|
+
if public_location is not None:
|
1132
|
+
console.log("Joining private network")
|
1133
|
+
try:
|
1134
|
+
if not check_vpn_compatibility():
|
1135
|
+
return
|
1136
|
+
vpn = join_vpn(
|
1137
|
+
location=public_location,
|
1138
|
+
user_cookie=USER_COOKIE)
|
1139
|
+
user = user_login(user_cookie=USER_COOKIE)
|
1140
|
+
time.sleep(5)
|
1141
|
+
except Exception as e:
|
1142
|
+
console.log(f"[red]Error when joining network: {str(e)}")
|
1143
|
+
console.log("Are you authenticated? Try [yellow]kalavai login")
|
1144
|
+
return
|
1145
|
+
# validate public seed
|
1146
|
+
try:
|
1147
|
+
validate_join_public_seed(
|
1148
|
+
cluster_name=cluster_name,
|
1149
|
+
join_key=token,
|
1150
|
+
user_cookie=USER_COOKIE
|
1151
|
+
)
|
1152
|
+
except Exception as e:
|
1153
|
+
console.log(f"[red]Error when joining network: {str(e)}")
|
1154
|
+
leave_vpn(vpn_file=USER_VPN_COMPOSE_FILE)
|
1155
|
+
return
|
1156
|
+
|
1157
|
+
store_server_info(
|
1158
|
+
server_ip=kalavai_seed_ip,
|
1159
|
+
auth_key=auth_key,
|
1160
|
+
file=USER_LOCAL_SERVER_FILE,
|
1161
|
+
watcher_service=watcher_service,
|
1162
|
+
node_name=node_name,
|
1163
|
+
cluster_name=cluster_name,
|
1164
|
+
public_location=public_location,
|
1165
|
+
user_api_key=user["api_key"])
|
1166
|
+
|
1167
|
+
console.log(f"[green]You are now connected to {cluster_name} @ {kalavai_seed_ip}")
|
1168
|
+
|
1169
|
+
|
1170
|
+
@arguably.command
|
1171
|
+
def storage__create(name, storage, *others, force_namespace: str=None):
|
1172
|
+
"""
|
1173
|
+
Create storage for the cluster
|
1174
|
+
"""
|
1175
|
+
try:
|
1176
|
+
CLUSTER.validate_cluster()
|
1177
|
+
except Exception as e:
|
1178
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1179
|
+
return
|
1180
|
+
|
1181
|
+
if force_namespace is not None:
|
1182
|
+
console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
|
1183
|
+
|
1184
|
+
# Deploy PVC
|
1185
|
+
data = {
|
1186
|
+
"name": name,
|
1187
|
+
"labels": {
|
1188
|
+
PVC_NAME_LABEL: name,
|
1189
|
+
"kalavai.resource": "storage"
|
1190
|
+
},
|
1191
|
+
"access_modes": STORAGE_ACCESS_MODE,
|
1192
|
+
"storage_class_name": STORAGE_CLASS_NAME,
|
1193
|
+
"storage_size": storage
|
1194
|
+
}
|
1195
|
+
if force_namespace is not None:
|
1196
|
+
data["force_namespace"] = force_namespace
|
1197
|
+
|
1198
|
+
try:
|
1199
|
+
result = request_to_server(
|
1200
|
+
method="post",
|
1201
|
+
endpoint="/v1/deploy_storage_claim",
|
1202
|
+
data=data,
|
1203
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1204
|
+
user_cookie=USER_COOKIE
|
1205
|
+
)
|
1206
|
+
if "error" in result or "detail" in result:
|
1207
|
+
console.log(f"Error: {result}")
|
1208
|
+
else:
|
1209
|
+
console.log(f"Storage {name} ({storage}Gi) created")
|
1210
|
+
except Exception as e:
|
1211
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1212
|
+
|
1213
|
+
|
1214
|
+
@arguably.command
|
1215
|
+
def storage__list(*other):
|
1216
|
+
"""
|
1217
|
+
List existing storages deployed in the pool
|
1218
|
+
"""
|
1219
|
+
try:
|
1220
|
+
CLUSTER.validate_cluster()
|
1221
|
+
except Exception as e:
|
1222
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1223
|
+
return
|
1224
|
+
|
1225
|
+
try:
|
1226
|
+
user = load_user_session(user_cookie=USER_COOKIE)
|
1227
|
+
username = user["username"] if user is not None else None
|
1228
|
+
result = request_to_server(
|
1229
|
+
method="post",
|
1230
|
+
endpoint="/v1/get_storage_usage",
|
1231
|
+
data={},
|
1232
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1233
|
+
user_cookie=USER_COOKIE
|
1234
|
+
)
|
1235
|
+
|
1236
|
+
columns = []
|
1237
|
+
rows = []
|
1238
|
+
for namespace, storages in result.items():
|
1239
|
+
for name, values in storages.items():
|
1240
|
+
if namespace == username:
|
1241
|
+
namespace = f"**{namespace}**"
|
1242
|
+
columns = list(values.keys())
|
1243
|
+
rows.append([namespace, name] + [f"{v:.2f} MB" if "capacity" in k else str(v) for k, v in values.items()])
|
1244
|
+
|
1245
|
+
if len(rows) == 0:
|
1246
|
+
console.log("[green] Storages have not been claimed yet (did you deploy any job using them?)")
|
1247
|
+
return
|
1248
|
+
columns = ["Owner", "Name"] + columns
|
1249
|
+
table = generate_table(columns=columns, rows=rows)
|
1250
|
+
console.log(table)
|
1251
|
+
|
1252
|
+
except Exception as e:
|
1253
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1254
|
+
|
1255
|
+
@arguably.command
|
1256
|
+
def storage__delete(name, *others, force_namespace: str=None):
|
1257
|
+
"""
|
1258
|
+
Delete storage by name
|
1259
|
+
"""
|
1260
|
+
try:
|
1261
|
+
CLUSTER.validate_cluster()
|
1262
|
+
except Exception as e:
|
1263
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1264
|
+
return
|
1265
|
+
|
1266
|
+
if force_namespace is not None:
|
1267
|
+
console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
|
1268
|
+
|
1269
|
+
# deploy template with kube-watcher
|
1270
|
+
data = {
|
1271
|
+
"label": PVC_NAME_LABEL,
|
1272
|
+
"value": name
|
1273
|
+
}
|
1274
|
+
if force_namespace is not None:
|
1275
|
+
data["force_namespace"] = force_namespace
|
1276
|
+
try:
|
1277
|
+
result = request_to_server(
|
1278
|
+
method="post",
|
1279
|
+
endpoint="/v1/delete_labeled_resources",
|
1280
|
+
data=data,
|
1281
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1282
|
+
user_cookie=USER_COOKIE
|
1283
|
+
)
|
1284
|
+
console.log(f"{result}")
|
1285
|
+
except Exception as e:
|
1286
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1287
|
+
|
1288
|
+
@arguably.command
|
1289
|
+
def node__list(*others):
|
1290
|
+
"""
|
1291
|
+
Display information about nodes connected
|
1292
|
+
"""
|
1293
|
+
try:
|
1294
|
+
CLUSTER.validate_cluster()
|
1295
|
+
except Exception as e:
|
1296
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1297
|
+
return
|
1298
|
+
|
1299
|
+
try:
|
1300
|
+
data = request_to_server(
|
1301
|
+
method="get",
|
1302
|
+
endpoint="/v1/get_nodes",
|
1303
|
+
data={},
|
1304
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1305
|
+
user_cookie=USER_COOKIE
|
1306
|
+
)
|
1307
|
+
rows = []
|
1308
|
+
columns = ["Node name"]
|
1309
|
+
for node, status in data.items():
|
1310
|
+
row = [node]
|
1311
|
+
for key, value in status.items():
|
1312
|
+
if key not in columns:
|
1313
|
+
columns.append(key)
|
1314
|
+
row.append(str(value))
|
1315
|
+
rows.append(tuple(row))
|
1316
|
+
|
1317
|
+
console.log("Nodes with 'unschedulable=True' will not receive workload")
|
1318
|
+
console.log("To make a node unschedulable (i.e. won't receive workloads) use [yellow]kalavai node cordon <node name>")
|
1319
|
+
console.log("To make a node schedulable (i.e. will receive workloads) use [yellow]kalavai node uncordon <node name>")
|
1320
|
+
console.print(
|
1321
|
+
generate_table(columns=columns, rows=rows)
|
1322
|
+
)
|
1323
|
+
|
1324
|
+
except Exception as e:
|
1325
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1326
|
+
|
1327
|
+
|
1328
|
+
@arguably.command
|
1329
|
+
def node__delete(name, *others):
|
1330
|
+
"""
|
1331
|
+
Delete a node from the cluster
|
1332
|
+
"""
|
1333
|
+
try:
|
1334
|
+
CLUSTER.validate_cluster()
|
1335
|
+
except Exception as e:
|
1336
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1337
|
+
return
|
1338
|
+
|
1339
|
+
data = {
|
1340
|
+
"node_names": [name]
|
1341
|
+
}
|
1342
|
+
try:
|
1343
|
+
result = request_to_server(
|
1344
|
+
method="post",
|
1345
|
+
endpoint="/v1/delete_nodes",
|
1346
|
+
data=data,
|
1347
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1348
|
+
user_cookie=USER_COOKIE
|
1349
|
+
)
|
1350
|
+
if result is None or result is True:
|
1351
|
+
console.log(f"Node {name} deleted successfully")
|
1352
|
+
else:
|
1353
|
+
console.log(f"{result}")
|
1354
|
+
except Exception as e:
|
1355
|
+
console.log(f"[yellow](ignore if stopping worker from dead server). Error when removing node {name}: {str(e)}")
|
1356
|
+
|
1357
|
+
|
1358
|
+
@arguably.command
|
1359
|
+
def node__cordon(node_name, *others):
|
1360
|
+
"""
|
1361
|
+
Cordon a particular node so no more work will be scheduled on it
|
1362
|
+
"""
|
1363
|
+
try:
|
1364
|
+
CLUSTER.validate_cluster()
|
1365
|
+
except Exception as e:
|
1366
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1367
|
+
return
|
1368
|
+
set_schedulable(schedulable=False, node_name=node_name)
|
1369
|
+
|
1370
|
+
|
1371
|
+
@arguably.command
|
1372
|
+
def node__uncordon(node_name, *others):
|
1373
|
+
"""
|
1374
|
+
Uncordon a particular node to allow more work to be scheduled on it
|
1375
|
+
"""
|
1376
|
+
try:
|
1377
|
+
CLUSTER.validate_cluster()
|
1378
|
+
except Exception as e:
|
1379
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1380
|
+
return
|
1381
|
+
set_schedulable(schedulable=True, node_name=node_name)
|
1382
|
+
|
1383
|
+
|
1384
|
+
@arguably.command
|
1385
|
+
def job__templates(*others):
|
1386
|
+
"""
|
1387
|
+
Job templates integrated with kalavai. Use env var LOCAL_TEMPLATES_DIR to test local templates
|
1388
|
+
"""
|
1389
|
+
try:
|
1390
|
+
CLUSTER.validate_cluster()
|
1391
|
+
except Exception as e:
|
1392
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1393
|
+
return
|
1394
|
+
|
1395
|
+
try:
|
1396
|
+
result = request_to_server(
|
1397
|
+
method="get",
|
1398
|
+
endpoint="/v1/get_job_templates",
|
1399
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1400
|
+
data=None,
|
1401
|
+
user_cookie=USER_COOKIE
|
1402
|
+
)
|
1403
|
+
console.log("Templates available in the pool")
|
1404
|
+
console.log(result)
|
1405
|
+
except Exception as e:
|
1406
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1407
|
+
|
1408
|
+
|
1409
|
+
@arguably.command
|
1410
|
+
def job__run(template_name, *others, values: str=None, force_namespace: str=None):
|
1411
|
+
"""
|
1412
|
+
Deploy and run a template job.
|
1413
|
+
|
1414
|
+
Args:
|
1415
|
+
*others: all the other positional arguments go here
|
1416
|
+
"""
|
1417
|
+
try:
|
1418
|
+
CLUSTER.validate_cluster()
|
1419
|
+
except Exception as e:
|
1420
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1421
|
+
return
|
1422
|
+
|
1423
|
+
if force_namespace is not None:
|
1424
|
+
console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
|
1425
|
+
|
1426
|
+
if values is None:
|
1427
|
+
values_dict = {}
|
1428
|
+
else:
|
1429
|
+
if not Path(values).is_file():
|
1430
|
+
console.log(f"[red]Values file {values} was not found")
|
1431
|
+
|
1432
|
+
        with open(values, "r") as f:
            raw_values = yaml.load(f, Loader=yaml.SafeLoader)
            values_dict = {variable["name"]: variable['value'] for variable in raw_values}

    # Inject hardware information if not present in the template
    def generate_gpu_annotation(input_message, values, value_key, annotation_key):
        if value_key not in values:
            selection = select_gpus(message=input_message)
        else:
            selection = values[value_key]
        if selection is not None:
            values[value_key] = f"{annotation_key}: {selection}"
        else:
            values[value_key] = ""
    GPU_TYPES_KEY = "use_gputype"
    GPU_NOTYPES_KEY = "nouse_gputype"
    console.log("Checking current GPU stock...")
    generate_gpu_annotation(
        input_message="SELECT Target GPUs for the job (loading models)",
        values=values_dict,
        value_key=GPU_TYPES_KEY,
        annotation_key="nvidia.com/use-gputype"
    )
    generate_gpu_annotation(
        input_message="AVOID Target GPUs for the job (loading models)",
        values=values_dict,
        value_key=GPU_NOTYPES_KEY,
        annotation_key="nvidia.com/nouse-gputype"
    )

    # deploy template with kube-watcher
    data = {
        "template": template_name,
        "template_values": values_dict
    }
    if force_namespace is not None:
        data["force_namespace"] = force_namespace

    try:
        result = request_to_server(
            method="post",
            endpoint="/v1/deploy_job",
            data=data,
            server_creds=USER_LOCAL_SERVER_FILE,
            user_cookie=USER_COOKIE
        )
        console.log(f"[green]{template_name} job deployed")
    except Exception as e:
        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
        return


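# Illustrative invocation of the command above (template and file names are placeholders):
#   $ kalavai job run vllm --values values.yaml
# When GPUs are selected (interactively, or via the use_gputype/nouse_gputype values), the injected
# value takes the form "nvidia.com/use-gputype: <selection>" or "nvidia.com/nouse-gputype: <selection>",
# which the job template is expected to consume when scheduling workers.
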
@arguably.command
def job__defaults(template_name, *others):
    """
    Fetch default values.yaml for a template job
    """
    try:
        CLUSTER.validate_cluster()
    except Exception as e:
        console.log(f"[red]Problems with your pool: {str(e)}")
        return

    # fetch the default values for the template via kube-watcher
    data = {
        "template": template_name
    }
    try:
        result = request_to_server(
            method="get",
            endpoint="/v1/job_defaults",
            data=data,
            server_creds=USER_LOCAL_SERVER_FILE,
            user_cookie=USER_COOKIE
        )
        print(
            json.dumps(result, indent=3)
        )
    except Exception as e:
        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")


+
@arguably.command
|
1515
|
+
def job__delete(name, *others, force_namespace: str=None):
|
1516
|
+
"""
|
1517
|
+
Delete job in the cluster
|
1518
|
+
"""
|
1519
|
+
try:
|
1520
|
+
CLUSTER.validate_cluster()
|
1521
|
+
except Exception as e:
|
1522
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1523
|
+
return
|
1524
|
+
|
1525
|
+
if force_namespace is not None:
|
1526
|
+
console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
|
1527
|
+
|
1528
|
+
# deploy template with kube-watcher
|
1529
|
+
data = {
|
1530
|
+
"label": TEMPLATE_LABEL, # this ensures that both lws template and services are deleted
|
1531
|
+
"value": name
|
1532
|
+
}
|
1533
|
+
if force_namespace is not None:
|
1534
|
+
data["force_namespace"] = force_namespace
|
1535
|
+
try:
|
1536
|
+
result = request_to_server(
|
1537
|
+
method="post",
|
1538
|
+
endpoint="/v1/delete_labeled_resources",
|
1539
|
+
data=data,
|
1540
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1541
|
+
user_cookie=USER_COOKIE
|
1542
|
+
)
|
1543
|
+
console.log(f"{result}")
|
1544
|
+
except Exception as e:
|
1545
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1546
|
+
|
1547
|
+
|
1548
|
+
@arguably.command
|
1549
|
+
def job__estimate(billion_parameters, *others, precision=32):
|
1550
|
+
"""Guesstimate of resources needed based on required memory and current resources"""
|
1551
|
+
try:
|
1552
|
+
CLUSTER.validate_cluster()
|
1553
|
+
except Exception as e:
|
1554
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1555
|
+
return
|
1556
|
+
|
1557
|
+
average_vram = 8
|
1558
|
+
required_memory = float(billion_parameters) * (precision / 8) / 1.2
|
1559
|
+
available_gpus = fetch_gpus()
|
1560
|
+
vrams = []
|
1561
|
+
for _, gpus in available_gpus:
|
1562
|
+
for model in gpus["gpus"]:
|
1563
|
+
vrams.extend([int(model["memory"])/1000] * int(gpus["capacity"]) )
|
1564
|
+
vrams = sorted(vrams, reverse=False)
|
1565
|
+
|
1566
|
+
console.log(f"There are {len(vrams)} GPUs available ({sum(vrams)}GBs)")
|
1567
|
+
console.log(f"A [yellow]{billion_parameters}B[white] model requires [yellow]~{required_memory:.2f}GB vRAM[white] at {precision}bits precision")
|
1568
|
+
|
1569
|
+
if sum(vrams) < required_memory:
|
1570
|
+
console.log("Current capacity is insufficient to host the model, but it can be scheduled for when it is!")
|
1571
|
+
console.log(f"Average devices have {average_vram}GB vRAM, use {math.ceil(required_memory/(average_vram))} GPU workers")
|
1572
|
+
else:
|
1573
|
+
current_vram = 0
|
1574
|
+
n_devices = 0
|
1575
|
+
for mem in vrams:
|
1576
|
+
current_vram += mem
|
1577
|
+
n_devices += 1
|
1578
|
+
if current_vram > required_memory:
|
1579
|
+
break
|
1580
|
+
console.log(f"Looking at current capacity, use [green]{n_devices} GPU workers[white] for a total [green]{current_vram:.2f} GB vRAM")
|
1581
|
+
|
1582
|
+
@arguably.command
|
1583
|
+
def job__status(name, *others):
|
1584
|
+
|
1585
|
+
try:
|
1586
|
+
# get pod statuses
|
1587
|
+
data = {
|
1588
|
+
"label": TEMPLATE_LABEL,
|
1589
|
+
"value": name
|
1590
|
+
}
|
1591
|
+
result = request_to_server(
|
1592
|
+
method="post",
|
1593
|
+
endpoint="/v1/get_pods_status_for_label",
|
1594
|
+
data=data,
|
1595
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1596
|
+
user_cookie=USER_COOKIE
|
1597
|
+
)
|
1598
|
+
workers_status = defaultdict(int)
|
1599
|
+
workers_conditions = {}
|
1600
|
+
for _, ss in result.items():
|
1601
|
+
for pod_name, values in ss.items():
|
1602
|
+
workers_status[values["status"]] += 1
|
1603
|
+
workers_conditions[pod_name] = values["conditions"]
|
1604
|
+
workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
|
1605
|
+
|
1606
|
+
console.log("Workers conditions")
|
1607
|
+
for worker, conditions in workers_conditions.items():
|
1608
|
+
console.log(f"[yellow]{worker}")
|
1609
|
+
console.log(conditions)
|
1610
|
+
console.log(f"[yellow]{workers}\nTotal: {len(workers_status)}")
|
1611
|
+
except Exception as e:
|
1612
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1613
|
+
return
|
1614
|
+
|
1615
|
+
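# Output sketch for the status command above (hypothetical counts): per-pod conditions are printed
# first, followed by aggregated status counts such as "Running: 3" and "Pending: 1".
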
@arguably.command
def job__list(*others, detailed=False):
    """
    List jobs in the cluster
    """
    try:
        CLUSTER.validate_cluster()
    except Exception as e:
        console.log(f"[red]Problems with your pool: {str(e)}")
        return

    data = {
        "group": "batch.volcano.sh",
        "api_version": "v1alpha1",
        "plural": "jobs"
    }
    try:
        result = request_to_server(
            method="post",
            endpoint="/v1/get_objects_of_type",
            data=data,
            server_creds=USER_LOCAL_SERVER_FILE,
            user_cookie=USER_COOKIE
        )
        all_deployments = defaultdict(list)
        for ns, ds in result.items():
            all_deployments[ns].extend([d["metadata"]["labels"][TEMPLATE_LABEL] for d in ds["items"]])
        #deployments = {ns: d["metadata"]["labels"][TEMPLATE_LABEL] for ns, ds in result.items() for d in ds["items"]}
    except Exception as e:
        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
        return
    if len(all_deployments.keys()) == 0:
        console.log("[green]No deployments found.")
        return

    columns = ["Owner", "Deployment", "Workers", "Endpoint"]
    if detailed:
        columns.append("Status")
    rows = []
    for namespace, deployments in all_deployments.items():
        for deployment in deployments:
            try:
                # get status for deployment
                if detailed:
                    data = {
                        "group": "batch.volcano.sh",
                        "api_version": "v1alpha1",
                        "plural": "jobs",
                        # "group": "leaderworkerset.x-k8s.io",
                        # "api_version": "v1",
                        # "plural": "leaderworkersets",
                        "name": deployment
                    }
                    result = request_to_server(
                        method="post",
                        endpoint="/v1/get_status_for_object",
                        data=data,
                        server_creds=USER_LOCAL_SERVER_FILE,
                        user_cookie=USER_COOKIE
                    )
                    # flatten results ({namespace: statuses})
                    ss = []
                    for values in result.values():
                        ss.extend(values)
                    if len(ss) > 0:
                        last = ss[-1]
                        statuses = f"[{last['lastTransitionTime']}] {last['status']}"
                    else:
                        statuses = "Unknown"
                # get pod statuses
                data = {
                    "label": TEMPLATE_LABEL,
                    "value": deployment
                }
                result = request_to_server(
                    method="post",
                    endpoint="/v1/get_pods_status_for_label",
                    data=data,
                    server_creds=USER_LOCAL_SERVER_FILE,
                    user_cookie=USER_COOKIE
                )
                workers_status = defaultdict(int)
                for ns, ss in result.items():
                    if ns != namespace: # same job name, different namespace
                        continue
                    for _, values in ss.items():
                        workers_status[values["status"]] += 1
                workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
                # get URL details
                data = {
                    "label": TEMPLATE_LABEL,
                    "value": deployment,
                    "types": ["NodePort"]
                }
                result = request_to_server(
                    method="post",
                    endpoint="/v1/get_ports_for_services",
                    data=data,
                    server_creds=USER_LOCAL_SERVER_FILE,
                    user_cookie=USER_COOKIE
                )
                node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]

                urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
                row = [namespace, deployment, workers, "\n".join(urls)]
                if detailed:
                    row.append(statuses)
                rows.append(row)

            except Exception as e:
                console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
                return

    console.print(
        generate_table(columns=columns, rows=rows, end_sections=range(len(rows)))
    )

    console.log("Check detailed status with [yellow]kalavai job status <name of deployment>")
    console.log("Get logs with [yellow]kalavai job logs <name of deployment> [white](note it only works when the deployment is complete)")


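# Note on the Endpoint column above: each NodePort service is rendered as
#   http://<pool server IP>:<node_port> (mapped to <port>)
# so the reachable URL is the part before the "(mapped to ...)" suffix.
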
@arguably.command
def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namespace: str=None):
    """
    Get logs for a specific job
    """
    try:
        CLUSTER.validate_cluster()
    except Exception as e:
        console.log(f"[red]Problems with your pool: {str(e)}")
        return

    if force_namespace is not None:
        console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")

    data = {
        "label": TEMPLATE_LABEL,
        "value": name,
        "tail": tail
    }
    if force_namespace is not None:
        data["force_namespace"] = force_namespace
    while True:
        try:
            # send tail as parameter (fetch only last _tail_ lines)
            result = request_to_server(
                method="post",
                endpoint="/v1/get_logs_for_label",
                data=data,
                server_creds=USER_LOCAL_SERVER_FILE,
                user_cookie=USER_COOKIE
            )
            if not stream:
                for pod, logs in result.items():
                    if pod_name is not None and pod_name != pod:
                        continue
                    console.log(f"[yellow]Pod {pod}")
                    console.log(f"[green]{logs}")
                break
            else:
                os.system("clear")
                for pod, logs in result.items():
                    if pod_name is not None and pod_name != pod:
                        continue
                    print(f"Pod {pod}")
                    print(f"{logs}")
                time.sleep(1)
        except KeyboardInterrupt:
            break
        except Exception as e:
            console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
            console.log(f"Check if {name} is running with [yellow]kalavai job list")
            return

+
@arguably.command
|
1788
|
+
def job__manifest(*others, name, force_namespace: str=None):
|
1789
|
+
"""
|
1790
|
+
Get job manifest description
|
1791
|
+
"""
|
1792
|
+
try:
|
1793
|
+
CLUSTER.validate_cluster()
|
1794
|
+
except Exception as e:
|
1795
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1796
|
+
return
|
1797
|
+
|
1798
|
+
if force_namespace is not None:
|
1799
|
+
console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
|
1800
|
+
|
1801
|
+
data = {
|
1802
|
+
"label": TEMPLATE_LABEL,
|
1803
|
+
"value": name,
|
1804
|
+
}
|
1805
|
+
if force_namespace is not None:
|
1806
|
+
data["force_namespace"] = force_namespace
|
1807
|
+
try:
|
1808
|
+
result = request_to_server(
|
1809
|
+
method="post",
|
1810
|
+
endpoint="/v1/describe_pods_for_label",
|
1811
|
+
data=data,
|
1812
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1813
|
+
user_cookie=USER_COOKIE
|
1814
|
+
)
|
1815
|
+
for pod, manifest in result.items():
|
1816
|
+
manifest = json.dumps(manifest, indent=3)
|
1817
|
+
console.log(f"[yellow]Pod {pod}")
|
1818
|
+
console.log(f"{manifest}")
|
1819
|
+
except Exception as e:
|
1820
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1821
|
+
return
|
1822
|
+
|
1823
|
+
|
1824
|
+
@arguably.command
|
1825
|
+
def ray__create(name, template_path, *others, force_namespace: str=None):
|
1826
|
+
"""
|
1827
|
+
Create a cluster using KubeRay operator
|
1828
|
+
"""
|
1829
|
+
|
1830
|
+
try:
|
1831
|
+
CLUSTER.validate_cluster()
|
1832
|
+
except Exception as e:
|
1833
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1834
|
+
return
|
1835
|
+
|
1836
|
+
with open(template_path, "r") as f:
|
1837
|
+
template_yaml = f.read()
|
1838
|
+
|
1839
|
+
data = {
|
1840
|
+
"name": name,
|
1841
|
+
"manifest": template_yaml
|
1842
|
+
}
|
1843
|
+
if force_namespace is not None:
|
1844
|
+
data["force_namespace"] = force_namespace
|
1845
|
+
try:
|
1846
|
+
result = request_to_server(
|
1847
|
+
method="post",
|
1848
|
+
endpoint="/v1/deploy_ray",
|
1849
|
+
data=data,
|
1850
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1851
|
+
user_cookie=USER_COOKIE
|
1852
|
+
)
|
1853
|
+
if len(result['failed']) > 0:
|
1854
|
+
console.log(f"[red]Error when deploying ray manifest\n\n{result['failed']}")
|
1855
|
+
return
|
1856
|
+
if len(result['successful']) > 0:
|
1857
|
+
console.log(f"[green]Ray cluster {name} successfully deployed!")
|
1858
|
+
except Exception as e:
|
1859
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1860
|
+
return
|
1861
|
+
|
1862
|
+
|
1863
|
+
@arguably.command
|
1864
|
+
def ray__list(*status):
|
1865
|
+
"""
|
1866
|
+
List all available ray clusters
|
1867
|
+
"""
|
1868
|
+
try:
|
1869
|
+
CLUSTER.validate_cluster()
|
1870
|
+
except Exception as e:
|
1871
|
+
console.log(f"[red]Problems with your pool: {str(e)}")
|
1872
|
+
return
|
1873
|
+
|
1874
|
+
data = {
|
1875
|
+
"group": "ray.io",
|
1876
|
+
"api_version": "v1",
|
1877
|
+
"plural": "rayclusters"
|
1878
|
+
}
|
1879
|
+
try:
|
1880
|
+
result = request_to_server(
|
1881
|
+
method="post",
|
1882
|
+
endpoint="/v1/get_objects_of_type",
|
1883
|
+
data=data,
|
1884
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
1885
|
+
user_cookie=USER_COOKIE
|
1886
|
+
)
|
1887
|
+
clusters = {ns: ds["items"] for ns, ds in result.items()}
|
1888
|
+
#clusters = result['items']
|
1889
|
+
|
1890
|
+
except Exception as e:
|
1891
|
+
console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
|
1892
|
+
return
|
1893
|
+
|
1894
|
+
if len(clusters) == 0:
|
1895
|
+
console.log("No clusters available")
|
1896
|
+
return
|
1897
|
+
|
1898
|
+
# pretty print
|
1899
|
+
columns = ["Owner", "Name", "Status", "CPUs", "GPUs", "Memory", "Endpoints"]
|
1900
|
+
rows = []
|
1901
|
+
server_ip = load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)
|
1902
|
+
for namespace, clusters in clusters.items():
|
1903
|
+
for cluster in clusters:
|
1904
|
+
cluster_name = cluster['metadata']['name']
|
1905
|
+
cpus = cluster["status"]["desiredCPU"]
|
1906
|
+
gpus = cluster["status"]["desiredGPU"]
|
1907
|
+
memory = cluster["status"]["desiredMemory"]
|
1908
|
+
min_workers = cluster["status"]["minWorkerReplicas"] if "minWorkerReplicas" in cluster["status"] else 0
|
1909
|
+
max_workers = cluster["status"]["maxWorkerReplicas"] if "maxWorkerReplicas" in cluster["status"] else 0
|
1910
|
+
ready_workers = cluster["status"]["readyWorkerReplicas"] if "readyWorkerReplicas" in cluster["status"] else 0
|
1911
|
+
head_status = cluster["status"]["state"] if "state" in cluster["status"] else "creating"
|
1912
|
+
status = f"Head {head_status}\nWorkers: {ready_workers} ready ({min_workers}/{max_workers})"
|
1913
|
+
endpoints = [f"{k}: http://{server_ip}:{v}" for k, v in cluster['status']["endpoints"].items()]
|
1914
|
+
rows.append(
|
1915
|
+
(namespace, cluster_name, status, cpus, gpus, memory, "\n".join(endpoints))
|
1916
|
+
)
|
1917
|
+
table = generate_table(columns=columns, rows=rows)
|
1918
|
+
console.log(table)
|
1919
|
+
|
1920
|
+
@arguably.command
def ray__delete(*others, name, force_namespace=None):
    """
    Delete a ray cluster
    """
    try:
        CLUSTER.validate_cluster()
    except Exception as e:
        console.log(f"[red]Problems with your pool: {str(e)}")
        return

    # delete resources labeled with the ray cluster name via kube-watcher
    data = {
        "label": RAY_LABEL, # this ensures that both raycluster and services are deleted
        "value": name
    }
    if force_namespace is not None:
        data["force_namespace"] = force_namespace
    try:
        result = request_to_server(
            method="post",
            endpoint="/v1/delete_labeled_resources",
            data=data,
            server_creds=USER_LOCAL_SERVER_FILE,
            user_cookie=USER_COOKIE
        )
        console.log(f"{result}")
    except Exception as e:
        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")

@arguably.command
def ray__manifest(*others, name, force_namespace=None):
    """
    Get ray cluster manifest description
    """
    try:
        CLUSTER.validate_cluster()
    except Exception as e:
        console.log(f"[red]Problems with your pool: {str(e)}")
        return

    data = {
        "label": "ray.io/cluster",
        "value": name
    }
    if force_namespace is not None:
        data["force_namespace"] = force_namespace
    try:
        result = request_to_server(
            method="post",
            endpoint="/v1/describe_pods_for_label",
            data=data,
            server_creds=USER_LOCAL_SERVER_FILE,
            user_cookie=USER_COOKIE
        )
        for pod, manifest in result.items():
            manifest = json.dumps(manifest, indent=3)
            console.log(f"[yellow]Pod {pod}")
            console.log(f"{manifest}")
    except Exception as e:
        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
        return


def app():
    user_path("", create_path=True)
    arguably.run()

if __name__ == "__main__":
    app()