kalavai-client 0.5.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kalavai_client/cli.py ADDED
@@ -0,0 +1,1989 @@
1
+ from collections import defaultdict
2
+ import math
3
+ import os
4
+ import json
5
+ import uuid
6
+ import time
7
+ import socket
8
+ from pathlib import Path
9
+ from getpass import getpass
10
+ import ipaddress
11
+ from sys import exit
12
+
13
+ import yaml
14
+ import netifaces as ni
15
+ import arguably
16
+ from rich.console import Console
17
+
18
+ from kalavai_client.utils import (
19
+ check_gpu_drivers,
20
+ run_cmd,
21
+ user_path,
22
+ decode_dict,
23
+ generate_join_token,
24
+ user_confirm,
25
+ load_template,
26
+ store_server_info,
27
+ generate_table,
28
+ request_to_server,
29
+ resource_path,
31
+ safe_remove,
32
+ leave_vpn,
33
+ join_vpn,
34
+ load_server_info,
35
+ user_login,
36
+ user_logout,
37
+ get_public_vpns,
38
+ register_cluster,
39
+ unregister_cluster,
40
+ get_public_seeds,
41
+ validate_join_public_seed,
42
+ is_storage_compatible,
43
+ is_watcher_alive,
44
+ load_user_session,
45
+ SERVER_IP_KEY,
46
+ AUTH_KEY,
47
+ WATCHER_SERVICE_KEY,
48
+ READONLY_AUTH_KEY,
49
+ WRITE_AUTH_KEY,
50
+ PUBLIC_LOCATION_KEY,
51
+ NODE_NAME_KEY,
52
+ CLUSTER_NAME_KEY,
53
+ CLUSTER_IP_KEY,
54
+ CLUSTER_TOKEN_KEY,
55
+ WATCHER_PORT_KEY,
56
+ MANDATORY_TOKEN_FIELDS,
57
+ USER_NODE_LABEL_KEY,
58
+ ALLOW_UNREGISTERED_USER_KEY
59
+ )
60
+ from kalavai_client.cluster import (
61
+ dockerCluster
62
+ )
63
+
64
+
65
+ KALAVAI_PLATFORM_URL = os.getenv("KALAVAI_PLATFORM_URL", "https://platform.kalavai.net")
66
+ LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
67
+ VERSION = 1
68
+ RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
69
+ CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
70
+ TEMPLATE_LABEL = "kalavai.job.name"
71
+ RAY_LABEL = "kalavai.ray.name"
72
+ PVC_NAME_LABEL = "kalavai.storage.name"
73
+ DOCKER_COMPOSE_TEMPLATE = resource_path("assets/docker-compose-template.yaml")
74
+ VPN_COMPOSE_TEMPLATE = resource_path("assets/vpn-template.yaml")
75
+ POOL_CONFIG_TEMPLATE = resource_path("assets/pool_config_template.yaml")
76
+ POOL_CONFIG_DEFAULT_VALUES = resource_path("assets/pool_config_values.yaml")
77
+ USER_WORKSPACE_TEMPLATE = resource_path("assets/user_workspace.yaml")
78
+ DEFAULT_USER_WORKSPACE_VALUES = resource_path("assets/user_workspace_values.yaml")
79
+ STORAGE_CLASS_NAME = "local-path"
80
+ STORAGE_ACCESS_MODE = ["ReadWriteOnce"]
81
+ STORAGE_CLASS_LABEL = "kalavai.storage.enabled"
82
+ DEFAULT_STORAGE_NAME = "pool-cache"
83
+ DEFAULT_STORAGE_SIZE = 20
84
+ USER_NODE_LABEL = "kalavai.cluster.user"
85
+ KUBE_VERSION = os.getenv("KALAVAI_KUBE_VERSION", "v1.31.1+k3s1")
86
+ DEFAULT_FLANNEL_IFACE = os.getenv("KALAVAI_FLANNEL_IFACE", "netmaker")
87
+ FORBIDDEN_IPS = ["127.0.0.1"]
88
+ # kalavai templates
89
+ HELM_APPS_FILE = resource_path("assets/apps.yaml")
90
+ HELM_APPS_VALUES = resource_path("assets/apps_values.yaml")
91
+ # user specific config files
92
+ DEFAULT_CONTAINER_NAME = "kalavai-seed"
93
+ CONTAINER_HOST_PATH = user_path("pool/", create_path=True)
94
+ USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
95
+ USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
96
+ USER_HELM_APPS_FILE = user_path("apps.yaml")
97
+ USER_KUBECONFIG_FILE = user_path("kubeconfig")
98
+ USER_LOCAL_SERVER_FILE = user_path(".server")
99
+ USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
100
+ USER_COOKIE = user_path(".user_cookie.pkl")
101
+
102
+
103
+ console = Console()
104
+ CLUSTER = dockerCluster(
105
+ container_name=DEFAULT_CONTAINER_NAME,
106
+ kube_version=KUBE_VERSION,
107
+ flannel_iface=DEFAULT_FLANNEL_IFACE,
108
+ compose_file=USER_COMPOSE_FILE,
109
+ kubeconfig_file=USER_KUBECONFIG_FILE,
110
+ poolconfig_file=USER_LOCAL_SERVER_FILE,
111
+ dependencies_file=USER_HELM_APPS_FILE
112
+ )
113
+
114
+
115
+ ######################
116
+ ## HELPER FUNCTIONS ##
117
+ ######################
118
+
119
+ def check_vpn_compatibility():
120
+ """Check required packages to join VPN"""
121
+ logs = []
122
+ console.log("[white]Checking system requirements...")
123
+ # netclient
124
+ try:
125
+ run_cmd("sudo netclient version >/dev/null 2>&1")
126
+ except Exception:
127
+ logs.append("[red]Netmaker not installed. Install instructions:\n")
128
+ logs.append(" Linux: https://docs.netmaker.io/docs/netclient#linux\n")
129
+ logs.append(" Windows: https://docs.netmaker.io/docs/netclient#windows\n")
130
+ logs.append(" MacOS: https://docs.netmaker.io/docs/netclient#mac\n")
131
+
132
+ if len(logs) == 0:
133
+ console.log("[green]System is ready to join a pool")
134
+ return True
135
+ else:
136
+ for log in logs:
137
+ console.log(log)
138
+ return False
139
+
140
+ def check_seed_compatibility():
141
+ """Check required packages to start pools"""
142
+ logs = []
143
+ console.log("[white]Checking system requirements...")
144
+ # docker
145
+ try:
146
+ run_cmd("docker version >/dev/null 2>&1")
147
+ except Exception:
148
+ logs.append("[red]Docker not installed. Install instructions:\n")
149
+ logs.append(" Linux: https://docs.docker.com/engine/install/\n")
150
+ logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
151
+
152
+ if len(logs) == 0:
153
+ console.log("[green]System is ready to start a pool")
154
+ return True
155
+ else:
156
+ for log in logs:
157
+ console.log(log)
158
+ return False
159
+
160
+ def check_worker_compatibility():
161
+ """Check required packages to join pools"""
162
+ logs = []
163
+ console.log("[white]Checking system requirements...")
164
+ # docker
165
+ try:
166
+ run_cmd("docker version >/dev/null 2>&1")
167
+ except Exception:
168
+ logs.append("[red]Docker not installed. Install instructions:\n")
169
+ logs.append(" Linux: https://docs.docker.com/engine/install/\n")
170
+ logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
171
+
172
+ if len(logs) == 0:
173
+ console.log("[green]System is ready to join a pool")
174
+ return True
175
+ else:
176
+ for log in logs:
177
+ console.log(log)
178
+ return False
179
+
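+ # Editor's note: the three check_*_compatibility helpers above are identical
+ # except for the dependency probed and the install hints printed. A shared
+ # helper (hypothetical sketch, not part of this release) could look like:
+ # def check_dependency(cmd, hints):
+ # try:
+ # run_cmd(f"{cmd} >/dev/null 2>&1")
+ # return True
+ # except Exception:
+ # for hint in hints:
+ # console.log(hint)
+ # return False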
180
+
181
+ def cleanup_local():
182
+ # disconnect from private network
183
+ console.log("Disconnecting from VPN...")
184
+ try:
185
+ vpns = leave_vpn()
186
+ if vpns is not None:
187
+ for vpn in vpns:
188
+ console.log(f"You have left {vpn} VPN")
189
+ except Exception:
190
+ # no vpn
191
+ pass
192
+ console.log("Removing local cache files...")
193
+ safe_remove(CONTAINER_HOST_PATH)
194
+ safe_remove(USER_COMPOSE_FILE)
195
+ safe_remove(USER_VPN_COMPOSE_FILE)
196
+ safe_remove(USER_HELM_APPS_FILE)
197
+ safe_remove(USER_KUBECONFIG_FILE)
198
+ safe_remove(USER_LOCAL_SERVER_FILE)
199
+ safe_remove(USER_TEMPLATES_FOLDER)
200
+
201
+ def pre_join_check(node_name, server_url, server_key):
202
+ # check with the server that we can connect
203
+ try:
204
+ nodes = request_to_server(
205
+ force_url=server_url,
206
+ force_key=server_key,
207
+ method="get",
208
+ endpoint="/v1/get_nodes",
209
+ data={"node_names": [node_name]},
210
+ server_creds=USER_LOCAL_SERVER_FILE,
211
+ user_cookie=USER_COOKIE
212
+ )
213
+ return node_name not in nodes.keys()
214
+ except Exception as e:
215
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
216
+ return False
217
+
218
+ def set_schedulable(schedulable, node_name=None):
219
+ """
220
+ Set whether a node accepts new workloads (used by cordon/uncordon)
221
+ """
222
+ if node_name is None:
+ node_name = load_server_info(data_key=NODE_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
+ # deploy template with kube-watcher
223
+ data = {
224
+ "schedulable": str(schedulable),
225
+ "node_names": [node_name]
226
+ }
227
+ try:
228
+ res = request_to_server(
229
+ method="post",
230
+ endpoint="/v1/set_node_schedulable",
231
+ data=data,
232
+ server_creds=USER_LOCAL_SERVER_FILE,
233
+ user_cookie=USER_COOKIE
234
+ )
235
+ console.log(f"{res}")
236
+ except Exception as e:
237
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
238
+
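+ # Usage elsewhere in this module: node__cordon calls
+ # set_schedulable(schedulable=False, node_name=...) and node__uncordon calls
+ # it with schedulable=True.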
239
+
240
+ def init_user_workspace(force_namespace=None):
241
+
242
+ # load template config and populate with values
243
+ sidecar_template_yaml = load_template(
244
+ template_path=USER_WORKSPACE_TEMPLATE,
245
+ values={},
246
+ default_values_path=DEFAULT_USER_WORKSPACE_VALUES)
247
+
248
+ try:
249
+ data = {"config": sidecar_template_yaml}
250
+ if force_namespace is not None:
251
+ data["force_namespace"] = force_namespace
252
+ result = request_to_server(
253
+ method="post",
254
+ endpoint="/v1/create_user_space",
255
+ data=data,
256
+ server_creds=USER_LOCAL_SERVER_FILE,
257
+ user_cookie=USER_COOKIE
258
+ )
259
+ console.log(f"Workspace creation (ignore already created warnings): {result}" )
260
+ except Exception as e:
261
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
262
+
263
+ def pool_init(pool_config_values_path=None):
264
+ """Deploy configured objects to initialise pool"""
265
+ if pool_config_values_path is None:
266
+ return
267
+
268
+ # load template config and populate with values
269
+ sidecar_template_yaml = load_template(
270
+ template_path=POOL_CONFIG_TEMPLATE,
271
+ values={},
272
+ default_values_path=pool_config_values_path)
273
+
274
+ try:
275
+ result = request_to_server(
276
+ method="post",
277
+ endpoint="/v1/deploy_generic_model",
278
+ data={"config": sidecar_template_yaml},
279
+ server_creds=USER_LOCAL_SERVER_FILE,
280
+ user_cookie=USER_COOKIE
281
+ )
282
+ if 'failed' in result and len(result['failed']) > 0:
283
+ console.log(f"[red]Error when deploying pool config\n\n{result['failed']}")
284
+ if len(result['successful']) > 0:
285
+ console.log(f"[green]Deployed pool config!")
286
+ except Exception as e:
287
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
288
+
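+ # Both init_user_workspace and pool_init follow the same pattern: render a
+ # bundled template via load_template (values merged over a defaults file,
+ # presumably template-engine style) and POST the resulting YAML to the
+ # watcher API.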
289
+ def select_ip_address(subnet=None):
290
+ ips = []
291
+ retry = 3
292
+ while len(ips) == 0:
293
+ for iface in ni.interfaces():
294
+ try:
295
+ ip = ni.ifaddresses(iface)[ni.AF_INET][0]['addr']
296
+ if ip in FORBIDDEN_IPS:
297
+ continue
298
+ if subnet is None or ipaddress.ip_address(ip) in ipaddress.ip_network(subnet):
299
+ ips.append(ip)
300
+ except Exception:
301
+ pass
302
+ if len(ips) == 1:
303
+ return ips[0]
304
+ time.sleep(2)
305
+ retry -= 1
306
+ if retry < 0:
307
+ raise ValueError(f"No IPs available on subnet {subnet}")
308
+ while True:
309
+ option = user_confirm(
310
+ question="Select IP to advertise the node (needs to be visible to other nodes)",
311
+ options=ips
312
+ )
313
+ if option is not None:
314
+ break
315
+ else:
316
+ console.log("[red] Input error")
317
+ return ips[option]
318
+
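+ # Example (sketch): select_ip_address() with no subnet offers every
+ # non-loopback interface IP for the user to pick from;
+ # select_ip_address(subnet="100.96.0.0/12") (hypothetical VPN range)
+ # restricts the candidates to addresses inside that CIDR.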
319
+ def fetch_gpus():
320
+ data = request_to_server(
321
+ method="post",
322
+ endpoint="/v1/get_node_gpus",
323
+ data={},
324
+ server_creds=USER_LOCAL_SERVER_FILE,
325
+ user_cookie=USER_COOKIE
326
+ )
327
+ return data.items()
328
+
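+ # Response shape assumed by the consumers below (select_gpus, pool__gpus,
+ # job__estimate):
+ # fetch_gpus() -> iterable of (node_name, {"gpus": [{"model": str,
+ # "memory": str (MB), ...}], "available": int, "capacity": int})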
329
+ def select_gpus(message):
330
+ console.log(f"[yellow]{message}")
331
+ gpu_models = ["Any/None"]
332
+ gpu_models_full = ["Any/None"]
333
+ available_gpus = fetch_gpus()
334
+ for _, gpus in available_gpus:
335
+ for gpu in gpus["gpus"]:
336
+ #status = "free" if "ready" in gpu else "busy"
337
+ memory = math.floor(int(gpu['memory'])/1000)
338
+ gpu_models.append(gpu['model'])
339
+ gpu_models_full.append(f"{gpu['model']} ({memory}GB) (in use: {gpus['available'] == 0})" )
340
+
341
+ while True:
342
+ options = user_confirm(
343
+ question=" ",
344
+ options=gpu_models_full,
345
+ multiple=True
346
+ )
347
+ if options is not None:
348
+ if 0 in options:
349
+ ids = None
350
+ else:
351
+ ids = ",".join([gpu_models[i] for i in options])
352
+ break
353
+ return ids
354
+
355
+ def select_token_type():
356
+ options = ["Admin", "User (deploy jobs)", "Worker (read only)"]
357
+
358
+ while True:
359
+ choice = user_confirm(
360
+ question="What type of access are you granting?",
361
+ options=options,
362
+ multiple=False
363
+ )
364
+ if choice is not None:
365
+ break
366
+ return {"admin": choice == 0, "user": choice == 1, "worker": choice == 2}
367
+
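+ # The returned flags map directly onto pool__token's keyword arguments, e.g.
+ # pool__token(**select_token_type()) — this is how pool__publish uses it.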
368
+ def generate_compose_config(role, node_name, ip_address, node_labels, is_public, server=None, token=None):
369
+ num_gpus = 0
370
+ try:
371
+ has_gpus = check_gpu_drivers()
372
+ if has_gpus:
373
+ max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
374
+ num_gpus = user_confirm(
375
+ question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
376
+ options=range(max_gpus+1)
377
+ )
378
+ except Exception:
379
+ console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
380
+ compose_values = {
381
+ "user_path": user_path(""),
382
+ "service_name": DEFAULT_CONTAINER_NAME,
383
+ "pool_ip": server,
384
+ "token": token,
385
+ "hostname": node_name,
386
+ "command": role,
387
+ "storage_enabled": "True",
388
+ "ip_address": ip_address,
389
+ "num_gpus": num_gpus,
390
+ "k3s_path": f"{CONTAINER_HOST_PATH}/k3s",
391
+ "etc_path": f"{CONTAINER_HOST_PATH}/etc",
392
+ "node_labels": " ".join([f"--node-label {key}={value}" for key, value in node_labels.items()]),
393
+ "flannel_iface": DEFAULT_FLANNEL_IFACE if is_public else None
394
+ }
395
+ # generate local config files
396
+ compose_yaml = load_template(
397
+ template_path=DOCKER_COMPOSE_TEMPLATE,
398
+ values=compose_values)
399
+ with open(USER_COMPOSE_FILE, "w") as f:
400
+ f.write(compose_yaml)
401
+ return compose_yaml
402
+
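+ # Called in two flavours: pool__start uses role="server" (no server/token),
+ # while pool__join uses role="agent" with server="https://<seed ip>:6443" and
+ # the k3s join token. The rendered compose file is written to USER_COMPOSE_FILE.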
403
+ ##################
404
+ ## CLI COMMANDS ##
405
+ ##################
406
+
407
+ @arguably.command
408
+ def login(*others, username: str=None):
409
+ """
410
+ [AUTH] (For public clusters only) Log in to Kalavai server.
411
+
412
+ Args:
413
+ *others: all the other positional arguments go here
414
+ """
415
+ console.log(f"Kalavai account details. If you don't have an account, create one at [yellow]{KALAVAI_PLATFORM_URL}")
416
+ if username is None:
417
+ username = input("User email: ")
418
+ password = getpass()
419
+ user = user_login(
420
+ user_cookie=USER_COOKIE,
421
+ username=username,
422
+ password=password
423
+ )
424
+
425
+ if user is not None:
426
+ console.log(f"[green]{username} logged in successfully")
427
+ else:
428
+ console.log(f"[red]Invalid credentials for {username}")
429
+
430
+ return user is not None
431
+
432
+ @arguably.command
433
+ def logout(*others):
434
+ """
435
+ [AUTH] (For public clusters only) Log out of Kalavai server.
436
+
437
+ Args:
438
+ *others: all the other positional arguments go here
439
+ """
440
+ user_logout(
441
+ user_cookie=USER_COOKIE
442
+ )
443
+ console.log("[green]Log out successfull")
444
+
445
+ @arguably.command
446
+ def location__list(*others):
447
+ """
448
+ [AUTH] List public locations on Kalavai
449
+ """
450
+ try:
451
+ seeds = get_public_vpns(user_cookie=USER_COOKIE)
452
+ except Exception as e:
453
+ console.log(f"[red]Error: {str(e)}")
454
+ console.log("Are you authenticated? Try [yellow]kalavai login")
455
+ return
456
+ columns, rows = [], []
457
+ for idx, seed in enumerate(seeds):
458
+ columns = seed.keys()
459
+ rows.append([str(idx)] + list(seed.values()))
460
+ columns = ["VPN"] + list(columns)
461
+ table = generate_table(columns=columns, rows=rows)
462
+ console.log(table)
463
+
464
+ @arguably.command
465
+ def pool__publish(*others, description=None):
466
+ """
467
+ [AUTH] Publish pool to Kalavai platform, where other users may be able to join
468
+ """
469
+ # Check for:
470
+ # - cluster is up and running
471
+ # - cluster is connected to vpn (has net token)
472
+ # - user is authenticated
473
+ try:
474
+ CLUSTER.is_seed_node()
475
+ except Exception as e:
476
+ console.log(f"[red]Problems with your pool: {str(e)}")
477
+ return
478
+ choices = select_token_type()
479
+ token = pool__token(**choices)
480
+
481
+ if description is None:
482
+ console.log("[yellow] [Markdown] In a few words (max 500 chars), describe your goals with this cluster. Remember, this is what other users will see to decide whether to share their resources with you, [blue]so inspire them!")
483
+ description = input(f"(You can edit this later in {KALAVAI_PLATFORM_URL}\n")
484
+
486
+
487
+ try:
488
+ if not pool__check_token(token=token, public=True):
489
+ raise ValueError("[red]Cluster must be started with a valid vpn_location to publish")
490
+ cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
491
+
492
+ register_cluster(
493
+ name=cluster_name,
494
+ token=token,
495
+ description=description,
496
+ user_cookie=USER_COOKIE)
497
+ console.log(f"[green]Your cluster is now public on {KALAVAI_PLATFORM_URL}")
498
+ except Exception as e:
499
+ console.log(f"[red]Error when publishing cluster. {str(e)}")
500
+
501
+ @arguably.command
502
+ def pool__unpublish(cluster_name=None, *others):
503
+ """
504
+ [AUTH] Unpublish pool from the Kalavai platform. The cluster and all its workers will keep working
505
+ """
506
+ # Check for:
507
+ # - cluster is up and running
508
+ # - user is authenticated
509
+ try:
510
+ CLUSTER.is_seed_node()
511
+ except Exception as e:
512
+ console.log(f"[red]Problems with your pool: {str(e)}")
513
+ return
514
+
515
+ try:
516
+ if cluster_name is None:
517
+ cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
518
+ unregister_cluster(
519
+ name=cluster_name,
520
+ user_cookie=USER_COOKIE)
521
+ console.log(f"[green]Your cluster has been removed from {KALAVAI_PLATFORM_URL}")
522
+ except Exception as e:
523
+ console.log(f"[red]Error when unpublishing cluster. {str(e)}")
524
+
525
+ @arguably.command
526
+ def pool__list(*others, user_only=False):
527
+ """
528
+ [AUTH] List public pools on the Kalavai platform.
529
+ """
530
+ try:
531
+ seeds = get_public_seeds(
532
+ user_only=user_only,
533
+ user_cookie=USER_COOKIE)
534
+ except Exception as e:
535
+ console.log(f"[red]Error when loading pools. {str(e)}")
536
+ return
537
+
538
+ for seed in seeds:
539
+ console.log("[yellow]************************************")
540
+ for key, value in seed.items():
541
+ if key == "join_key":
542
+ continue
543
+ console.log(f"[yellow]{key}: [green]{value}")
544
+ print(f"Join key: {seed['join_key']}")
545
+ console.log("[yellow]************************************")
546
+ console.log("[white]Use [yellow]kalavai pool join <join key> [white]to join a public pool")
547
+
548
+
549
+ @arguably.command
550
+ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=HELM_APPS_VALUES, pool_config_values: str=POOL_CONFIG_DEFAULT_VALUES):
551
+ """
552
+ Start Kalavai pool and start/resume sharing resources.
553
+
554
+ Args:
555
+ *others: all the other positional arguments go here
556
+ """
557
+
558
+ if not check_seed_compatibility():
559
+ return
560
+
561
+ if CLUSTER.is_cluster_init():
562
+ console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
563
+ return
564
+
565
+ # User acknowledgement
566
+ option = user_confirm(
567
+ question="Kalavai will now create a pool and a local worker using docker. This won't modify your system. Are you happy to proceed?",
568
+ options=["no", "yes"]
569
+ )
570
+ if option == 0:
571
+ console.log("Installation was cancelled and did not complete.")
572
+ return
573
+
574
+ # if only registered users are allowed, check user has logged in
575
+ user = defaultdict(lambda: None)
576
+ if only_registered_users or location is not None:
577
+ user = user_login(user_cookie=USER_COOKIE)
578
+ if user is None:
579
+ console.log("[white]--only-registered-users [red]or [white]--location[red] can only be used if the host is authenticated. Run [yellow]kalavai login[red] to authenticate")
580
+ exit()
581
+
582
+ # join private network if provided
583
+ vpn = defaultdict(lambda: None)
584
+ node_labels = {
585
+ STORAGE_CLASS_LABEL: is_storage_compatible()
586
+ }
587
+ if location is not None:
588
+ console.log("Joining private network")
589
+ try:
590
+ if not check_vpn_compatibility():
591
+ return
592
+ vpn = join_vpn(
593
+ location=location,
594
+ user_cookie=USER_COOKIE)
595
+ node_labels[USER_NODE_LABEL] = user["username"]
596
+ except Exception as e:
597
+ console.log(f"[red]Error when joining network: {str(e)}")
598
+ return
599
+
600
+ if ip_address is None:
601
+ console.log(f"Scanning for valid IPs (subnet {vpn['subnet']})...")
602
+ ip_address = select_ip_address(subnet=vpn["subnet"])
603
+ console.log(f"Using {ip_address} address for server")
604
+
605
+ auth_key = str(uuid.uuid4())
606
+ write_auth_key = str(uuid.uuid4())
607
+ readonly_auth_key = str(uuid.uuid4())
608
+ watcher_port = 30001
609
+ watcher_service = f"{ip_address}:{watcher_port}"
610
+ values = {
611
+ CLUSTER_NAME_KEY: cluster_name,
612
+ CLUSTER_IP_KEY: ip_address,
613
+ AUTH_KEY: auth_key,
614
+ READONLY_AUTH_KEY: readonly_auth_key,
615
+ WRITE_AUTH_KEY: write_auth_key,
616
+ WATCHER_PORT_KEY: watcher_port,
617
+ WATCHER_SERVICE_KEY: watcher_service,
618
+ USER_NODE_LABEL_KEY: USER_NODE_LABEL,
619
+ ALLOW_UNREGISTERED_USER_KEY: not only_registered_users
620
+ }
621
+
622
+ store_server_info(
623
+ server_ip=ip_address,
624
+ auth_key=auth_key,
625
+ readonly_auth_key=readonly_auth_key,
626
+ write_auth_key=write_auth_key,
627
+ file=USER_LOCAL_SERVER_FILE,
628
+ watcher_service=watcher_service,
629
+ node_name=socket.gethostname(),
630
+ cluster_name=cluster_name,
631
+ public_location=location,
632
+ user_api_key=user["api_key"])
633
+
634
+ # 1. Generate docker compose recipe
635
+ compose_yaml = generate_compose_config(
636
+ role="server",
637
+ node_name=socket.gethostname(),
638
+ ip_address=ip_address,
639
+ node_labels=node_labels,
640
+ is_public=location is not None
641
+ )
642
+
643
+ # Generate helmfile recipe
644
+ helm_yaml = load_template(
645
+ template_path=HELM_APPS_FILE,
646
+ values=values,
647
+ default_values_path=app_values,
648
+ force_defaults=True)
649
+ with open(USER_HELM_APPS_FILE, "w") as f:
650
+ f.write(helm_yaml)
651
+
652
+ console.log("[green]Config files have been generated in your local machine\n")
653
+
654
+ # 2. start server
655
+ console.log("Deploying seed...")
656
+ CLUSTER.start_seed_node()
657
+
658
+ while not CLUSTER.is_agent_running():
659
+ console.log("Waiting for seed to start...")
660
+ time.sleep(10)
661
+
662
+ console.log("Setting pool dependencies...")
663
+ # set template values in helmfile
664
+ try:
665
+ CLUSTER.update_dependencies(
666
+ dependencies_file=USER_HELM_APPS_FILE
667
+ )
668
+ except Exception as e:
669
+ console.log(f"Error: {str(e)}")
670
+ exit()
671
+ console.log("[green]Your pool is ready! Grow it by sharing your joining token with others. Run [yellow]kalavai pool token[green] to generate one.")
672
+
673
+ if location is not None:
674
+ # register with kalavai if it's a public cluster
675
+ console.log("Registering public cluster with Kalavai...")
676
+ pool__publish()
677
+
678
+ # wait until the server is ready to create objects
679
+ while True:
680
+ console.log("Waiting for core services to be ready, may take a few minutes...")
681
+ time.sleep(30)
682
+ if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
683
+ break
684
+ console.log("Initialise user workspace...")
685
+ pool_init(pool_config_values_path=pool_config_values)
686
+ # init default namespace
687
+ init_user_workspace(force_namespace="default")
688
+ if only_registered_users:
689
+ # init user namespace
690
+ init_user_workspace()
691
+
692
+ return None
693
+
694
+
695
+ @arguably.command
696
+ def pool__token(*others, admin=False, user=False, worker=False):
697
+ """
698
+ Generate a join token for others to connect to your pool
699
+ """
700
+ try:
701
+ CLUSTER.validate_cluster()
702
+ except Exception as e:
703
+ console.log(f"[red]Problems with your pool: {str(e)}")
704
+ return
705
+
706
+ if not admin and not user and not worker:
707
+ console.log(f"[red]Select at least one mode (--admin, --user or --worker)")
708
+ return
709
+
710
+ if admin:
711
+ auth_key = load_server_info(data_key=AUTH_KEY, file=USER_LOCAL_SERVER_FILE)
712
+ elif user:
713
+ auth_key = load_server_info(data_key=WRITE_AUTH_KEY, file=USER_LOCAL_SERVER_FILE)
714
+ else:
715
+ auth_key = load_server_info(data_key=READONLY_AUTH_KEY, file=USER_LOCAL_SERVER_FILE)
716
+
717
+ watcher_service = load_server_info(data_key=WATCHER_SERVICE_KEY, file=USER_LOCAL_SERVER_FILE)
718
+ public_location = load_server_info(data_key=PUBLIC_LOCATION_KEY, file=USER_LOCAL_SERVER_FILE)
719
+
720
+ cluster_token = CLUSTER.get_cluster_token()
721
+
722
+ ip_address = load_server_info(SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)
723
+ cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
724
+
725
+ join_token = generate_join_token(
726
+ cluster_ip=ip_address,
727
+ cluster_name=cluster_name,
728
+ cluster_token=cluster_token,
729
+ auth_key=auth_key,
730
+ watcher_service=watcher_service,
731
+ public_location=public_location
732
+ )
733
+
734
+ console.log("[green]Join token:")
735
+ print(join_token)
736
+
737
+ return join_token
738
+
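+ # The join token is assumed to be an encoded dict (see decode_dict in
+ # pool__check_token below); a sketch of the round trip:
+ # token = pool__token(user=True)
+ # data = decode_dict(token)
+ # all(field in data for field in MANDATORY_TOKEN_FIELDS) # True for valid tokens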
739
+ @arguably.command
740
+ def pool__check_token(token, *others, public=False):
741
+ """
742
+ Utility to check the validity of a join token
743
+ """
744
+ try:
745
+ data = decode_dict(token)
746
+ for field in MANDATORY_TOKEN_FIELDS:
747
+ assert field in data
748
+ if public:
749
+ if data[PUBLIC_LOCATION_KEY] is None:
750
+ raise ValueError("Token is not valid for public pools. Did you start the cluster with a public_location?")
751
+ console.log("[green]Token format is correct")
752
+ return True
753
+ except Exception as e:
754
+ console.log(f"[white]{str(e)}")
755
+ console.log("[red]Token is invalid.")
756
+ return False
757
+
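+ # Example CLI usage (assuming arguably maps pool__check_token to a
+ # "pool check-token" subcommand, as with the other pool__* commands):
+ # kalavai pool check-token "<token>" --public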
758
+
759
+ @arguably.command
760
+ def pool__join(token, *others, node_name=None, ip_address: str=None):
761
+ """
762
+ Join Kalavai pool and start/resume sharing resources.
763
+
764
+ Args:
765
+ *others: all the other positional arguments go here
766
+ """
767
+
768
+ if not check_worker_compatibility():
769
+ return
770
+
771
+ # check that k3s is not running already in the host
772
+ # k3s service running or preinstalled
773
+ if CLUSTER.is_agent_running():
774
+ console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
775
+ return
776
+ # check that is not attached to another instance
777
+ if os.path.exists(USER_LOCAL_SERVER_FILE):
778
+ option = user_confirm(
779
+ question="You seem to be connected to an instance already. Are you sure you want to join a new one?",
780
+ options=["no", "yes"]
781
+ )
782
+ if option == 0:
783
+ console.log("[green]Nothing happened.")
784
+ return
785
+
786
+ if node_name is None:
787
+ node_name = socket.gethostname()
788
+
789
+ # check token
790
+ if not pool__check_token(token):
791
+ return
792
+
793
+ try:
794
+ data = decode_dict(token)
795
+ kalavai_seed_ip = data[CLUSTER_IP_KEY]
796
+ kalavai_token = data[CLUSTER_TOKEN_KEY]
797
+ cluster_name = data[CLUSTER_NAME_KEY]
798
+ auth_key = data[AUTH_KEY]
799
+ watcher_service = data[WATCHER_SERVICE_KEY]
800
+ public_location = data[PUBLIC_LOCATION_KEY]
801
+ vpn = defaultdict(lambda: None)
802
+ except Exception as e:
803
+ console.log(str(e))
804
+ console.log("[red] Invalid token")
805
+ return
806
+
807
+ # join private network if provided
808
+ node_labels = {
809
+ STORAGE_CLASS_LABEL: is_storage_compatible()
810
+ }
811
+ user = defaultdict(lambda: None)
812
+ if public_location is not None:
813
+ console.log("Joining private network")
814
+ try:
815
+ if not check_vpn_compatibility():
816
+ return
817
+ vpn = join_vpn(
818
+ location=public_location,
819
+ user_cookie=USER_COOKIE)
820
+ user = user_login(user_cookie=USER_COOKIE)
821
+ node_labels[USER_NODE_LABEL] = user["username"]
822
+ except Exception as e:
823
+ console.log(f"[red]Error when joining network: {str(e)}")
824
+ console.log("Are you authenticated? Try [yellow]kalavai login")
825
+ return
826
+ # validate public seed
827
+ try:
828
+ validate_join_public_seed(
829
+ cluster_name=cluster_name,
830
+ join_key=token,
831
+ user_cookie=USER_COOKIE
832
+ )
833
+ except Exception as e:
834
+ console.log(f"[red]Error when joining network: {str(e)}")
835
+ leave_vpn(vpn_file=USER_VPN_COMPOSE_FILE)
836
+ return
837
+
838
+ # send note to server to let them know the node is coming online
839
+ if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
840
+ console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with '--node-name'")
841
+ leave_vpn(vpn_file=USER_VPN_COMPOSE_FILE)
842
+ return
843
+
844
+ if ip_address is None:
845
+ console.log(f"Scanning for valid IPs (subnet {vpn['subnet']})...")
846
+ ip_address = select_ip_address(subnet=vpn["subnet"])
847
+ console.log(f"Using {ip_address} address for worker")
848
+
849
+ # local agent join
850
+ # 1. Generate local cache files
851
+ console.log("Generating config files...")
852
+ compose_yaml = generate_compose_config(
853
+ role="agent",
854
+ server=f"https://{kalavai_seed_ip}:6443",
855
+ token=kalavai_token,
856
+ node_name=node_name,
857
+ ip_address=ip_address,
858
+ node_labels=node_labels,
859
+ is_public=public_location is not None)
860
+ store_server_info(
861
+ server_ip=kalavai_seed_ip,
862
+ auth_key=auth_key,
863
+ file=USER_LOCAL_SERVER_FILE,
864
+ watcher_service=watcher_service,
865
+ node_name=node_name,
866
+ cluster_name=cluster_name,
867
+ public_location=public_location,
868
+ user_api_key=user["api_key"])
869
+
870
+ init_user_workspace()
871
+
872
+ option = user_confirm(
873
+ question="Docker compose ready. Would you like Kalavai to deploy it?",
874
+ options=["no", "yes"]
875
+ )
876
+ if option == 0:
877
+ console.log("Manually deploy the worker with the following command:\n")
878
+ print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
879
+ return
880
+
881
+ console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
882
+ try:
883
+ CLUSTER.start_worker_node()
884
+ except Exception as e:
885
+ console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
886
+ leave_vpn(vpn_file=USER_VPN_COMPOSE_FILE)
887
+ exit()
888
+
889
+ while not CLUSTER.is_agent_running():
890
+ console.log("Waiting for worker to start...")
891
+ time.sleep(10)
892
+
893
+ # set status to schedulable
894
+ console.log(f"[green] You are connected to {cluster_name}")
895
+
896
+ @arguably.command
897
+ def pool__stop(*others):
898
+ """
899
+ Stop sharing your device and clean up. DO THIS ONLY IF YOU WANT TO REMOVE KALAVAI-CLIENT from your device.
900
+
901
+ Args:
902
+ *others: all the other positional arguments go here
903
+ """
904
+ console.log("[white] Stopping kalavai app...")
905
+ # delete local node from server
906
+ node__delete(load_server_info(data_key=NODE_NAME_KEY, file=USER_LOCAL_SERVER_FILE))
907
+ # unpublish event (only if seed node)
908
+ # TODO: no, this should be done via the platform!!!
909
+ # try:
910
+ # if CLUSTER.is_seed_node():
911
+ # console.log("Unregistering pool...")
912
+ # unregister_cluster(
913
+ # name=load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE),
914
+ # user_cookie=USER_COOKIE)
915
+ # except Exception as e:
916
+ # console.log(f"[red][WARNING]: (ignore if not a public pool) Error when unpublishing cluster. {str(e)}")
917
+ # remove local node agent
918
+ console.log("Removing agent and local cache")
919
+ CLUSTER.remove_agent()
920
+ # clean local files
921
+ cleanup_local()
922
+ console.log("[white] Kalavai has stopped sharing your resources. Use [yellow]kalavai pool start[white] or [yellow]kalavai pool join[white] to start again!")
923
+
924
+ @arguably.command
925
+ def pool__pause(*others):
926
+ """
927
+ Pause sharing your device and make your device unavailable for kalavai scheduling.
928
+
929
+ Args:
930
+ *others: all the other positional arguments go here
931
+ """
932
+ # k3s stop locally
933
+ console.log("[white] Pausing kalavai app...")
934
+ success = CLUSTER.pause_agent()
935
+ if success:
936
+ console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
937
+ else:
938
+ console.log("[red] Error when stopping. Please run [yellow]kalavai pool pause[red] again.")
939
+
940
+ @arguably.command
941
+ def pool__resume(*others):
942
+ """
943
+ Resume sharing your device and make device available for kalavai scheduling.
944
+
945
+ Args:
946
+ *others: all the other positional arguments go here
947
+ """
948
+ # k3s stop locally
949
+ if not CLUSTER.is_cluster_init():
950
+ console.log("[red] Kalavai app was not started before, please run [yellow]kalavai pool start[red] to start a pool or [yellow]kalavai pool join[red] to join one first")
951
+ return
952
+ console.log("[white] Restarting sharing (may take a few minutes)...")
953
+ if CLUSTER.restart_agent():
954
+ console.log("[white] Kalava sharing resumed")
955
+ else:
956
+ console.log("[red] Error when restarting. Please run [yellow]kalavai pool resume[white] again.")
957
+
958
+
959
+ @arguably.command
960
+ def pool__gpus(*others, available=False):
961
+ """
962
+ Display GPU information from all connected nodes
963
+ """
964
+ try:
965
+ CLUSTER.validate_cluster()
966
+ except Exception as e:
967
+ console.log(f"[red]Problems with your pool: {str(e)}")
968
+ return
969
+
970
+ try:
971
+ data = fetch_gpus()
972
+ columns, rows = [], []
973
+ for node, gpus in data:
974
+ row_gpus = []
975
+ for gpu in gpus["gpus"]:
976
+ status = gpu["ready"] if "ready" in gpu else True
977
+ if available and not status:
978
+ continue
979
+ row_gpus.append( (f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
980
+ if len(row_gpus) > 0:
981
+ models, statuses = zip(*row_gpus)
982
+ rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
983
+
984
+ columns = ["Ready", "GPU(s)", "Available", "Total"]
985
+ columns = ["Node"] + columns
986
+ console.print(
987
+ generate_table(columns=columns, rows=rows,end_sections=[n for n in range(len(rows))])
988
+ )
989
+
990
+ except Exception as e:
991
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
992
+
993
+
994
+ @arguably.command
995
+ def pool__resources(*others):
996
+ """
997
+ Display information about resources on the pool
998
+ """
999
+ try:
1000
+ CLUSTER.validate_cluster()
1001
+ except Exception as e:
1002
+ console.log(f"[red]Problems with your pool: {str(e)}")
1003
+ return
1004
+
1005
+ try:
1006
+ total = request_to_server(
1007
+ method="get",
1008
+ endpoint="/v1/get_cluster_total_resources",
1009
+ data={},
1010
+ server_creds=USER_LOCAL_SERVER_FILE,
1011
+ user_cookie=USER_COOKIE
1012
+ )
1013
+ available = request_to_server(
1014
+ method="get",
1015
+ endpoint="/v1/get_cluster_available_resources",
1016
+ data={},
1017
+ server_creds=USER_LOCAL_SERVER_FILE,
1018
+ user_cookie=USER_COOKIE
1019
+ )
1020
+ columns = []
1021
+ total_values = []
1022
+ available_values = []
1023
+ for col in total.keys():
1024
+ if col in RESOURCE_EXCLUDE:
1025
+ continue
1026
+ columns.append(col)
1027
+ total_values.append(str(total[col]))
1028
+ available_values.append(str(available[col]))
1029
+
1030
+ columns = [""] + columns
1031
+ total_values = ["Total"] + total_values
1032
+ available_values = ["Available"] + available_values
1033
+
1034
+ rows = [
1035
+ tuple(available_values),
1036
+ tuple(total_values)
1037
+ ]
1038
+ console.print(
1039
+ generate_table(columns=columns, rows=rows, end_sections=[0, 1])
1040
+ )
1041
+
1042
+ except Exception as e:
1043
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1044
+
1045
+ @arguably.command
1046
+ def pool__update(*others):
1047
+ """
1048
+ Update kalavai pool
1049
+ """
1050
+ try:
1051
+ CLUSTER.validate_cluster()
1052
+ except Exception as e:
1053
+ console.log(f"[red]Problems with your pool: {str(e)}")
1054
+ return
1055
+
1056
+ if not CLUSTER.is_seed_node():
1057
+ console.log("You can only update a pool from the seed node.")
1058
+ return
1059
+
1060
+ # update dependencies
1061
+ try:
1062
+ CLUSTER.update_dependencies(debug=True)
1063
+ console.log("Pool updating. Expect some downtime on core services")
1064
+ except Exception as e:
1065
+ console.log(f"[red]Error when updating pool: {str(e)}")
1066
+ return
1067
+
1068
+
1069
+ @arguably.command
1070
+ def pool__status(*others, log_file=None):
1071
+ """
1072
+ Run diagnostics on a local installation of kalavai
1073
+ * is pool installed
1074
+ * is agent running
1075
+ * is kube-watcher running
1076
+ * is lws running
1077
+ """
1078
+ logs = []
1079
+
1080
+ logs.append("Getting deployment status...")
1081
+
1082
+ if CLUSTER.is_seed_node():
1083
+ # seed node
1084
+ data = CLUSTER.diagnostics()
1085
+ logs.append(data)
1086
+ else:
1087
+ # worker node
1088
+ logs.append("Could not access node info. This info is only available to seed nodes. Ignore if you are on a worker node.")
1089
+ logs.append(f"Worker installed: {CLUSTER.is_cluster_init()}")
1090
+
1091
+ logs.append(f"Worker running: {CLUSTER.is_agent_running()}")
1092
+
1093
+ logs.append(f"Pool credentials present: {CLUSTER.validate_cluster()}")
1094
+
1095
+ if log_file is not None:
1096
+ with open(log_file, "w") as f:
1097
+ for log in logs:
1098
+ f.write(log)
1099
+ f.write("\n")
1100
+ console.log(f"[green]Logs written to {log_file}")
1101
+ else:
1102
+ for log in logs:
1103
+ console.log(f"{log}\n")
1104
+
1105
+ @arguably.command
1106
+ def pool__attach(token, *others, node_name=None):
1107
+ """
1108
+ Set creds in token on the local instance
1109
+ """
1110
+ if os.path.exists(USER_LOCAL_SERVER_FILE):
1111
+ option = user_confirm(
1112
+ question="You seem to be connected to an instance already. Are you sure you want to join a new one?",
1113
+ options=["no", "yes"]
1114
+ )
1115
+ if option == 0:
1116
+ console.log("[green]Nothing happened.")
1117
+ return
1118
+ try:
1119
+ data = decode_dict(token)
1120
+ kalavai_seed_ip = data[CLUSTER_IP_KEY]
1121
+ kalavai_token = data[CLUSTER_TOKEN_KEY]
1122
+ cluster_name = data[CLUSTER_NAME_KEY]
1123
+ auth_key = data[AUTH_KEY]
1124
+ watcher_service = data[WATCHER_SERVICE_KEY]
1125
+ public_location = data[PUBLIC_LOCATION_KEY]
1126
+ except Exception:
1127
+ console.log("[red]Error when parsing token. Invalid token")
1128
+ return
1129
+
1130
+ user = defaultdict(lambda: None)
1131
+ if public_location is not None:
1132
+ console.log("Joining private network")
1133
+ try:
1134
+ if not check_vpn_compatibility():
1135
+ return
1136
+ vpn = join_vpn(
1137
+ location=public_location,
1138
+ user_cookie=USER_COOKIE)
1139
+ user = user_login(user_cookie=USER_COOKIE)
1140
+ time.sleep(5)
1141
+ except Exception as e:
1142
+ console.log(f"[red]Error when joining network: {str(e)}")
1143
+ console.log("Are you authenticated? Try [yellow]kalavai login")
1144
+ return
1145
+ # validate public seed
1146
+ try:
1147
+ validate_join_public_seed(
1148
+ cluster_name=cluster_name,
1149
+ join_key=token,
1150
+ user_cookie=USER_COOKIE
1151
+ )
1152
+ except Exception as e:
1153
+ console.log(f"[red]Error when joining network: {str(e)}")
1154
+ leave_vpn(vpn_file=USER_VPN_COMPOSE_FILE)
1155
+ return
1156
+
1157
+ store_server_info(
1158
+ server_ip=kalavai_seed_ip,
1159
+ auth_key=auth_key,
1160
+ file=USER_LOCAL_SERVER_FILE,
1161
+ watcher_service=watcher_service,
1162
+ node_name=node_name,
1163
+ cluster_name=cluster_name,
1164
+ public_location=public_location,
1165
+ user_api_key=user["api_key"])
1166
+
1167
+ console.log(f"[green]You are now connected to {cluster_name} @ {kalavai_seed_ip}")
1168
+
1169
+
1170
+ @arguably.command
1171
+ def storage__create(name, storage, *others, force_namespace: str=None):
1172
+ """
1173
+ Create storage for the cluster
1174
+ """
1175
+ try:
1176
+ CLUSTER.validate_cluster()
1177
+ except Exception as e:
1178
+ console.log(f"[red]Problems with your pool: {str(e)}")
1179
+ return
1180
+
1181
+ if force_namespace is not None:
1182
+ console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1183
+
1184
+ # Deploy PVC
1185
+ data = {
1186
+ "name": name,
1187
+ "labels": {
1188
+ PVC_NAME_LABEL: name,
1189
+ "kalavai.resource": "storage"
1190
+ },
1191
+ "access_modes": STORAGE_ACCESS_MODE,
1192
+ "storage_class_name": STORAGE_CLASS_NAME,
1193
+ "storage_size": storage
1194
+ }
1195
+ if force_namespace is not None:
1196
+ data["force_namespace"] = force_namespace
1197
+
1198
+ try:
1199
+ result = request_to_server(
1200
+ method="post",
1201
+ endpoint="/v1/deploy_storage_claim",
1202
+ data=data,
1203
+ server_creds=USER_LOCAL_SERVER_FILE,
1204
+ user_cookie=USER_COOKIE
1205
+ )
1206
+ if "error" in result or "detail" in result:
1207
+ console.log(f"Error: {result}")
1208
+ else:
1209
+ console.log(f"Storage {name} ({storage}Gi) created")
1210
+ except Exception as e:
1211
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1212
+
1213
+
1214
+ @arguably.command
1215
+ def storage__list(*others):
1216
+ """
1217
+ List existing storages deployed in the pool
1218
+ """
1219
+ try:
1220
+ CLUSTER.validate_cluster()
1221
+ except Exception as e:
1222
+ console.log(f"[red]Problems with your pool: {str(e)}")
1223
+ return
1224
+
1225
+ try:
1226
+ user = load_user_session(user_cookie=USER_COOKIE)
1227
+ username = user["username"] if user is not None else None
1228
+ result = request_to_server(
1229
+ method="post",
1230
+ endpoint="/v1/get_storage_usage",
1231
+ data={},
1232
+ server_creds=USER_LOCAL_SERVER_FILE,
1233
+ user_cookie=USER_COOKIE
1234
+ )
1235
+
1236
+ columns = []
1237
+ rows = []
1238
+ for namespace, storages in result.items():
1239
+ for name, values in storages.items():
1240
+ if namespace == username:
1241
+ namespace = f"**{namespace}**"
1242
+ columns = list(values.keys())
1243
+ rows.append([namespace, name] + [f"{v:.2f} MB" if "capacity" in k else str(v) for k, v in values.items()])
1244
+
1245
+ if len(rows) == 0:
1246
+ console.log("[green] Storages have not been claimed yet (did you deploy any job using them?)")
1247
+ return
1248
+ columns = ["Owner", "Name"] + columns
1249
+ table = generate_table(columns=columns, rows=rows)
1250
+ console.log(table)
1251
+
1252
+ except Exception as e:
1253
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1254
+
1255
+ @arguably.command
1256
+ def storage__delete(name, *others, force_namespace: str=None):
1257
+ """
1258
+ Delete storage by name
1259
+ """
1260
+ try:
1261
+ CLUSTER.validate_cluster()
1262
+ except Exception as e:
1263
+ console.log(f"[red]Problems with your pool: {str(e)}")
1264
+ return
1265
+
1266
+ if force_namespace is not None:
1267
+ console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1268
+
1269
+ # deploy template with kube-watcher
1270
+ data = {
1271
+ "label": PVC_NAME_LABEL,
1272
+ "value": name
1273
+ }
1274
+ if force_namespace is not None:
1275
+ data["force_namespace"] = force_namespace
1276
+ try:
1277
+ result = request_to_server(
1278
+ method="post",
1279
+ endpoint="/v1/delete_labeled_resources",
1280
+ data=data,
1281
+ server_creds=USER_LOCAL_SERVER_FILE,
1282
+ user_cookie=USER_COOKIE
1283
+ )
1284
+ console.log(f"{result}")
1285
+ except Exception as e:
1286
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1287
+
1288
+ @arguably.command
1289
+ def node__list(*others):
1290
+ """
1291
+ Display information about nodes connected
1292
+ """
1293
+ try:
1294
+ CLUSTER.validate_cluster()
1295
+ except Exception as e:
1296
+ console.log(f"[red]Problems with your pool: {str(e)}")
1297
+ return
1298
+
1299
+ try:
1300
+ data = request_to_server(
1301
+ method="get",
1302
+ endpoint="/v1/get_nodes",
1303
+ data={},
1304
+ server_creds=USER_LOCAL_SERVER_FILE,
1305
+ user_cookie=USER_COOKIE
1306
+ )
1307
+ rows = []
1308
+ columns = ["Node name"]
1309
+ for node, status in data.items():
1310
+ row = [node]
1311
+ for key, value in status.items():
1312
+ if key not in columns:
1313
+ columns.append(key)
1314
+ row.append(str(value))
1315
+ rows.append(tuple(row))
1316
+
1317
+ console.log("Nodes with 'unschedulable=True' will not receive workload")
1318
+ console.log("To make a node unschedulable (i.e. won't receive workloads) use [yellow]kalavai node cordon <node name>")
1319
+ console.log("To make a node schedulable (i.e. will receive workloads) use [yellow]kalavai node uncordon <node name>")
1320
+ console.print(
1321
+ generate_table(columns=columns, rows=rows)
1322
+ )
1323
+
1324
+ except Exception as e:
1325
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1326
+
1327
+
1328
+ @arguably.command
1329
+ def node__delete(name, *others):
1330
+ """
1331
+ Delete a node from the cluster
1332
+ """
1333
+ try:
1334
+ CLUSTER.validate_cluster()
1335
+ except Exception as e:
1336
+ console.log(f"[red]Problems with your pool: {str(e)}")
1337
+ return
1338
+
1339
+ data = {
1340
+ "node_names": [name]
1341
+ }
1342
+ try:
1343
+ result = request_to_server(
1344
+ method="post",
1345
+ endpoint="/v1/delete_nodes",
1346
+ data=data,
1347
+ server_creds=USER_LOCAL_SERVER_FILE,
1348
+ user_cookie=USER_COOKIE
1349
+ )
1350
+ if result is None or result is True:
1351
+ console.log(f"Node {name} deleted successfully")
1352
+ else:
1353
+ console.log(f"{result}")
1354
+ except Exception as e:
1355
+ console.log(f"[yellow](ignore if stopping worker from dead server). Error when removing node {name}: {str(e)}")
1356
+
1357
+
1358
+ @arguably.command
1359
+ def node__cordon(node_name, *others):
1360
+ """
1361
+ Cordon a particular node so no more work will be scheduled on it
1362
+ """
1363
+ try:
1364
+ CLUSTER.validate_cluster()
1365
+ except Exception as e:
1366
+ console.log(f"[red]Problems with your pool: {str(e)}")
1367
+ return
1368
+ set_schedulable(schedulable=False, node_name=node_name)
1369
+
1370
+
1371
+ @arguably.command
1372
+ def node__uncordon(node_name, *others):
1373
+ """
1374
+ Uncordon a particular node to allow more work to be scheduled on it
1375
+ """
1376
+ try:
1377
+ CLUSTER.validate_cluster()
1378
+ except Exception as e:
1379
+ console.log(f"[red]Problems with your pool: {str(e)}")
1380
+ return
1381
+ set_schedulable(schedulable=True, node_name=node_name)
1382
+
1383
+
1384
+ @arguably.command
1385
+ def job__templates(*others):
1386
+ """
1387
+ Job templates integrated with kalavai. Use env var LOCAL_TEMPLATES_DIR to test local templates
1388
+ """
1389
+ try:
1390
+ CLUSTER.validate_cluster()
1391
+ except Exception as e:
1392
+ console.log(f"[red]Problems with your pool: {str(e)}")
1393
+ return
1394
+
1395
+ try:
1396
+ result = request_to_server(
1397
+ method="get",
1398
+ endpoint="/v1/get_job_templates",
1399
+ server_creds=USER_LOCAL_SERVER_FILE,
1400
+ data=None,
1401
+ user_cookie=USER_COOKIE
1402
+ )
1403
+ console.log("Templates available in the pool")
1404
+ console.log(result)
1405
+ except Exception as e:
1406
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1407
+
1408
+
1409
+ @arguably.command
1410
+ def job__run(template_name, *others, values: str=None, force_namespace: str=None):
1411
+ """
1412
+ Deploy and run a template job.
1413
+
1414
+ Args:
1415
+ *others: all the other positional arguments go here
1416
+ """
1417
+ try:
1418
+ CLUSTER.validate_cluster()
1419
+ except Exception as e:
1420
+ console.log(f"[red]Problems with your pool: {str(e)}")
1421
+ return
1422
+
1423
+ if force_namespace is not None:
1424
+ console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1425
+
1426
+ if values is None:
1427
+ values_dict = {}
1428
+ else:
1429
+ if not Path(values).is_file():
1430
+ console.log(f"[red]Values file {values} was not found")
1431
+
1432
+ with open(values, "r") as f:
1433
+ raw_values = yaml.load(f, Loader=yaml.SafeLoader)
1434
+ values_dict = {variable["name"]: variable['value'] for variable in raw_values}
1435
+
1436
+ # Inject hardware information if not present in the template
1437
+ def generate_gpu_annotation(input_message, values, value_key, annotation_key):
1438
+ if value_key not in values:
1439
+ selection = select_gpus(message=input_message)
1440
+ else:
1441
+ selection = values[value_key]
1442
+ if selection is not None:
1443
+ values[value_key] = f"{annotation_key}: {selection}"
1444
+ else:
1445
+ values[value_key] = ""
1446
+ GPU_TYPES_KEY = "use_gputype"
1447
+ GPU_NOTYPES_KEY = "nouse_gputype"
1448
+ console.log("Checking current GPU stock...")
1449
+ generate_gpu_annotation(
1450
+ input_message="SELECT Target GPUs for the job (loading models)",
1451
+ values=values_dict,
1452
+ value_key=GPU_TYPES_KEY,
1453
+ annotation_key="nvidia.com/use-gputype"
1454
+ )
1455
+ generate_gpu_annotation(
1456
+ input_message="AVOID Target GPUs for the job (loading models)",
1457
+ values=values_dict,
1458
+ value_key=GPU_NOTYPES_KEY,
1459
+ annotation_key="nvidia.com/nouse-gputype"
1460
+ )
1461
+
1462
+ # deploy template with kube-watcher
1463
+ data = {
1464
+ "template": template_name,
1465
+ "template_values": values_dict
1466
+ }
1467
+ if force_namespace is not None:
1468
+ data["force_namespace"] = force_namespace
1469
+
1470
+ try:
1471
+ result = request_to_server(
1472
+ method="post",
1473
+ endpoint="/v1/deploy_job",
1474
+ data=data,
1475
+ server_creds=USER_LOCAL_SERVER_FILE,
1476
+ user_cookie=USER_COOKIE
1477
+ )
1478
+ console.log(f"[green]{template_name} job deployed")
1479
+ except Exception as e:
1480
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1481
+ return
1482
+
1483
+
1484
+ @arguably.command
1485
+ def job__defaults(template_name, *others):
1486
+ """
1487
+ Fetch default values.yaml for a template job
1488
+ """
1489
+ try:
1490
+ CLUSTER.validate_cluster()
1491
+ except Exception as e:
1492
+ console.log(f"[red]Problems with your pool: {str(e)}")
1493
+ return
1494
+
1495
+ # deploy template with kube-watcher
1496
+ data = {
1497
+ "template": template_name
1498
+ }
1499
+ try:
1500
+ result = request_to_server(
1501
+ method="get",
1502
+ endpoint="/v1/job_defaults",
1503
+ data=data,
1504
+ server_creds=USER_LOCAL_SERVER_FILE,
1505
+ user_cookie=USER_COOKIE
1506
+ )
1507
+ print(
1508
+ json.dumps(result, indent=3)
1509
+ )
1510
+ except Exception as e:
1511
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1512
+
1513
+
1514
+ @arguably.command
1515
+ def job__delete(name, *others, force_namespace: str=None):
1516
+ """
1517
+ Delete job in the cluster
1518
+ """
1519
+ try:
1520
+ CLUSTER.validate_cluster()
1521
+ except Exception as e:
1522
+ console.log(f"[red]Problems with your pool: {str(e)}")
1523
+ return
1524
+
1525
+ if force_namespace is not None:
1526
+ console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1527
+
1528
+ # deploy template with kube-watcher
1529
+ data = {
1530
+ "label": TEMPLATE_LABEL, # this ensures that both lws template and services are deleted
1531
+ "value": name
1532
+ }
1533
+ if force_namespace is not None:
1534
+ data["force_namespace"] = force_namespace
1535
+ try:
1536
+ result = request_to_server(
1537
+ method="post",
1538
+ endpoint="/v1/delete_labeled_resources",
1539
+ data=data,
1540
+ server_creds=USER_LOCAL_SERVER_FILE,
1541
+ user_cookie=USER_COOKIE
1542
+ )
1543
+ console.log(f"{result}")
1544
+ except Exception as e:
1545
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1546
+
1547
+
1548
+ @arguably.command
1549
+ def job__estimate(billion_parameters, *others, precision=32):
1550
+ """Guesstimate of resources needed based on required memory and current resources"""
1551
+ try:
1552
+ CLUSTER.validate_cluster()
1553
+ except Exception as e:
1554
+ console.log(f"[red]Problems with your pool: {str(e)}")
1555
+ return
1556
+
1557
+ average_vram = 8
1558
+ required_memory = float(billion_parameters) * (precision / 8) / 1.2
1559
+ available_gpus = fetch_gpus()
1560
+ vrams = []
1561
+ for _, gpus in available_gpus:
1562
+ for model in gpus["gpus"]:
1563
+ vrams.extend([int(model["memory"])/1000] * int(gpus["capacity"]) )
1564
+ vrams = sorted(vrams, reverse=False)
1565
+
1566
+ console.log(f"There are {len(vrams)} GPUs available ({sum(vrams)}GBs)")
1567
+ console.log(f"A [yellow]{billion_parameters}B[white] model requires [yellow]~{required_memory:.2f}GB vRAM[white] at {precision}bits precision")
1568
+
1569
+ if sum(vrams) < required_memory:
1570
+ console.log("Current capacity is insufficient to host the model, but it can be scheduled for when it is!")
1571
+ console.log(f"Average devices have {average_vram}GB vRAM, use {math.ceil(required_memory/(average_vram))} GPU workers")
1572
+ else:
1573
+ current_vram = 0
1574
+ n_devices = 0
1575
+ for mem in vrams:
1576
+ current_vram += mem
1577
+ n_devices += 1
1578
+ if current_vram > required_memory:
1579
+ break
1580
+ console.log(f"Looking at current capacity, use [green]{n_devices} GPU workers[white] for a total [green]{current_vram:.2f} GB vRAM")
1581
+
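+ # Worked example with the formula above: an 8B-parameter model at the default
+ # 32-bit precision needs ~8 * (32/8) / 1.2 ≈ 26.7GB of vRAM; at 16 bits,
+ # ~8 * 2 / 1.2 ≈ 13.3GB.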
1582
+ @arguably.command
1583
+ def job__status(name, *others):
1584
+ """
+ Display worker status and conditions for a job
+ """
1585
+ try:
1586
+ # get pod statuses
1587
+ data = {
1588
+ "label": TEMPLATE_LABEL,
1589
+ "value": name
1590
+ }
1591
+ result = request_to_server(
1592
+ method="post",
1593
+ endpoint="/v1/get_pods_status_for_label",
1594
+ data=data,
1595
+ server_creds=USER_LOCAL_SERVER_FILE,
1596
+ user_cookie=USER_COOKIE
1597
+ )
1598
+ workers_status = defaultdict(int)
1599
+ workers_conditions = {}
1600
+ for _, ss in result.items():
1601
+ for pod_name, values in ss.items():
1602
+ workers_status[values["status"]] += 1
1603
+ workers_conditions[pod_name] = values["conditions"]
1604
+ workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
1605
+
1606
+ console.log("Workers conditions")
1607
+ for worker, conditions in workers_conditions.items():
1608
+ console.log(f"[yellow]{worker}")
1609
+ console.log(conditions)
1610
+ console.log(f"[yellow]{workers}\nTotal: {len(workers_status)}")
1611
+ except Exception as e:
1612
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1613
+ return
1614
+
1615
+ @arguably.command
1616
+ def job__list(*others, detailed=False):
1617
+ """
1618
+ List jobs in the cluster
1619
+ """
1620
+ try:
1621
+ CLUSTER.validate_cluster()
1622
+ except Exception as e:
1623
+ console.log(f"[red]Problems with your pool: {str(e)}")
1624
+ return
1625
+
1626
+ data = {
1627
+ "group": "batch.volcano.sh",
1628
+ "api_version": "v1alpha1",
1629
+ "plural": "jobs"
1630
+ }
1631
+ try:
1632
+ result = request_to_server(
1633
+ method="post",
1634
+ endpoint="/v1/get_objects_of_type",
1635
+ data=data,
1636
+ server_creds=USER_LOCAL_SERVER_FILE,
1637
+ user_cookie=USER_COOKIE
1638
+ )
1639
+ all_deployments = defaultdict(list)
1640
+ for ns, ds in result.items():
1641
+ all_deployments[ns].extend([d["metadata"]["labels"][TEMPLATE_LABEL] for d in ds["items"]])
1642
+ #deployments = {ns: d["metadata"]["labels"][TEMPLATE_LABEL] for ns, ds in result.items() for d in ds["items"]}
1643
+ except Exception as e:
1644
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1645
+ return
1646
+ if len(all_deployments.keys()) == 0:
1647
+ console.log("[green]No deployments found.")
1648
+ return
1649
+
1650
+ columns = ["Owner", "Deployment", "Workers", "Endpoint"]
1651
+ if detailed:
1652
+ columns.append("Status")
1653
+ rows = []
1654
+ for namespace, deployments in all_deployments.items():
1655
+ for deployment in deployments:
1656
+ try:
1657
+ # get status for deployment
1658
+ if detailed:
1659
+ data = {
1660
+ "group": "batch.volcano.sh",
1661
+ "api_version": "v1alpha1",
1662
+ "plural": "jobs",
1663
+ # "group": "leaderworkerset.x-k8s.io",
1664
+ # "api_version": "v1",
1665
+ # "plural": "leaderworkersets",
1666
+ "name": deployment
1667
+ }
1668
+ result = request_to_server(
1669
+ method="post",
1670
+ endpoint="/v1/get_status_for_object",
1671
+ data=data,
1672
+ server_creds=USER_LOCAL_SERVER_FILE,
1673
+ user_cookie=USER_COOKIE
1674
+ )
1675
+ # flatten results ({namespace: statuses}) into a single list of statuses
1676
+ ss = [s for values in result.values() for s in values]
1677
+ if len(ss) > 0:
1678
+ last = ss[-1]
1679
+ statuses = f"[{last['lastTransitionTime']}] {last['status']}"
1680
+ else:
1681
+ statuses = "Unknown"
1682
+ # get pod statuses
1683
+ data = {
1684
+ "label": TEMPLATE_LABEL,
1685
+ "value": deployment
1686
+ }
1687
+ result = request_to_server(
1688
+ method="post",
1689
+ endpoint="/v1/get_pods_status_for_label",
1690
+ data=data,
1691
+ server_creds=USER_LOCAL_SERVER_FILE,
1692
+ user_cookie=USER_COOKIE
1693
+ )
1694
+ workers_status = defaultdict(int)
1695
+ for ns, ss in result.items():
1696
+ if ns != namespace: # same job name, different namespace
1697
+ continue
1698
+ for _, values in ss.items():
1699
+ workers_status[values["status"]] += 1
1700
+ workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
1701
+ # get URL details
1702
+ data = {
1703
+ "label": TEMPLATE_LABEL,
1704
+ "value": deployment,
1705
+ "types": ["NodePort"]
1706
+ }
1707
+ result = request_to_server(
1708
+ method="post",
1709
+ endpoint="/v1/get_ports_for_services",
1710
+ data=data,
1711
+ server_creds=USER_LOCAL_SERVER_FILE,
1712
+ user_cookie=USER_COOKIE
1713
+ )
1714
+ node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
1715
+
1716
+ urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
1717
+ row = [namespace, deployment, workers, "\n".join(urls)]
1718
+ if detailed:
1719
+ row.append(statuses)
1720
+ rows.append(row)
1721
+
1722
+ except Exception as e:
1723
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1724
+ return
1725
+
1726
+ console.print(
1727
+ generate_table(columns=columns, rows=rows, end_sections=range(len(rows)))
1728
+ )
1729
+
1730
+ console.log("Check detailed status with [yellow]kalavai job status <name of deployment>")
1731
+ console.log("Get logs with [yellow]kalavai job logs <name of deployment> [white](note it only works when the deployment is complete)")
1732
+
1733
+
1734
+ @arguably.command
1735
+ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namespace: str=None):
1736
+ """
1737
+ Get logs for a specific job
1738
+ """
1739
+ try:
1740
+ CLUSTER.validate_cluster()
1741
+ except Exception as e:
1742
+ console.log(f"[red]Problems with your pool: {str(e)}")
1743
+ return
1744
+
1745
+ if force_namespace is not None:
1746
+ console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1747
+
1748
+ data = {
1749
+ "label": TEMPLATE_LABEL,
1750
+ "value": name,
1751
+ "tail": tail
1752
+ }
1753
+ if force_namespace is not None:
1754
+ data["force_namespace"] = force_namespace
1755
+ while True:
1756
+ try:
1757
+ # send tail as parameter (fetch only last _tail_ lines)
1758
+ result = request_to_server(
1759
+ method="post",
1760
+ endpoint="/v1/get_logs_for_label",
1761
+ data=data,
1762
+ server_creds=USER_LOCAL_SERVER_FILE,
1763
+ user_cookie=USER_COOKIE
1764
+ )
1765
+ if not stream:
1766
+ for pod, logs in result.items():
1767
+ if pod_name is not None and pod_name != pod:
1768
+ continue
1769
+ console.log(f"[yellow]Pod {pod}")
1770
+ console.log(f"[green]{logs}")
1771
+ break
1772
+ else:
1773
+ os.system("clear")
1774
+ for pod, logs in result.items():
1775
+ if pod_name is not None and pod_name != pod:
1776
+ continue
1777
+ print(f"Pod {pod}")
1778
+ print(f"{logs}")
1779
+ time.sleep(1)
1780
+ except KeyboardInterrupt:
1781
+ break
1782
+ except Exception as e:
1783
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1784
+ console.log(f"Check if {name} is running with [yellow]kalavai job list")
1785
+ return
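+ # Usage sketch (hypothetical job name; with --stream the command clears the
+ # screen and reprints the last --tail lines every second until Ctrl+C):
+ # $ kalavai job logs my-llm --stream --tail 50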
1786
+
1787
+ @arguably.command
1788
+ def job__manifest(*others, name, force_namespace: str=None):
1789
+ """
1790
+ Get job manifest description
1791
+ """
1792
+ try:
1793
+ CLUSTER.validate_cluster()
1794
+ except Exception as e:
1795
+ console.log(f"[red]Problems with your pool: {str(e)}")
1796
+ return
1797
+
1798
+ if force_namespace is not None:
1799
+ console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1800
+
1801
+ data = {
1802
+ "label": TEMPLATE_LABEL,
1803
+ "value": name,
1804
+ }
1805
+ if force_namespace is not None:
1806
+ data["force_namespace"] = force_namespace
1807
+ try:
1808
+ result = request_to_server(
1809
+ method="post",
1810
+ endpoint="/v1/describe_pods_for_label",
1811
+ data=data,
1812
+ server_creds=USER_LOCAL_SERVER_FILE,
1813
+ user_cookie=USER_COOKIE
1814
+ )
1815
+ for pod, manifest in result.items():
1816
+ manifest = json.dumps(manifest, indent=3)
1817
+ console.log(f"[yellow]Pod {pod}")
1818
+ console.log(f"{manifest}")
1819
+ except Exception as e:
1820
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1821
+ return
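+ # Usage sketch (hypothetical job name; "name" is keyword-only here, so it
+ # is presumably passed as a flag):
+ # $ kalavai job manifest --name my-llm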
1822
+
1823
+
1824
+ @arguably.command
1825
+ def ray__create(name, template_path, *others, force_namespace: str=None):
1826
+ """
1827
+ Create a Ray cluster using the KubeRay operator
1828
+ """
1829
+
1830
+ try:
1831
+ CLUSTER.validate_cluster()
1832
+ except Exception as e:
1833
+ console.log(f"[red]Problems with your pool: {str(e)}")
1834
+ return
1835
+
1836
+ with open(template_path, "r") as f:
1837
+ template_yaml = f.read()
1838
+
1839
+ data = {
1840
+ "name": name,
1841
+ "manifest": template_yaml
1842
+ }
1843
+ if force_namespace is not None:
1844
+ data["force_namespace"] = force_namespace
1845
+ try:
1846
+ result = request_to_server(
1847
+ method="post",
1848
+ endpoint="/v1/deploy_ray",
1849
+ data=data,
1850
+ server_creds=USER_LOCAL_SERVER_FILE,
1851
+ user_cookie=USER_COOKIE
1852
+ )
1853
+ if len(result['failed']) > 0:
1854
+ console.log(f"[red]Error when deploying ray manifest\n\n{result['failed']}")
1855
+ return
1856
+ if len(result['successful']) > 0:
1857
+ console.log(f"[green]Ray cluster {name} successfully deployed!")
1858
+ except Exception as e:
1859
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1860
+ return
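+ # Usage sketch (names and path are hypothetical; the template should be a
+ # RayCluster manifest that the KubeRay operator can deploy):
+ # $ kalavai ray create my-ray ./raycluster.yaml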
1861
+
1862
+
1863
+ @arguably.command
1864
+ def ray__list(*others):
1865
+ """
1866
+ List all available ray clusters
1867
+ """
1868
+ try:
1869
+ CLUSTER.validate_cluster()
1870
+ except Exception as e:
1871
+ console.log(f"[red]Problems with your pool: {str(e)}")
1872
+ return
1873
+
1874
+ data = {
1875
+ "group": "ray.io",
1876
+ "api_version": "v1",
1877
+ "plural": "rayclusters"
1878
+ }
1879
+ try:
1880
+ result = request_to_server(
1881
+ method="post",
1882
+ endpoint="/v1/get_objects_of_type",
1883
+ data=data,
1884
+ server_creds=USER_LOCAL_SERVER_FILE,
1885
+ user_cookie=USER_COOKIE
1886
+ )
1887
+ clusters = {ns: ds["items"] for ns, ds in result.items()}
1888
+ #clusters = result['items']
1889
+
1890
+ except Exception as e:
1891
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1892
+ return
1893
+
1894
+ if not clusters:
1895
+ console.log("No clusters available")
1896
+ return
1897
+
1898
+ # pretty print
1899
+ columns = ["Owner", "Name", "Status", "CPUs", "GPUs", "Memory", "Endpoints"]
1900
+ rows = []
1901
+ server_ip = load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)
1902
+ for namespace, cluster_list in clusters.items():
1903
+ for cluster in cluster_list:
1904
+ cluster_name = cluster['metadata']['name']
1905
+ cpus = cluster["status"]["desiredCPU"]
1906
+ gpus = cluster["status"]["desiredGPU"]
1907
+ memory = cluster["status"]["desiredMemory"]
1908
+ min_workers = cluster["status"]["minWorkerReplicas"] if "minWorkerReplicas" in cluster["status"] else 0
1909
+ max_workers = cluster["status"]["maxWorkerReplicas"] if "maxWorkerReplicas" in cluster["status"] else 0
1910
+ ready_workers = cluster["status"]["readyWorkerReplicas"] if "readyWorkerReplicas" in cluster["status"] else 0
1911
+ head_status = cluster["status"]["state"] if "state" in cluster["status"] else "creating"
1912
+ status = f"Head {head_status}\nWorkers: {ready_workers} ready ({min_workers}/{max_workers})"
1913
+ endpoints = [f"{k}: http://{server_ip}:{v}" for k, v in cluster['status']["endpoints"].items()]
1914
+ rows.append(
1915
+ (namespace, cluster_name, status, cpus, gpus, memory, "\n".join(endpoints))
1916
+ )
1917
+ table = generate_table(columns=columns, rows=rows)
1918
+ console.log(table)
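+ # Usage sketch: prints one row per RayCluster, with desired resources taken
+ # from the CRD status and endpoint URLs built from the seed server IP:
+ # $ kalavai ray list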
1919
+
1920
+ @arguably.command
1921
+ def ray__delete(*others, name, force_namespace=None):
1922
+ """
1923
+ Delete a ray cluster
1924
+ """
1925
+ try:
1926
+ CLUSTER.validate_cluster()
1927
+ except Exception as e:
1928
+ console.log(f"[red]Problems with your pool: {str(e)}")
1929
+ return
1930
+
1931
+ # deploy template with kube-watcher
1932
+ data = {
1933
+ "label": RAY_LABEL, # this ensures that both raycluster and services are deleted
1934
+ "value": name
1935
+ }
1936
+ if force_namespace is not None:
1937
+ data["force_namespace"] = force_namespace
1938
+ try:
1939
+ result = request_to_server(
1940
+ method="post",
1941
+ endpoint="/v1/delete_labeled_resources",
1942
+ data=data,
1943
+ server_creds=USER_LOCAL_SERVER_FILE,
1944
+ user_cookie=USER_COOKIE
1945
+ )
1946
+ console.log(f"{result}")
1947
+ except Exception as e:
1948
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1949
+
1950
+ @arguably.command
1951
+ def ray__manifest(*others, name, force_namespace=None):
1952
+ """
1953
+ Get ray cluster manifest description
1954
+ """
1955
+ try:
1956
+ CLUSTER.validate_cluster()
1957
+ except Exception as e:
1958
+ console.log(f"[red]Problems with your pool: {str(e)}")
1959
+ return
1960
+
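+ # "ray.io/cluster" is the label KubeRay attaches to every pod belonging to
+ # a RayCluster, so describing pods by this label covers head and workers.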
1961
+ data = {
1962
+ "label": "ray.io/cluster",
1963
+ "value": name
1964
+ }
1965
+ if force_namespace is not None:
1966
+ data["force_namespace"] = force_namespace
1967
+ try:
1968
+ result = request_to_server(
1969
+ method="post",
1970
+ endpoint="/v1/describe_pods_for_label",
1971
+ data=data,
1972
+ server_creds=USER_LOCAL_SERVER_FILE,
1973
+ user_cookie=USER_COOKIE
1974
+ )
1975
+ for pod, manifest in result.items():
1976
+ manifest = json.dumps(manifest, indent=3)
1977
+ console.log(f"[yellow]Pod {pod}")
1978
+ console.log(f"{manifest}")
1979
+ except Exception as e:
1980
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1981
+ return
1982
+
1983
+
1984
+ def app():
1985
+ user_path("", create_path=True)
1986
+ arguably.run()
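+ # app() is the console entry point: it makes sure the user config directory
+ # exists, then hands over to arguably, which builds the "kalavai <group>
+ # <command>" tree from the double-underscore function names above.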
1987
+
1988
+ if __name__ == "__main__":
1989
+ app()