kalavai-client 0.5.14__py3-none-any.whl → 0.5.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kalavai_client/cli.py CHANGED
@@ -7,19 +7,30 @@ import time
7
7
  import socket
8
8
  from pathlib import Path
9
9
  from getpass import getpass
10
- import ipaddress
11
10
  from sys import exit
12
11
 
13
12
  import yaml
14
- import netifaces as ni
13
+
15
14
  import arguably
16
15
  from rich.console import Console
17
16
 
17
+ from kalavai_client.cluster import CLUSTER
18
18
  from kalavai_client.env import (
19
19
  USER_COOKIE,
20
20
  USER_LOCAL_SERVER_FILE,
21
21
  TEMPLATE_LABEL,
22
- user_path
22
+ KALAVAI_PLATFORM_URL,
23
+ DEFAULT_VPN_CONTAINER_NAME,
24
+ CONTAINER_HOST_PATH,
25
+ USER_COMPOSE_FILE,
26
+ USER_HELM_APPS_FILE,
27
+ USER_KUBECONFIG_FILE,
28
+ USER_VPN_COMPOSE_FILE,
29
+ USER_TEMPLATES_FOLDER,
30
+ DOCKER_COMPOSE_GUI,
31
+ USER_GUI_COMPOSE_FILE,
32
+ user_path,
33
+ resource_path,
23
34
  )
24
35
  from kalavai_client.core import (
25
36
  fetch_resources,
@@ -28,22 +39,29 @@ from kalavai_client.core import (
28
39
  fetch_devices,
29
40
  fetch_job_logs,
30
41
  fetch_gpus,
31
- load_gpu_models
42
+ load_gpu_models,
43
+ fetch_job_templates,
44
+ fetch_job_defaults,
45
+ deploy_job,
46
+ delete_job,
47
+ check_token,
48
+ attach_to_pool,
49
+ join_pool,
50
+ create_pool,
51
+ get_ip_addresses,
52
+ pause_agent,
53
+ resume_agent
32
54
  )
33
55
  from kalavai_client.utils import (
34
56
  check_gpu_drivers,
57
+ load_template,
35
58
  run_cmd,
36
- decode_dict,
37
59
  generate_join_token,
38
60
  user_confirm,
39
- load_template,
40
- store_server_info,
41
61
  generate_table,
42
62
  request_to_server,
43
- resource_path,
44
63
  safe_remove,
45
64
  leave_vpn,
46
- get_vpn_details,
47
65
  load_server_info,
48
66
  user_login,
49
67
  user_logout,
@@ -51,9 +69,6 @@ from kalavai_client.utils import (
51
69
  register_cluster,
52
70
  unregister_cluster,
53
71
  get_public_seeds,
54
- validate_join_public_seed,
55
- is_storage_compatible,
56
- is_watcher_alive,
57
72
  load_user_session,
58
73
  SERVER_IP_KEY,
59
74
  AUTH_KEY,
@@ -62,112 +77,30 @@ from kalavai_client.utils import (
62
77
  WRITE_AUTH_KEY,
63
78
  PUBLIC_LOCATION_KEY,
64
79
  NODE_NAME_KEY,
65
- CLUSTER_NAME_KEY,
66
- CLUSTER_IP_KEY,
67
- CLUSTER_TOKEN_KEY,
68
- WATCHER_PORT_KEY,
69
- MANDATORY_TOKEN_FIELDS,
70
- USER_NODE_LABEL_KEY,
71
- ALLOW_UNREGISTERED_USER_KEY
72
- )
73
- from kalavai_client.cluster import (
74
- dockerCluster
80
+ CLUSTER_NAME_KEY
75
81
  )
76
82
 
77
83
 
78
- KALAVAI_PLATFORM_URL = os.getenv("KALAVAI_PLATFORM_URL", "https://platform.kalavai.net")
79
84
  LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
80
85
  VERSION = 1
81
86
  RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
82
87
  CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
83
88
  RAY_LABEL = "kalavai.ray.name"
84
89
  PVC_NAME_LABEL = "kalavai.storage.name"
85
- DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
86
90
  VPN_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/vpn-template.yaml")
87
- POOL_CONFIG_TEMPLATE = resource_path("kalavai_client/assets/pool_config_template.yaml")
88
- POOL_CONFIG_DEFAULT_VALUES = resource_path("kalavai_client/assets/pool_config_values.yaml")
89
- USER_WORKSPACE_TEMPLATE = resource_path("kalavai_client/assets/user_workspace.yaml")
90
- DEFAULT_USER_WORKSPACE_VALUES = resource_path("kalavai_client/assets/user_workspace_values.yaml")
91
91
  STORAGE_CLASS_NAME = "local-path"
92
92
  STORAGE_ACCESS_MODE = ["ReadWriteOnce"]
93
- STORAGE_CLASS_LABEL = "kalavai.storage.enabled"
94
93
  DEFAULT_STORAGE_NAME = "pool-cache"
95
94
  DEFAULT_STORAGE_SIZE = 20
96
- DEFAULT_WATCHER_PORT = 30001
97
- USER_NODE_LABEL = "kalavai.cluster.user"
98
- KUBE_VERSION = os.getenv("KALAVAI_KUBE_VERSION", "v1.31.1+k3s1")
99
- DEFAULT_FLANNEL_IFACE = os.getenv("KALAVAI_FLANNEL_IFACE", "netmaker-1")
100
- FORBIDEDEN_IPS = ["127.0.0.1"]
101
- # kalavai templates
102
- HELM_APPS_FILE = resource_path("kalavai_client/assets/apps.yaml")
103
- HELM_APPS_VALUES = resource_path("kalavai_client/assets/apps_values.yaml")
104
- # user specific config files
105
- DEFAULT_CONTAINER_NAME = "kalavai"
106
- DEFAULT_VPN_CONTAINER_NAME = "kalavai-vpn"
107
- CONTAINER_HOST_PATH = user_path("pool/", create_path=True)
108
- USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
109
- USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
110
- USER_HELM_APPS_FILE = user_path("apps.yaml")
111
- USER_KUBECONFIG_FILE = user_path("kubeconfig")
112
- USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
113
-
114
95
 
96
+
115
97
  console = Console()
116
- CLUSTER = dockerCluster(
117
- container_name=DEFAULT_CONTAINER_NAME,
118
- kube_version=KUBE_VERSION,
119
- flannel_iface=DEFAULT_FLANNEL_IFACE,
120
- compose_file=USER_COMPOSE_FILE,
121
- kubeconfig_file=USER_KUBECONFIG_FILE,
122
- poolconfig_file=USER_LOCAL_SERVER_FILE,
123
- dependencies_file=USER_HELM_APPS_FILE
124
- )
125
98
 
126
99
 
127
100
  ######################
128
101
  ## HELPER FUNCTIONS ##
129
102
  ######################
130
103
 
131
- def check_seed_compatibility():
132
- """Check required packages to start pools"""
133
- logs = []
134
- console.log("[white]Checking system requirements...")
135
- # docker
136
- try:
137
- run_cmd("docker version >/dev/null 2>&1")
138
- except:
139
- logs.append("[red]Docker not installed. Install instructions:\n")
140
- logs.append(" Linux: https://docs.docker.com/engine/install/\n")
141
- logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
142
-
143
- if len(logs) == 0:
144
- console.log("[green]System is ready to start a pool")
145
- return True
146
- else:
147
- for log in logs:
148
- console.log(log)
149
- return False
150
-
151
- def check_worker_compatibility():
152
- """Check required packages to join pools"""
153
- logs = []
154
- console.log("[white]Checking system requirements...")
155
- # docker
156
- try:
157
- run_cmd("docker version >/dev/null 2>&1")
158
- except:
159
- logs.append("[red]Docker not installed. Install instructions:\n")
160
- logs.append(" Linux: https://docs.docker.com/engine/install/\n")
161
- logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
162
-
163
- if len(logs) == 0:
164
- console.log("[green]System is ready to join a pool")
165
- return True
166
- else:
167
- for log in logs:
168
- console.log(log)
169
- return False
170
-
171
104
 
172
105
  def cleanup_local():
173
106
  console.log("Removing local cache files...")
@@ -178,6 +111,7 @@ def cleanup_local():
178
111
  safe_remove(USER_KUBECONFIG_FILE)
179
112
  safe_remove(USER_LOCAL_SERVER_FILE)
180
113
  safe_remove(USER_TEMPLATES_FOLDER)
114
+ safe_remove(USER_GUI_COMPOSE_FILE)
181
115
 
182
116
  def pre_join_check(node_name, server_url, server_key):
183
117
  # check with the server that we can connect
@@ -217,75 +151,9 @@ def set_schedulable(schedulable, node_name=load_server_info(data_key=NODE_NAME_K
217
151
  except Exception as e:
218
152
  console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
219
153
 
220
-
221
- def init_user_workspace(force_namespace=None):
222
-
223
- # load template config and populate with values
224
- sidecar_template_yaml = load_template(
225
- template_path=USER_WORKSPACE_TEMPLATE,
226
- values={},
227
- default_values_path=DEFAULT_USER_WORKSPACE_VALUES)
228
-
229
- try:
230
- data = {"config": sidecar_template_yaml}
231
- if force_namespace is not None:
232
- data["force_namespace"] = force_namespace
233
- result = request_to_server(
234
- method="post",
235
- endpoint="/v1/create_user_space",
236
- data=data,
237
- server_creds=USER_LOCAL_SERVER_FILE,
238
- user_cookie=USER_COOKIE
239
- )
240
- console.log(f"Workspace creation (ignore already created warnings): {result}" )
241
- except Exception as e:
242
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
243
-
244
- def pool_init(pool_config_values_path=None):
245
- """Deploy configured objects to initialise pool"""
246
- if pool_config_values_path is None:
247
- return
248
-
249
- # load template config and populate with values
250
- sidecar_template_yaml = load_template(
251
- template_path=POOL_CONFIG_TEMPLATE,
252
- values={},
253
- default_values_path=pool_config_values_path)
254
-
255
- try:
256
- result = request_to_server(
257
- method="post",
258
- endpoint="/v1/deploy_generic_model",
259
- data={"config": sidecar_template_yaml},
260
- server_creds=USER_LOCAL_SERVER_FILE,
261
- user_cookie=USER_COOKIE
262
- )
263
- if 'failed' in result and len(result['failed']) > 0:
264
- console.log(f"[red]Error when deploying pool config\n\n{result['failed']}")
265
- if len(result['successful']) > 0:
266
- console.log(f"[green]Deployed pool config!")
267
- except Exception as e:
268
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
269
-
270
154
  def select_ip_address(subnet=None):
271
- ips = []
272
- retry = 3
273
- while len(ips) == 0:
274
- for iface in ni.interfaces():
275
- try:
276
- ip = ni.ifaddresses(iface)[ni.AF_INET][0]['addr']
277
- if ip in FORBIDEDEN_IPS:
278
- continue
279
- if subnet is None or ipaddress.ip_address(ip) in ipaddress.ip_network(subnet):
280
- ips.append(ip)
281
- except:
282
- pass
283
- if len(ips) == 1:
284
- return ips[0]
285
- time.sleep(2)
286
- retry -= 1
287
- if retry < 0:
288
- raise ValueError(f"No IPs available on subnet {subnet}")
155
+ ips = get_ip_addresses(subnet=subnet)
156
+
289
157
  while True:
290
158
  option = user_confirm(
291
159
  question="Select IP to advertise the node (needs to be visible to other nodes)",
@@ -336,50 +204,50 @@ def select_token_type():
336
204
  break
337
205
  return {"admin": choice == 0, "user": choice == 1, "worker": choice == 2}
338
206
 
339
- def generate_compose_config(role, node_name, is_public, use_gpus=True, node_labels=None, pool_ip=None, vpn_token=None, pool_token=None):
207
+ def input_gpus():
340
208
  num_gpus = 0
341
- if use_gpus:
342
- try:
343
- has_gpus = check_gpu_drivers()
344
- if has_gpus:
345
- max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
346
- num_gpus = user_confirm(
347
- question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
348
- options=range(max_gpus+1)
349
- )
350
- except:
351
- console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
352
- if node_labels is not None:
353
- node_labels = " ".join([f"--node-label {key}={value}" for key, value in node_labels.items()])
354
- compose_values = {
355
- "user_path": user_path(""),
356
- "service_name": DEFAULT_CONTAINER_NAME,
357
- "vpn": is_public,
358
- "vpn_name": DEFAULT_VPN_CONTAINER_NAME,
359
- "pool_ip": pool_ip,
360
- "pool_token": pool_token,
361
- "vpn_token": vpn_token,
362
- "node_name": node_name,
363
- "command": role,
364
- "storage_enabled": "True",
365
- "num_gpus": num_gpus,
366
- "k3s_path": f"{CONTAINER_HOST_PATH}/k3s",
367
- "etc_path": f"{CONTAINER_HOST_PATH}/etc",
368
- "node_labels": node_labels,
369
- "flannel_iface": DEFAULT_FLANNEL_IFACE if is_public else ""
370
- }
371
- # generate local config files
372
- compose_yaml = load_template(
373
- template_path=DOCKER_COMPOSE_TEMPLATE,
374
- values=compose_values)
375
- with open(USER_COMPOSE_FILE, "w") as f:
376
- f.write(compose_yaml)
377
- return compose_yaml
209
+ try:
210
+ has_gpus = check_gpu_drivers()
211
+ if has_gpus:
212
+ max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
213
+ num_gpus = user_confirm(
214
+ question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
215
+ options=range(max_gpus+1)
216
+ )
217
+ except:
218
+ console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
219
+ return num_gpus
378
220
 
379
221
  ##################
380
222
  ## CLI COMMANDS ##
381
223
  ##################
382
224
 
225
+ @arguably.command
226
+ def gui__start(*others, gui_port=3000, backend_port=8000):
227
+ """Run GUI"""
228
+ values = {
229
+ "path": user_path(""),
230
+ "gui_port": gui_port,
231
+ "backend_port": backend_port
232
+ }
233
+ compose_yaml = load_template(
234
+ template_path=DOCKER_COMPOSE_GUI,
235
+ values=values)
236
+ with open(USER_GUI_COMPOSE_FILE, "w") as f:
237
+ f.write(compose_yaml)
238
+
239
+ run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} up -d")
240
+
241
+ console.log(f"[green]Loading GUI, may take a few minutes. It will be available at http://localhost:{gui_port}")
242
+
243
+ @arguably.command
244
+ def gui__stop(*others):
245
+ """Stop GUI"""
246
+ run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} down")
247
+
248
+ console.log("[green]Kalavai GUI has been stopped")
249
+
250
+
383
251
  @arguably.command
384
252
  def login(*others, username: str=None):
385
253
  """
@@ -461,8 +329,9 @@ def pool__publish(*others, description=None):
461
329
  description = description
462
330
 
463
331
  try:
464
- if not pool__check_token(token=token, public=True):
465
- raise ValueError("[red]Cluster must be started with a valid vpn_location to publish")
332
+ valid = check_token(token=token, public=True)
333
+ if "error" in valid:
334
+ raise ValueError(f"[red]Cluster must be started with a valid vpn_location to publish: {valid}")
466
335
  cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
467
336
 
468
337
  register_cluster(
@@ -523,7 +392,7 @@ def pool__list(*others, user_only=False):
523
392
 
524
393
 
525
394
  @arguably.command
526
- def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=HELM_APPS_VALUES, pool_config_values: str=POOL_CONFIG_DEFAULT_VALUES):
395
+ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=None, pool_config_values: str=None):
527
396
  """
528
397
  Start Kalavai pool and start/resume sharing resources.
529
398
 
@@ -531,9 +400,6 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
531
400
  *others: all the other positional arguments go here
532
401
  """
533
402
 
534
- if not check_seed_compatibility():
535
- return
536
-
537
403
  if CLUSTER.is_cluster_init():
538
404
  console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
539
405
  return
@@ -547,127 +413,25 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
547
413
  console.log("Installation was cancelled and did not complete.")
548
414
  return
549
415
 
550
- # if only registered users are allowed, check user has logged in
551
- user = defaultdict(lambda: None)
552
- if only_registered_users or location is not None:
553
- user = user_login(user_cookie=USER_COOKIE)
554
- if user is None:
555
- console.log("[white]--only-registered-users [red]or [white]--location[red] can only be used if the host is authenticated. Run [yellow]kalavai login[red] to authenticate")
556
- exit()
557
-
558
- # join private network if provided
559
- vpn = defaultdict(lambda: None)
560
- node_labels = {
561
- STORAGE_CLASS_LABEL: is_storage_compatible()
562
- }
563
- if location is not None:
564
- console.log("Fetching VPN credentials")
565
- try:
566
- vpn = get_vpn_details(
567
- location=location,
568
- user_cookie=USER_COOKIE)
569
- node_labels[USER_NODE_LABEL] = user["username"]
570
- except Exception as e:
571
- console.log(f"[red]Error when joining network: {str(e)}")
572
- return
573
-
574
- # Generate docker compose recipe
575
- generate_compose_config(
576
- role="server",
577
- vpn_token=vpn["key"],
578
- node_name=socket.gethostname(),
579
- node_labels=node_labels,
580
- is_public=location is not None
581
- )
582
-
583
- # start server
584
- console.log("Deploying seed...")
585
- CLUSTER.start_seed_node()
586
-
587
- while not CLUSTER.is_agent_running():
588
- console.log("Waiting for seed to start...")
589
- time.sleep(10)
590
-
591
416
  # select IP address (for external discovery)
592
417
  if ip_address is None and location is None:
593
418
  # local IP
594
419
  console.log(f"Scanning for valid IPs")
595
420
  ip_address = select_ip_address()
596
- else:
597
- # load VPN ip
598
- ip_address = CLUSTER.get_vpn_ip()
421
+
599
422
  console.log(f"Using {ip_address} address for server")
600
423
 
601
- # populate local cred files
602
- auth_key = str(uuid.uuid4())
603
- write_auth_key = str(uuid.uuid4())
604
- readonly_auth_key = str(uuid.uuid4())
605
-
606
- watcher_service = f"{ip_address}:{DEFAULT_WATCHER_PORT}"
607
- values = {
608
- CLUSTER_NAME_KEY: cluster_name,
609
- CLUSTER_IP_KEY: ip_address,
610
- AUTH_KEY: auth_key,
611
- READONLY_AUTH_KEY: readonly_auth_key,
612
- WRITE_AUTH_KEY: write_auth_key,
613
- WATCHER_PORT_KEY: DEFAULT_WATCHER_PORT,
614
- WATCHER_SERVICE_KEY: watcher_service,
615
- USER_NODE_LABEL_KEY: USER_NODE_LABEL,
616
- ALLOW_UNREGISTERED_USER_KEY: not only_registered_users
617
- }
424
+ console.log(f"[green]Creating {cluster_name} pool, this may take a few minutes...")
618
425
 
619
- store_server_info(
620
- server_ip=ip_address,
621
- auth_key=auth_key,
622
- readonly_auth_key=readonly_auth_key,
623
- write_auth_key=write_auth_key,
624
- file=USER_LOCAL_SERVER_FILE,
625
- watcher_service=watcher_service,
626
- node_name=socket.gethostname(),
426
+ create_pool(
627
427
  cluster_name=cluster_name,
628
- public_location=location,
629
- user_api_key=user["api_key"])
630
-
631
- # Generate helmfile recipe
632
- helm_yaml = load_template(
633
- template_path=HELM_APPS_FILE,
634
- values=values,
635
- default_values_path=app_values,
636
- force_defaults=True)
637
- with open(USER_HELM_APPS_FILE, "w") as f:
638
- f.write(helm_yaml)
639
-
640
- console.log("[green]Config files have been generated in your local machine\n")
641
-
642
- console.log("Setting pool dependencies...")
643
- # set template values in helmfile
644
- try:
645
- CLUSTER.update_dependencies(
646
- dependencies_file=USER_HELM_APPS_FILE
647
- )
648
- except Exception as e:
649
- console.log(f"Error: {str(e)}")
650
- exit()
651
- console.log("[green]Your pool is ready! Grow it by sharing your joining token with others. Run [yellow]kalavai pool token[green] to generate one.")
652
-
653
- if location is not None:
654
- # register with kalavai if it's a public cluster
655
- console.log("Registering public cluster with Kalavai...")
656
- pool__publish()
657
-
658
- # wait until the server is ready to create objects
659
- while True:
660
- console.log("Waiting for core services to be ready, may take a few minutes...")
661
- time.sleep(30)
662
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
663
- break
664
- console.log("Initialise user workspace...")
665
- pool_init(pool_config_values_path=pool_config_values)
666
- # init default namespace
667
- init_user_workspace(force_namespace="default")
668
- if only_registered_users:
669
- # init user namespace
670
- init_user_workspace()
428
+ ip_address=ip_address,
429
+ app_values=app_values,
430
+ pool_config_values=pool_config_values,
431
+ num_gpus=input_gpus(),
432
+ only_registered_users=only_registered_users,
433
+ location=location
434
+ )
671
435
 
672
436
  return None
673
437
 
@@ -720,20 +484,13 @@ def pool__check_token(token, *others, public=False):
720
484
  """
721
485
  Utility to check the validity of a join token
722
486
  """
723
- try:
724
- data = decode_dict(token)
725
- for field in MANDATORY_TOKEN_FIELDS:
726
- assert field in data
727
- if public:
728
- if data[PUBLIC_LOCATION_KEY] is None:
729
- raise ValueError("Token is not valid for public pools. Did you start the cluster with a public_location?")
730
- console.log("[green]Token format is correct")
731
- return True
732
- except Exception as e:
733
- console.log(f"[white]{str(e)}")
734
- console.log("[red]Token is invalid.")
487
+ result = check_token(token=token, public=public)
488
+ if "error" in result:
489
+ console.log(f"[red]Error in token: {result}")
735
490
  return False
736
-
491
+
492
+ console.log("[green]Token format is correct")
493
+ return True
737
494
 
738
495
  @arguably.command
739
496
  def pool__join(token, *others, node_name=None):
@@ -743,9 +500,6 @@ def pool__join(token, *others, node_name=None):
743
500
  Args:
744
501
  *others: all the other positional arguments go here
745
502
  """
746
-
747
- if not check_worker_compatibility():
748
- return
749
503
 
750
504
  # check that k3s is not running already in the host
751
505
  # k3s service running or preinstalled
@@ -763,132 +517,26 @@ def pool__join(token, *others, node_name=None):
763
517
  console.log("[green]Nothing happened.")
764
518
  return
765
519
 
766
- if node_name is None:
767
- node_name = socket.gethostname()
768
-
769
- # check token
770
- if not pool__check_token(token):
771
- return
772
-
773
- try:
774
- data = decode_dict(token)
775
- kalavai_seed_ip = data[CLUSTER_IP_KEY]
776
- kalavai_token = data[CLUSTER_TOKEN_KEY]
777
- cluster_name = data[CLUSTER_NAME_KEY]
778
- auth_key = data[AUTH_KEY]
779
- watcher_service = data[WATCHER_SERVICE_KEY]
780
- public_location = data[PUBLIC_LOCATION_KEY]
781
- vpn = defaultdict(lambda: None)
782
- except Exception as e:
783
- console.log(str(e))
784
- console.log("[red] Invalid token")
785
- return
786
-
787
- # join private network if provided
788
- node_labels = {
789
- STORAGE_CLASS_LABEL: is_storage_compatible()
790
- }
791
- user = defaultdict(lambda: None)
792
- if public_location is not None:
793
- user = user_login(user_cookie=USER_COOKIE)
794
- if user is None:
795
- console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
796
- exit()
797
- console.log("Fetching VPN credentials")
798
- try:
799
- vpn = get_vpn_details(
800
- location=public_location,
801
- user_cookie=USER_COOKIE)
802
- node_labels[USER_NODE_LABEL] = user["username"]
803
- except Exception as e:
804
- console.log(f"[red]Error when joining network: {str(e)}")
805
- console.log("Are you authenticated? Try [yellow]kalavai login")
806
- return
807
- try:
808
- validate_join_public_seed(
809
- cluster_name=cluster_name,
810
- join_key=token,
811
- user_cookie=USER_COOKIE
812
- )
813
- except Exception as e:
814
- console.log(f"[red]Error when joining network: {str(e)}")
815
- return
816
-
817
- # send note to server to let them know the node is coming online
818
- # TODO: won't be able to check for VPN pools...
819
- # if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
820
- # console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with '--node-name'")
821
- # leave_vpn(container_name=DEFAULT_VPN_CONTAINER_NAME)
822
- # return
823
-
824
- # local agent join
825
- # 1. Generate local cache files
826
- console.log("Generating config files...")
827
-
828
- # Generate docker compose recipe
829
- generate_compose_config(
830
- role="agent",
831
- pool_ip=f"https://{kalavai_seed_ip}:6443",
832
- pool_token=kalavai_token,
833
- vpn_token=vpn["key"],
834
- node_name=node_name,
835
- node_labels=node_labels,
836
- is_public=public_location is not None)
837
-
838
- store_server_info(
839
- server_ip=kalavai_seed_ip,
840
- auth_key=auth_key,
841
- file=USER_LOCAL_SERVER_FILE,
842
- watcher_service=watcher_service,
843
- node_name=node_name,
844
- cluster_name=cluster_name,
845
- public_location=public_location,
846
- user_api_key=user["api_key"])
520
+ num_gpus = input_gpus()
847
521
 
848
522
  option = user_confirm(
849
523
  question="Docker compose ready. Would you like Kalavai to deploy it?",
850
524
  options=["no", "yes"]
851
525
  )
852
526
  if option == 0:
853
- console.log("Manually deploy the worker with the following command:\n")
854
- print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
527
+ console.log("[red]Installation aborted")
855
528
  return
856
529
 
857
- console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
858
- try:
859
- CLUSTER.start_worker_node()
860
- except Exception as e:
861
- console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
862
- pool__stop()
863
- exit()
864
-
865
- # ensure we are connected
866
- while True:
867
- console.log("Waiting for core services to be ready, may take a few minutes...")
868
- time.sleep(30)
869
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
870
- break
871
-
872
- # send note to server to let them know the node is coming online
873
- if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
874
- console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with [yellow]--node-name'")
875
- pool__stop()
876
- return
877
-
878
- # check the node has connected successfully
879
- try:
880
- while not CLUSTER.is_agent_running():
881
- console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
882
- time.sleep(30)
883
- except KeyboardInterrupt:
884
- console.log("[red]Installation aborted. Leaving pool.")
885
- pool__stop()
886
- return
887
-
888
- init_user_workspace()
889
-
890
- # set status to schedulable
891
- console.log(f"[green] You are connected to {cluster_name}")
530
+ console.log("Connecting worker to the pool...")
531
+ result = join_pool(
532
+ token=token,
533
+ node_name=node_name,
534
+ num_gpus=num_gpus
535
+ )
536
+ if "error" in result:
537
+ console.log(f"[red]Error when connecting: {result}")
538
+ else:
539
+ console.log(f"[green] You are connected to {result}")
892
540
 
893
541
  @arguably.command
894
542
  def pool__stop(*others, skip_node_deletion=False):
@@ -942,11 +590,12 @@ def pool__pause(*others):
942
590
  """
943
591
  # k3s stop locally
944
592
  console.log("[white] Pausing kalavai app...")
945
- success = CLUSTER.pause_agent()
946
- if success:
947
- console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
593
+ success = pause_agent()
594
+ if "error" in success:
595
+ console.log(f"[red] Error when stopping. {success['error']}")
948
596
  else:
949
- console.log("[red] Error when stopping. Please run [yellow]kalavai pool pause[red] again.")
597
+ console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
598
+
950
599
 
951
600
  @arguably.command
952
601
  def pool__resume(*others):
@@ -961,10 +610,12 @@ def pool__resume(*others):
961
610
  console.log("[red] Kalavai app was not started before, please run [yellow]kalavai pool start[red] to start a pool or [yellow]kalavai pool join[red] to join one first")
962
611
  return
963
612
  console.log("[white] Restarting sharing (may take a few minutes)...")
964
- if CLUSTER.restart_agent():
965
- console.log("[white] Kalava sharing resumed")
613
+ success = resume_agent()
614
+ if "error" in success:
615
+ console.log(f"[red] Error when restarting. {success['error']}")
966
616
  else:
967
- console.log("[red] Error when restarting. Please run [yellow]kalavai pool resume[white] again.")
617
+ console.log("[white] Kalava sharing resumed")
618
+
968
619
 
969
620
 
970
621
  @arguably.command
@@ -1103,7 +754,7 @@ def pool__attach(token, *others, node_name=None):
1103
754
  """
1104
755
 
1105
756
  if node_name is None:
1106
- node_name = socket.gethostname()
757
+ node_name = f"{socket.gethostname()}-{uuid.uuid4().hex[:6]}"
1107
758
 
1108
759
  # check that is not attached to another instance
1109
760
  if os.path.exists(USER_LOCAL_SERVER_FILE):
@@ -1115,70 +766,6 @@ def pool__attach(token, *others, node_name=None):
1115
766
  console.log("[green]Nothing happened.")
1116
767
  return
1117
768
 
1118
- # check token
1119
- if not pool__check_token(token):
1120
- return
1121
-
1122
- try:
1123
- data = decode_dict(token)
1124
- kalavai_seed_ip = data[CLUSTER_IP_KEY]
1125
- cluster_name = data[CLUSTER_NAME_KEY]
1126
- auth_key = data[AUTH_KEY]
1127
- watcher_service = data[WATCHER_SERVICE_KEY]
1128
- public_location = data[PUBLIC_LOCATION_KEY]
1129
- vpn = defaultdict(lambda: None)
1130
- except Exception as e:
1131
- console.log(str(e))
1132
- console.log("[red] Invalid token")
1133
- return
1134
-
1135
- user = defaultdict(lambda: None)
1136
- if public_location is not None:
1137
- user = user_login(user_cookie=USER_COOKIE)
1138
- if user is None:
1139
- console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
1140
- exit()
1141
- console.log("Fetching VPN credentials")
1142
- try:
1143
- vpn = get_vpn_details(
1144
- location=public_location,
1145
- user_cookie=USER_COOKIE)
1146
- except Exception as e:
1147
- console.log(f"[red]Error when joining network: {str(e)}")
1148
- console.log("Are you authenticated? Try [yellow]kalavai login")
1149
- return
1150
- try:
1151
- validate_join_public_seed(
1152
- cluster_name=cluster_name,
1153
- join_key=token,
1154
- user_cookie=USER_COOKIE
1155
- )
1156
- except Exception as e:
1157
- console.log(f"[red]Error when joining network: {str(e)}")
1158
- return
1159
-
1160
- # local agent join
1161
- # 1. Generate local cache files
1162
- console.log("Generating config files...")
1163
-
1164
- # Generate docker compose recipe
1165
- generate_compose_config(
1166
- use_gpus=False,
1167
- role="",
1168
- vpn_token=vpn["key"],
1169
- node_name=node_name,
1170
- is_public=public_location is not None)
1171
-
1172
- store_server_info(
1173
- server_ip=kalavai_seed_ip,
1174
- auth_key=auth_key,
1175
- file=USER_LOCAL_SERVER_FILE,
1176
- watcher_service=watcher_service,
1177
- node_name=node_name,
1178
- cluster_name=cluster_name,
1179
- public_location=public_location,
1180
- user_api_key=user["api_key"])
1181
-
1182
769
  option = user_confirm(
1183
770
  question="Docker compose ready. Would you like Kalavai to deploy it?",
1184
771
  options=["no", "yes"]
@@ -1188,17 +775,13 @@ def pool__attach(token, *others, node_name=None):
1188
775
  print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
1189
776
  return
1190
777
 
1191
- console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
1192
- run_cmd(f"docker compose -f {USER_COMPOSE_FILE} up -d")
1193
- # ensure we are connected
1194
- while True:
1195
- console.log("Waiting for core services to be ready, may take a few minutes...")
1196
- time.sleep(30)
1197
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
1198
- break
778
+ result = attach_to_pool(token=token, node_name=node_name)
1199
779
 
780
+ if "error" in result:
781
+ console.log(f"[red]Error when attaching to pool: {result}")
782
+ return
1200
783
  # set status to schedulable
1201
- console.log(f"[green] You are connected to {cluster_name}")
784
+ console.log(f"[green] You are connected to {result}")
1202
785
 
1203
786
 
1204
787
  @arguably.command
@@ -1422,18 +1005,13 @@ def job__templates(*others):
1422
1005
  console.log(f"[red]Problems with your pool: {str(e)}")
1423
1006
  return
1424
1007
 
1425
- try:
1426
- result = request_to_server(
1427
- method="get",
1428
- endpoint="/v1/get_job_templates",
1429
- server_creds=USER_LOCAL_SERVER_FILE,
1430
- data=None,
1431
- user_cookie=USER_COOKIE
1432
- )
1433
- console.log("Templates available in the pool")
1434
- console.log(result)
1435
- except Exception as e:
1436
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1008
+ templates = fetch_job_templates()
1009
+ if "error" in templates:
1010
+ console.log(f"[red]Error when fetching templates: {str(e)}")
1011
+ return
1012
+
1013
+ console.log("Templates available in the pool")
1014
+ console.log(templates)
1437
1015
 
1438
1016
 
1439
1017
  @arguably.command
@@ -1489,26 +1067,16 @@ def job__run(template_name, *others, values: str=None, force_namespace: str=None
1489
1067
  annotation_key="nvidia.com/nouse-gputype"
1490
1068
  )
1491
1069
 
1492
- # deploy template with kube-watcher
1493
- data = {
1494
- "template": template_name,
1495
- "template_values": values_dict
1496
- }
1497
- if force_namespace is not None:
1498
- data["force_namespace"] = force_namespace
1070
+ result = deploy_job(
1071
+ template_name=template_name,
1072
+ values_dict=values_dict,
1073
+ force_namespace=force_namespace
1074
+ )
1499
1075
 
1500
- try:
1501
- result = request_to_server(
1502
- method="post",
1503
- endpoint="/v1/deploy_job",
1504
- data=data,
1505
- server_creds=USER_LOCAL_SERVER_FILE,
1506
- user_cookie=USER_COOKIE
1507
- )
1076
+ if "error" in result:
1077
+ console.log(f"[red]Error when deploying job: {str(e)}")
1078
+ else:
1508
1079
  console.log(f"[green]{template_name} job deployed")
1509
- except Exception as e:
1510
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1511
- return
1512
1080
 
1513
1081
  @arguably.command
1514
1082
  def job__test(local_template_dir, *others, values, defaults, force_namespace: str=None):
@@ -1581,22 +1149,12 @@ def job__defaults(template_name, *others):
1581
1149
  return
1582
1150
 
1583
1151
  # deploy template with kube-watcher
1584
- data = {
1585
- "template": template_name
1586
- }
1587
- try:
1588
- result = request_to_server(
1589
- method="get",
1590
- endpoint="/v1/job_defaults",
1591
- data=data,
1592
- server_creds=USER_LOCAL_SERVER_FILE,
1593
- user_cookie=USER_COOKIE
1594
- )
1595
- print(
1596
- json.dumps(result,indent=3)
1597
- )
1598
- except Exception as e:
1599
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1152
+ defaults = fetch_job_defaults(name=template_name)
1153
+ if "error" in defaults:
1154
+ console.log(f"[red]Error when fetching job defaults: {defaults}")
1155
+ print(
1156
+ json.dumps(defaults, indent=3)
1157
+ )
1600
1158
 
1601
1159
 
1602
1160
  @arguably.command
@@ -1614,23 +1172,11 @@ def job__delete(name, *others, force_namespace: str=None):
1614
1172
  console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1615
1173
 
1616
1174
  # deploy template with kube-watcher
1617
- data = {
1618
- "label": TEMPLATE_LABEL, # this ensures that both lws template and services are deleted
1619
- "value": name
1620
- }
1621
- if force_namespace is not None:
1622
- data["force_namespace"] = force_namespace
1623
- try:
1624
- result = request_to_server(
1625
- method="post",
1626
- endpoint="/v1/delete_labeled_resources",
1627
- data=data,
1628
- server_creds=USER_LOCAL_SERVER_FILE,
1629
- user_cookie=USER_COOKIE
1630
- )
1175
+ result = delete_job(name=name, force_namespace=force_namespace)
1176
+ if "error" in result:
1177
+ console.log(f"[red]Error when deleting job: {str(e)}")
1178
+ else:
1631
1179
  console.log(f"{result}")
1632
- except Exception as e:
1633
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1634
1180
 
1635
1181
 
1636
1182
  @arguably.command