kalavai-client 0.5.15__py3-none-any.whl → 0.5.16__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
kalavai_client/cli.py CHANGED
@@ -7,19 +7,30 @@ import time
7
7
  import socket
8
8
  from pathlib import Path
9
9
  from getpass import getpass
10
- import ipaddress
11
10
  from sys import exit
12
11
 
13
12
  import yaml
14
- import netifaces as ni
13
+
15
14
  import arguably
16
15
  from rich.console import Console
17
16
 
17
+ from kalavai_client.cluster import CLUSTER
18
18
  from kalavai_client.env import (
19
19
  USER_COOKIE,
20
20
  USER_LOCAL_SERVER_FILE,
21
21
  TEMPLATE_LABEL,
22
- user_path
22
+ KALAVAI_PLATFORM_URL,
23
+ DEFAULT_VPN_CONTAINER_NAME,
24
+ CONTAINER_HOST_PATH,
25
+ USER_COMPOSE_FILE,
26
+ USER_HELM_APPS_FILE,
27
+ USER_KUBECONFIG_FILE,
28
+ USER_VPN_COMPOSE_FILE,
29
+ USER_TEMPLATES_FOLDER,
30
+ DOCKER_COMPOSE_GUI,
31
+ USER_GUI_COMPOSE_FILE,
32
+ user_path,
33
+ resource_path,
23
34
  )
24
35
  from kalavai_client.core import (
25
36
  fetch_resources,
@@ -28,22 +39,29 @@ from kalavai_client.core import (
28
39
  fetch_devices,
29
40
  fetch_job_logs,
30
41
  fetch_gpus,
31
- load_gpu_models
42
+ load_gpu_models,
43
+ fetch_job_templates,
44
+ fetch_job_defaults,
45
+ deploy_job,
46
+ delete_job,
47
+ check_token,
48
+ attach_to_pool,
49
+ join_pool,
50
+ create_pool,
51
+ get_ip_addresses,
52
+ pause_agent,
53
+ resume_agent
32
54
  )
33
55
  from kalavai_client.utils import (
34
56
  check_gpu_drivers,
57
+ load_template,
35
58
  run_cmd,
36
- decode_dict,
37
59
  generate_join_token,
38
60
  user_confirm,
39
- load_template,
40
- store_server_info,
41
61
  generate_table,
42
62
  request_to_server,
43
- resource_path,
44
63
  safe_remove,
45
64
  leave_vpn,
46
- get_vpn_details,
47
65
  load_server_info,
48
66
  user_login,
49
67
  user_logout,
@@ -51,9 +69,6 @@ from kalavai_client.utils import (
51
69
  register_cluster,
52
70
  unregister_cluster,
53
71
  get_public_seeds,
54
- validate_join_public_seed,
55
- is_storage_compatible,
56
- is_watcher_alive,
57
72
  load_user_session,
58
73
  SERVER_IP_KEY,
59
74
  AUTH_KEY,
@@ -62,112 +77,30 @@ from kalavai_client.utils import (
62
77
  WRITE_AUTH_KEY,
63
78
  PUBLIC_LOCATION_KEY,
64
79
  NODE_NAME_KEY,
65
- CLUSTER_NAME_KEY,
66
- CLUSTER_IP_KEY,
67
- CLUSTER_TOKEN_KEY,
68
- WATCHER_PORT_KEY,
69
- MANDATORY_TOKEN_FIELDS,
70
- USER_NODE_LABEL_KEY,
71
- ALLOW_UNREGISTERED_USER_KEY
72
- )
73
- from kalavai_client.cluster import (
74
- dockerCluster
80
+ CLUSTER_NAME_KEY
75
81
  )
76
82
 
77
83
 
78
- KALAVAI_PLATFORM_URL = os.getenv("KALAVAI_PLATFORM_URL", "https://platform.kalavai.net")
79
84
  LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
80
85
  VERSION = 1
81
86
  RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
82
87
  CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
83
88
  RAY_LABEL = "kalavai.ray.name"
84
89
  PVC_NAME_LABEL = "kalavai.storage.name"
85
- DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
86
90
  VPN_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/vpn-template.yaml")
87
- POOL_CONFIG_TEMPLATE = resource_path("kalavai_client/assets/pool_config_template.yaml")
88
- POOL_CONFIG_DEFAULT_VALUES = resource_path("kalavai_client/assets/pool_config_values.yaml")
89
- USER_WORKSPACE_TEMPLATE = resource_path("kalavai_client/assets/user_workspace.yaml")
90
- DEFAULT_USER_WORKSPACE_VALUES = resource_path("kalavai_client/assets/user_workspace_values.yaml")
91
91
  STORAGE_CLASS_NAME = "local-path"
92
92
  STORAGE_ACCESS_MODE = ["ReadWriteOnce"]
93
- STORAGE_CLASS_LABEL = "kalavai.storage.enabled"
94
93
  DEFAULT_STORAGE_NAME = "pool-cache"
95
94
  DEFAULT_STORAGE_SIZE = 20
96
- DEFAULT_WATCHER_PORT = 30001
97
- USER_NODE_LABEL = "kalavai.cluster.user"
98
- KUBE_VERSION = os.getenv("KALAVAI_KUBE_VERSION", "v1.31.1+k3s1")
99
- DEFAULT_FLANNEL_IFACE = os.getenv("KALAVAI_FLANNEL_IFACE", "netmaker-1")
100
- FORBIDEDEN_IPS = ["127.0.0.1"]
101
- # kalavai templates
102
- HELM_APPS_FILE = resource_path("kalavai_client/assets/apps.yaml")
103
- HELM_APPS_VALUES = resource_path("kalavai_client/assets/apps_values.yaml")
104
- # user specific config files
105
- DEFAULT_CONTAINER_NAME = "kalavai"
106
- DEFAULT_VPN_CONTAINER_NAME = "kalavai-vpn"
107
- CONTAINER_HOST_PATH = user_path("pool/", create_path=True)
108
- USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
109
- USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
110
- USER_HELM_APPS_FILE = user_path("apps.yaml")
111
- USER_KUBECONFIG_FILE = user_path("kubeconfig")
112
- USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
113
-
114
95
 
96
+
115
97
  console = Console()
116
- CLUSTER = dockerCluster(
117
- container_name=DEFAULT_CONTAINER_NAME,
118
- kube_version=KUBE_VERSION,
119
- flannel_iface=DEFAULT_FLANNEL_IFACE,
120
- compose_file=USER_COMPOSE_FILE,
121
- kubeconfig_file=USER_KUBECONFIG_FILE,
122
- poolconfig_file=USER_LOCAL_SERVER_FILE,
123
- dependencies_file=USER_HELM_APPS_FILE
124
- )
125
98
 
126
99
 
127
100
  ######################
128
101
  ## HELPER FUNCTIONS ##
129
102
  ######################
130
103
 
131
- def check_seed_compatibility():
132
- """Check required packages to start pools"""
133
- logs = []
134
- console.log("[white]Checking system requirements...")
135
- # docker
136
- try:
137
- run_cmd("docker version >/dev/null 2>&1")
138
- except:
139
- logs.append("[red]Docker not installed. Install instructions:\n")
140
- logs.append(" Linux: https://docs.docker.com/engine/install/\n")
141
- logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
142
-
143
- if len(logs) == 0:
144
- console.log("[green]System is ready to start a pool")
145
- return True
146
- else:
147
- for log in logs:
148
- console.log(log)
149
- return False
150
-
151
- def check_worker_compatibility():
152
- """Check required packages to join pools"""
153
- logs = []
154
- console.log("[white]Checking system requirements...")
155
- # docker
156
- try:
157
- run_cmd("docker version >/dev/null 2>&1")
158
- except:
159
- logs.append("[red]Docker not installed. Install instructions:\n")
160
- logs.append(" Linux: https://docs.docker.com/engine/install/\n")
161
- logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
162
-
163
- if len(logs) == 0:
164
- console.log("[green]System is ready to join a pool")
165
- return True
166
- else:
167
- for log in logs:
168
- console.log(log)
169
- return False
170
-
171
104
 
172
105
  def cleanup_local():
173
106
  console.log("Removing local cache files...")
@@ -178,6 +111,7 @@ def cleanup_local():
178
111
  safe_remove(USER_KUBECONFIG_FILE)
179
112
  safe_remove(USER_LOCAL_SERVER_FILE)
180
113
  safe_remove(USER_TEMPLATES_FOLDER)
114
+ safe_remove(USER_GUI_COMPOSE_FILE)
181
115
 
182
116
  def pre_join_check(node_name, server_url, server_key):
183
117
  # check with the server that we can connect
@@ -217,75 +151,9 @@ def set_schedulable(schedulable, node_name=load_server_info(data_key=NODE_NAME_K
217
151
  except Exception as e:
218
152
  console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
219
153
 
220
-
221
- def init_user_workspace(force_namespace=None):
222
-
223
- # load template config and populate with values
224
- sidecar_template_yaml = load_template(
225
- template_path=USER_WORKSPACE_TEMPLATE,
226
- values={},
227
- default_values_path=DEFAULT_USER_WORKSPACE_VALUES)
228
-
229
- try:
230
- data = {"config": sidecar_template_yaml}
231
- if force_namespace is not None:
232
- data["force_namespace"] = force_namespace
233
- result = request_to_server(
234
- method="post",
235
- endpoint="/v1/create_user_space",
236
- data=data,
237
- server_creds=USER_LOCAL_SERVER_FILE,
238
- user_cookie=USER_COOKIE
239
- )
240
- console.log(f"Workspace creation (ignore already created warnings): {result}" )
241
- except Exception as e:
242
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
243
-
244
- def pool_init(pool_config_values_path=None):
245
- """Deploy configured objects to initialise pool"""
246
- if pool_config_values_path is None:
247
- return
248
-
249
- # load template config and populate with values
250
- sidecar_template_yaml = load_template(
251
- template_path=POOL_CONFIG_TEMPLATE,
252
- values={},
253
- default_values_path=pool_config_values_path)
254
-
255
- try:
256
- result = request_to_server(
257
- method="post",
258
- endpoint="/v1/deploy_generic_model",
259
- data={"config": sidecar_template_yaml},
260
- server_creds=USER_LOCAL_SERVER_FILE,
261
- user_cookie=USER_COOKIE
262
- )
263
- if 'failed' in result and len(result['failed']) > 0:
264
- console.log(f"[red]Error when deploying pool config\n\n{result['failed']}")
265
- if len(result['successful']) > 0:
266
- console.log(f"[green]Deployed pool config!")
267
- except Exception as e:
268
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
269
-
270
154
  def select_ip_address(subnet=None):
271
- ips = []
272
- retry = 3
273
- while len(ips) == 0:
274
- for iface in ni.interfaces():
275
- try:
276
- ip = ni.ifaddresses(iface)[ni.AF_INET][0]['addr']
277
- if ip in FORBIDEDEN_IPS:
278
- continue
279
- if subnet is None or ipaddress.ip_address(ip) in ipaddress.ip_network(subnet):
280
- ips.append(ip)
281
- except:
282
- pass
283
- if len(ips) == 1:
284
- return ips[0]
285
- time.sleep(2)
286
- retry -= 1
287
- if retry < 0:
288
- raise ValueError(f"No IPs available on subnet {subnet}")
155
+ ips = get_ip_addresses(subnet=subnet)
156
+
289
157
  while True:
290
158
  option = user_confirm(
291
159
  question="Select IP to advertise the node (needs to be visible to other nodes)",
@@ -336,50 +204,50 @@ def select_token_type():
336
204
  break
337
205
  return {"admin": choice == 0, "user": choice == 1, "worker": choice == 2}
338
206
 
339
- def generate_compose_config(role, node_name, is_public, use_gpus=True, node_labels=None, pool_ip=None, vpn_token=None, pool_token=None):
207
+ def input_gpus():
340
208
  num_gpus = 0
341
- if use_gpus:
342
- try:
343
- has_gpus = check_gpu_drivers()
344
- if has_gpus:
345
- max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
346
- num_gpus = user_confirm(
347
- question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
348
- options=range(max_gpus+1)
349
- )
350
- except:
351
- console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
352
- if node_labels is not None:
353
- node_labels = " ".join([f"--node-label {key}={value}" for key, value in node_labels.items()])
354
- compose_values = {
355
- "user_path": user_path(""),
356
- "service_name": DEFAULT_CONTAINER_NAME,
357
- "vpn": is_public,
358
- "vpn_name": DEFAULT_VPN_CONTAINER_NAME,
359
- "pool_ip": pool_ip,
360
- "pool_token": pool_token,
361
- "vpn_token": vpn_token,
362
- "node_name": node_name,
363
- "command": role,
364
- "storage_enabled": "True",
365
- "num_gpus": num_gpus,
366
- "k3s_path": f"{CONTAINER_HOST_PATH}/k3s",
367
- "etc_path": f"{CONTAINER_HOST_PATH}/etc",
368
- "node_labels": node_labels,
369
- "flannel_iface": DEFAULT_FLANNEL_IFACE if is_public else ""
370
- }
371
- # generate local config files
372
- compose_yaml = load_template(
373
- template_path=DOCKER_COMPOSE_TEMPLATE,
374
- values=compose_values)
375
- with open(USER_COMPOSE_FILE, "w") as f:
376
- f.write(compose_yaml)
377
- return compose_yaml
209
+ try:
210
+ has_gpus = check_gpu_drivers()
211
+ if has_gpus:
212
+ max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
213
+ num_gpus = user_confirm(
214
+ question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
215
+ options=range(max_gpus+1)
216
+ )
217
+ except:
218
+ console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
219
+ return num_gpus
378
220
 
379
221
  ##################
380
222
  ## CLI COMMANDS ##
381
223
  ##################
382
224
 
225
+ @arguably.command
226
+ def gui__start(*others, gui_port=3000, backend_port=8000):
227
+ """Run GUI"""
228
+ values = {
229
+ "path": user_path(""),
230
+ "gui_port": gui_port,
231
+ "backend_port": backend_port
232
+ }
233
+ compose_yaml = load_template(
234
+ template_path=DOCKER_COMPOSE_GUI,
235
+ values=values)
236
+ with open(USER_GUI_COMPOSE_FILE, "w") as f:
237
+ f.write(compose_yaml)
238
+
239
+ run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} up -d")
240
+
241
+ console.log(f"[green]Loading GUI, may take a few minutes. It will be available at http://localhost:{gui_port}")
242
+
243
+ @arguably.command
244
+ def gui__stop(*others):
245
+ """Stop GUI"""
246
+ run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} down")
247
+
248
+ console.log("[green]Kalavai GUI has been stopped")
249
+
250
+
383
251
  @arguably.command
384
252
  def login(*others, username: str=None):
385
253
  """
@@ -461,8 +329,9 @@ def pool__publish(*others, description=None):
461
329
  description = description
462
330
 
463
331
  try:
464
- if not pool__check_token(token=token, public=True):
465
- raise ValueError("[red]Cluster must be started with a valid vpn_location to publish")
332
+ valid = check_token(token=token, public=True)
333
+ if "error" in valid:
334
+ raise ValueError(f"[red]Cluster must be started with a valid vpn_location to publish: {valid}")
466
335
  cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
467
336
 
468
337
  register_cluster(
@@ -523,7 +392,7 @@ def pool__list(*others, user_only=False):
523
392
 
524
393
 
525
394
  @arguably.command
526
- def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=HELM_APPS_VALUES, pool_config_values: str=POOL_CONFIG_DEFAULT_VALUES):
395
+ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=None, pool_config_values: str=None):
527
396
  """
528
397
  Start Kalavai pool and start/resume sharing resources.
529
398
 
@@ -531,9 +400,6 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
531
400
  *others: all the other positional arguments go here
532
401
  """
533
402
 
534
- if not check_seed_compatibility():
535
- return
536
-
537
403
  if CLUSTER.is_cluster_init():
538
404
  console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
539
405
  return
@@ -547,127 +413,25 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
547
413
  console.log("Installation was cancelled and did not complete.")
548
414
  return
549
415
 
550
- # if only registered users are allowed, check user has logged in
551
- user = defaultdict(lambda: None)
552
- if only_registered_users or location is not None:
553
- user = user_login(user_cookie=USER_COOKIE)
554
- if user is None:
555
- console.log("[white]--only-registered-users [red]or [white]--location[red] can only be used if the host is authenticated. Run [yellow]kalavai login[red] to authenticate")
556
- exit()
557
-
558
- # join private network if provided
559
- vpn = defaultdict(lambda: None)
560
- node_labels = {
561
- STORAGE_CLASS_LABEL: is_storage_compatible()
562
- }
563
- if location is not None:
564
- console.log("Fetching VPN credentials")
565
- try:
566
- vpn = get_vpn_details(
567
- location=location,
568
- user_cookie=USER_COOKIE)
569
- node_labels[USER_NODE_LABEL] = user["username"]
570
- except Exception as e:
571
- console.log(f"[red]Error when joining network: {str(e)}")
572
- return
573
-
574
- # Generate docker compose recipe
575
- generate_compose_config(
576
- role="server",
577
- vpn_token=vpn["key"],
578
- node_name=socket.gethostname(),
579
- node_labels=node_labels,
580
- is_public=location is not None
581
- )
582
-
583
- # start server
584
- console.log("Deploying seed...")
585
- CLUSTER.start_seed_node()
586
-
587
- while not CLUSTER.is_agent_running():
588
- console.log("Waiting for seed to start...")
589
- time.sleep(10)
590
-
591
416
  # select IP address (for external discovery)
592
417
  if ip_address is None and location is None:
593
418
  # local IP
594
419
  console.log(f"Scanning for valid IPs")
595
420
  ip_address = select_ip_address()
596
- else:
597
- # load VPN ip
598
- ip_address = CLUSTER.get_vpn_ip()
421
+
599
422
  console.log(f"Using {ip_address} address for server")
600
423
 
601
- # populate local cred files
602
- auth_key = str(uuid.uuid4())
603
- write_auth_key = str(uuid.uuid4())
604
- readonly_auth_key = str(uuid.uuid4())
605
-
606
- watcher_service = f"{ip_address}:{DEFAULT_WATCHER_PORT}"
607
- values = {
608
- CLUSTER_NAME_KEY: cluster_name,
609
- CLUSTER_IP_KEY: ip_address,
610
- AUTH_KEY: auth_key,
611
- READONLY_AUTH_KEY: readonly_auth_key,
612
- WRITE_AUTH_KEY: write_auth_key,
613
- WATCHER_PORT_KEY: DEFAULT_WATCHER_PORT,
614
- WATCHER_SERVICE_KEY: watcher_service,
615
- USER_NODE_LABEL_KEY: USER_NODE_LABEL,
616
- ALLOW_UNREGISTERED_USER_KEY: not only_registered_users
617
- }
424
+ console.log(f"[green]Creating {cluster_name} pool, this may take a few minutes...")
618
425
 
619
- store_server_info(
620
- server_ip=ip_address,
621
- auth_key=auth_key,
622
- readonly_auth_key=readonly_auth_key,
623
- write_auth_key=write_auth_key,
624
- file=USER_LOCAL_SERVER_FILE,
625
- watcher_service=watcher_service,
626
- node_name=socket.gethostname(),
426
+ create_pool(
627
427
  cluster_name=cluster_name,
628
- public_location=location,
629
- user_api_key=user["api_key"])
630
-
631
- # Generate helmfile recipe
632
- helm_yaml = load_template(
633
- template_path=HELM_APPS_FILE,
634
- values=values,
635
- default_values_path=app_values,
636
- force_defaults=True)
637
- with open(USER_HELM_APPS_FILE, "w") as f:
638
- f.write(helm_yaml)
639
-
640
- console.log("[green]Config files have been generated in your local machine\n")
641
-
642
- console.log("Setting pool dependencies...")
643
- # set template values in helmfile
644
- try:
645
- CLUSTER.update_dependencies(
646
- dependencies_file=USER_HELM_APPS_FILE
647
- )
648
- except Exception as e:
649
- console.log(f"Error: {str(e)}")
650
- exit()
651
- console.log("[green]Your pool is ready! Grow it by sharing your joining token with others. Run [yellow]kalavai pool token[green] to generate one.")
652
-
653
- if location is not None:
654
- # register with kalavai if it's a public cluster
655
- console.log("Registering public cluster with Kalavai...")
656
- pool__publish()
657
-
658
- # wait until the server is ready to create objects
659
- while True:
660
- console.log("Waiting for core services to be ready, may take a few minutes...")
661
- time.sleep(30)
662
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
663
- break
664
- console.log("Initialise user workspace...")
665
- pool_init(pool_config_values_path=pool_config_values)
666
- # init default namespace
667
- init_user_workspace(force_namespace="default")
668
- if only_registered_users:
669
- # init user namespace
670
- init_user_workspace()
428
+ ip_address=ip_address,
429
+ app_values=app_values,
430
+ pool_config_values=pool_config_values,
431
+ num_gpus=input_gpus(),
432
+ only_registered_users=only_registered_users,
433
+ location=location
434
+ )
671
435
 
672
436
  return None
673
437
 
@@ -720,20 +484,13 @@ def pool__check_token(token, *others, public=False):
720
484
  """
721
485
  Utility to check the validity of a join token
722
486
  """
723
- try:
724
- data = decode_dict(token)
725
- for field in MANDATORY_TOKEN_FIELDS:
726
- assert field in data
727
- if public:
728
- if data[PUBLIC_LOCATION_KEY] is None:
729
- raise ValueError("Token is not valid for public pools. Did you start the cluster with a public_location?")
730
- console.log("[green]Token format is correct")
731
- return True
732
- except Exception as e:
733
- console.log(f"[white]{str(e)}")
734
- console.log("[red]Token is invalid.")
487
+ result = check_token(token=token, public=public)
488
+ if "error" in result:
489
+ console.log(f"[red]Error in token: {result}")
735
490
  return False
736
-
491
+
492
+ console.log("[green]Token format is correct")
493
+ return True
737
494
 
738
495
  @arguably.command
739
496
  def pool__join(token, *others, node_name=None):
@@ -743,9 +500,6 @@ def pool__join(token, *others, node_name=None):
743
500
  Args:
744
501
  *others: all the other positional arguments go here
745
502
  """
746
-
747
- if not check_worker_compatibility():
748
- return
749
503
 
750
504
  # check that k3s is not running already in the host
751
505
  # k3s service running or preinstalled
@@ -763,119 +517,26 @@ def pool__join(token, *others, node_name=None):
763
517
  console.log("[green]Nothing happened.")
764
518
  return
765
519
 
766
- if node_name is None:
767
- node_name = socket.gethostname()
768
-
769
- # check token
770
- if not pool__check_token(token):
771
- return
772
-
773
- try:
774
- data = decode_dict(token)
775
- kalavai_seed_ip = data[CLUSTER_IP_KEY]
776
- kalavai_token = data[CLUSTER_TOKEN_KEY]
777
- cluster_name = data[CLUSTER_NAME_KEY]
778
- auth_key = data[AUTH_KEY]
779
- watcher_service = data[WATCHER_SERVICE_KEY]
780
- public_location = data[PUBLIC_LOCATION_KEY]
781
- vpn = defaultdict(lambda: None)
782
- except Exception as e:
783
- console.log(str(e))
784
- console.log("[red] Invalid token")
785
- return
786
-
787
- # join private network if provided
788
- node_labels = {
789
- STORAGE_CLASS_LABEL: is_storage_compatible()
790
- }
791
- user = defaultdict(lambda: None)
792
- if public_location is not None:
793
- user = user_login(user_cookie=USER_COOKIE)
794
- if user is None:
795
- console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
796
- exit()
797
- console.log("Fetching VPN credentials")
798
- try:
799
- vpn = get_vpn_details(
800
- location=public_location,
801
- user_cookie=USER_COOKIE)
802
- node_labels[USER_NODE_LABEL] = user["username"]
803
- except Exception as e:
804
- console.log(f"[red]Error when joining network: {str(e)}")
805
- console.log("Are you authenticated? Try [yellow]kalavai login")
806
- return
807
- try:
808
- validate_join_public_seed(
809
- cluster_name=cluster_name,
810
- join_key=token,
811
- user_cookie=USER_COOKIE
812
- )
813
- except Exception as e:
814
- console.log(f"[red]Error when joining network: {str(e)}")
815
- return
816
-
817
- # local agent join
818
- # 1. Generate local cache files
819
- console.log("Generating config files...")
820
-
821
- # Generate docker compose recipe
822
- generate_compose_config(
823
- role="agent",
824
- pool_ip=f"https://{kalavai_seed_ip}:6443",
825
- pool_token=kalavai_token,
826
- vpn_token=vpn["key"],
827
- node_name=node_name,
828
- node_labels=node_labels,
829
- is_public=public_location is not None)
830
-
831
- store_server_info(
832
- server_ip=kalavai_seed_ip,
833
- auth_key=auth_key,
834
- file=USER_LOCAL_SERVER_FILE,
835
- watcher_service=watcher_service,
836
- node_name=node_name,
837
- cluster_name=cluster_name,
838
- public_location=public_location,
839
- user_api_key=user["api_key"])
520
+ num_gpus = input_gpus()
840
521
 
841
522
  option = user_confirm(
842
523
  question="Docker compose ready. Would you like Kalavai to deploy it?",
843
524
  options=["no", "yes"]
844
525
  )
845
526
  if option == 0:
846
- console.log("Manually deploy the worker with the following command:\n")
847
- print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
527
+ console.log("[red]Installation aborted")
848
528
  return
849
529
 
850
- console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
851
- try:
852
- CLUSTER.start_worker_node()
853
- except Exception as e:
854
- console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
855
- pool__stop()
856
- exit()
857
-
858
- # ensure we are connected
859
- while True:
860
- console.log("Waiting for core services to be ready, may take a few minutes...")
861
- time.sleep(30)
862
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
863
- break
864
-
865
- # check the node has connected successfully
866
- try:
867
- while not CLUSTER.is_agent_running():
868
- console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
869
- time.sleep(30)
870
- except KeyboardInterrupt:
871
- console.log("[red]Installation aborted. Leaving pool.")
872
- pool__stop()
873
- return
874
-
875
- init_user_workspace()
876
-
877
- # set status to schedulable
878
- console.log(f"[green] You are connected to {cluster_name}")
530
+ console.log("Connecting worker to the pool...")
531
+ result = join_pool(
532
+ token=token,
533
+ node_name=node_name,
534
+ num_gpus=num_gpus
535
+ )
536
+ if "error" in result:
537
+ console.log(f"[red]Error when connecting: {result}")
538
+ else:
539
+ console.log(f"[green] You are connected to {result}")
879
540
 
880
541
  @arguably.command
881
542
  def pool__stop(*others, skip_node_deletion=False):
@@ -929,11 +590,12 @@ def pool__pause(*others):
929
590
  """
930
591
  # k3s stop locally
931
592
  console.log("[white] Pausing kalavai app...")
932
- success = CLUSTER.pause_agent()
933
- if success:
934
- console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
593
+ success = pause_agent()
594
+ if "error" in success:
595
+ console.log(f"[red] Error when stopping. {success['error']}")
935
596
  else:
936
- console.log("[red] Error when stopping. Please run [yellow]kalavai pool pause[red] again.")
597
+ console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
598
+
937
599
 
938
600
  @arguably.command
939
601
  def pool__resume(*others):
@@ -948,10 +610,12 @@ def pool__resume(*others):
948
610
  console.log("[red] Kalavai app was not started before, please run [yellow]kalavai pool start[red] to start a pool or [yellow]kalavai pool join[red] to join one first")
949
611
  return
950
612
  console.log("[white] Restarting sharing (may take a few minutes)...")
951
- if CLUSTER.restart_agent():
952
- console.log("[white] Kalava sharing resumed")
613
+ success = resume_agent()
614
+ if "error" in success:
615
+ console.log(f"[red] Error when restarting. {success['error']}")
953
616
  else:
954
- console.log("[red] Error when restarting. Please run [yellow]kalavai pool resume[white] again.")
617
+ console.log("[white] Kalava sharing resumed")
618
+
955
619
 
956
620
 
957
621
  @arguably.command
@@ -1090,7 +754,7 @@ def pool__attach(token, *others, node_name=None):
1090
754
  """
1091
755
 
1092
756
  if node_name is None:
1093
- node_name = socket.gethostname()
757
+ node_name = f"{socket.gethostname()}-{uuid.uuid4().hex[:6]}"
1094
758
 
1095
759
  # check that is not attached to another instance
1096
760
  if os.path.exists(USER_LOCAL_SERVER_FILE):
@@ -1102,70 +766,6 @@ def pool__attach(token, *others, node_name=None):
1102
766
  console.log("[green]Nothing happened.")
1103
767
  return
1104
768
 
1105
- # check token
1106
- if not pool__check_token(token):
1107
- return
1108
-
1109
- try:
1110
- data = decode_dict(token)
1111
- kalavai_seed_ip = data[CLUSTER_IP_KEY]
1112
- cluster_name = data[CLUSTER_NAME_KEY]
1113
- auth_key = data[AUTH_KEY]
1114
- watcher_service = data[WATCHER_SERVICE_KEY]
1115
- public_location = data[PUBLIC_LOCATION_KEY]
1116
- vpn = defaultdict(lambda: None)
1117
- except Exception as e:
1118
- console.log(str(e))
1119
- console.log("[red] Invalid token")
1120
- return
1121
-
1122
- user = defaultdict(lambda: None)
1123
- if public_location is not None:
1124
- user = user_login(user_cookie=USER_COOKIE)
1125
- if user is None:
1126
- console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
1127
- exit()
1128
- console.log("Fetching VPN credentials")
1129
- try:
1130
- vpn = get_vpn_details(
1131
- location=public_location,
1132
- user_cookie=USER_COOKIE)
1133
- except Exception as e:
1134
- console.log(f"[red]Error when joining network: {str(e)}")
1135
- console.log("Are you authenticated? Try [yellow]kalavai login")
1136
- return
1137
- try:
1138
- validate_join_public_seed(
1139
- cluster_name=cluster_name,
1140
- join_key=token,
1141
- user_cookie=USER_COOKIE
1142
- )
1143
- except Exception as e:
1144
- console.log(f"[red]Error when joining network: {str(e)}")
1145
- return
1146
-
1147
- # local agent join
1148
- # 1. Generate local cache files
1149
- console.log("Generating config files...")
1150
-
1151
- # Generate docker compose recipe
1152
- generate_compose_config(
1153
- use_gpus=False,
1154
- role="",
1155
- vpn_token=vpn["key"],
1156
- node_name=node_name,
1157
- is_public=public_location is not None)
1158
-
1159
- store_server_info(
1160
- server_ip=kalavai_seed_ip,
1161
- auth_key=auth_key,
1162
- file=USER_LOCAL_SERVER_FILE,
1163
- watcher_service=watcher_service,
1164
- node_name=node_name,
1165
- cluster_name=cluster_name,
1166
- public_location=public_location,
1167
- user_api_key=user["api_key"])
1168
-
1169
769
  option = user_confirm(
1170
770
  question="Docker compose ready. Would you like Kalavai to deploy it?",
1171
771
  options=["no", "yes"]
@@ -1175,17 +775,13 @@ def pool__attach(token, *others, node_name=None):
1175
775
  print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
1176
776
  return
1177
777
 
1178
- console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
1179
- run_cmd(f"docker compose -f {USER_COMPOSE_FILE} up -d")
1180
- # ensure we are connected
1181
- while True:
1182
- console.log("Waiting for core services to be ready, may take a few minutes...")
1183
- time.sleep(30)
1184
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
1185
- break
778
+ result = attach_to_pool(token=token, node_name=node_name)
1186
779
 
780
+ if "error" in result:
781
+ console.log(f"[red]Error when attaching to pool: {result}")
782
+ return
1187
783
  # set status to schedulable
1188
- console.log(f"[green] You are connected to {cluster_name}")
784
+ console.log(f"[green] You are connected to {result}")
1189
785
 
1190
786
 
1191
787
  @arguably.command
@@ -1409,18 +1005,13 @@ def job__templates(*others):
1409
1005
  console.log(f"[red]Problems with your pool: {str(e)}")
1410
1006
  return
1411
1007
 
1412
- try:
1413
- result = request_to_server(
1414
- method="get",
1415
- endpoint="/v1/get_job_templates",
1416
- server_creds=USER_LOCAL_SERVER_FILE,
1417
- data=None,
1418
- user_cookie=USER_COOKIE
1419
- )
1420
- console.log("Templates available in the pool")
1421
- console.log(result)
1422
- except Exception as e:
1423
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1008
+ templates = fetch_job_templates()
1009
+ if "error" in templates:
1010
+ console.log(f"[red]Error when fetching templates: {str(e)}")
1011
+ return
1012
+
1013
+ console.log("Templates available in the pool")
1014
+ console.log(templates)
1424
1015
 
1425
1016
 
1426
1017
  @arguably.command
@@ -1476,26 +1067,16 @@ def job__run(template_name, *others, values: str=None, force_namespace: str=None
1476
1067
  annotation_key="nvidia.com/nouse-gputype"
1477
1068
  )
1478
1069
 
1479
- # deploy template with kube-watcher
1480
- data = {
1481
- "template": template_name,
1482
- "template_values": values_dict
1483
- }
1484
- if force_namespace is not None:
1485
- data["force_namespace"] = force_namespace
1070
+ result = deploy_job(
1071
+ template_name=template_name,
1072
+ values_dict=values_dict,
1073
+ force_namespace=force_namespace
1074
+ )
1486
1075
 
1487
- try:
1488
- result = request_to_server(
1489
- method="post",
1490
- endpoint="/v1/deploy_job",
1491
- data=data,
1492
- server_creds=USER_LOCAL_SERVER_FILE,
1493
- user_cookie=USER_COOKIE
1494
- )
1076
+ if "error" in result:
1077
+ console.log(f"[red]Error when deploying job: {str(e)}")
1078
+ else:
1495
1079
  console.log(f"[green]{template_name} job deployed")
1496
- except Exception as e:
1497
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1498
- return
1499
1080
 
1500
1081
  @arguably.command
1501
1082
  def job__test(local_template_dir, *others, values, defaults, force_namespace: str=None):
@@ -1568,22 +1149,12 @@ def job__defaults(template_name, *others):
1568
1149
  return
1569
1150
 
1570
1151
  # deploy template with kube-watcher
1571
- data = {
1572
- "template": template_name
1573
- }
1574
- try:
1575
- result = request_to_server(
1576
- method="get",
1577
- endpoint="/v1/job_defaults",
1578
- data=data,
1579
- server_creds=USER_LOCAL_SERVER_FILE,
1580
- user_cookie=USER_COOKIE
1581
- )
1582
- print(
1583
- json.dumps(result,indent=3)
1584
- )
1585
- except Exception as e:
1586
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1152
+ defaults = fetch_job_defaults(name=template_name)
1153
+ if "error" in defaults:
1154
+ console.log(f"[red]Error when fetching job defaults: {defaults}")
1155
+ print(
1156
+ json.dumps(defaults, indent=3)
1157
+ )
1587
1158
 
1588
1159
 
1589
1160
  @arguably.command
@@ -1601,23 +1172,11 @@ def job__delete(name, *others, force_namespace: str=None):
1601
1172
  console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1602
1173
 
1603
1174
  # deploy template with kube-watcher
1604
- data = {
1605
- "label": TEMPLATE_LABEL, # this ensures that both lws template and services are deleted
1606
- "value": name
1607
- }
1608
- if force_namespace is not None:
1609
- data["force_namespace"] = force_namespace
1610
- try:
1611
- result = request_to_server(
1612
- method="post",
1613
- endpoint="/v1/delete_labeled_resources",
1614
- data=data,
1615
- server_creds=USER_LOCAL_SERVER_FILE,
1616
- user_cookie=USER_COOKIE
1617
- )
1175
+ result = delete_job(name=name, force_namespace=force_namespace)
1176
+ if "error" in result:
1177
+ console.log(f"[red]Error when deleting job: {str(e)}")
1178
+ else:
1618
1179
  console.log(f"{result}")
1619
- except Exception as e:
1620
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1621
1180
 
1622
1181
 
1623
1182
  @arguably.command