kalavai-client 0.5.14__py3-none-any.whl → 0.5.16__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
kalavai_client/cli.py CHANGED
@@ -7,19 +7,30 @@ import time
7
7
  import socket
8
8
  from pathlib import Path
9
9
  from getpass import getpass
10
- import ipaddress
11
10
  from sys import exit
12
11
 
13
12
  import yaml
14
- import netifaces as ni
13
+
15
14
  import arguably
16
15
  from rich.console import Console
17
16
 
17
+ from kalavai_client.cluster import CLUSTER
18
18
  from kalavai_client.env import (
19
19
  USER_COOKIE,
20
20
  USER_LOCAL_SERVER_FILE,
21
21
  TEMPLATE_LABEL,
22
- user_path
22
+ KALAVAI_PLATFORM_URL,
23
+ DEFAULT_VPN_CONTAINER_NAME,
24
+ CONTAINER_HOST_PATH,
25
+ USER_COMPOSE_FILE,
26
+ USER_HELM_APPS_FILE,
27
+ USER_KUBECONFIG_FILE,
28
+ USER_VPN_COMPOSE_FILE,
29
+ USER_TEMPLATES_FOLDER,
30
+ DOCKER_COMPOSE_GUI,
31
+ USER_GUI_COMPOSE_FILE,
32
+ user_path,
33
+ resource_path,
23
34
  )
24
35
  from kalavai_client.core import (
25
36
  fetch_resources,
@@ -28,22 +39,29 @@ from kalavai_client.core import (
28
39
  fetch_devices,
29
40
  fetch_job_logs,
30
41
  fetch_gpus,
31
- load_gpu_models
42
+ load_gpu_models,
43
+ fetch_job_templates,
44
+ fetch_job_defaults,
45
+ deploy_job,
46
+ delete_job,
47
+ check_token,
48
+ attach_to_pool,
49
+ join_pool,
50
+ create_pool,
51
+ get_ip_addresses,
52
+ pause_agent,
53
+ resume_agent
32
54
  )
33
55
  from kalavai_client.utils import (
34
56
  check_gpu_drivers,
57
+ load_template,
35
58
  run_cmd,
36
- decode_dict,
37
59
  generate_join_token,
38
60
  user_confirm,
39
- load_template,
40
- store_server_info,
41
61
  generate_table,
42
62
  request_to_server,
43
- resource_path,
44
63
  safe_remove,
45
64
  leave_vpn,
46
- get_vpn_details,
47
65
  load_server_info,
48
66
  user_login,
49
67
  user_logout,
@@ -51,9 +69,6 @@ from kalavai_client.utils import (
51
69
  register_cluster,
52
70
  unregister_cluster,
53
71
  get_public_seeds,
54
- validate_join_public_seed,
55
- is_storage_compatible,
56
- is_watcher_alive,
57
72
  load_user_session,
58
73
  SERVER_IP_KEY,
59
74
  AUTH_KEY,
@@ -62,112 +77,30 @@ from kalavai_client.utils import (
62
77
  WRITE_AUTH_KEY,
63
78
  PUBLIC_LOCATION_KEY,
64
79
  NODE_NAME_KEY,
65
- CLUSTER_NAME_KEY,
66
- CLUSTER_IP_KEY,
67
- CLUSTER_TOKEN_KEY,
68
- WATCHER_PORT_KEY,
69
- MANDATORY_TOKEN_FIELDS,
70
- USER_NODE_LABEL_KEY,
71
- ALLOW_UNREGISTERED_USER_KEY
72
- )
73
- from kalavai_client.cluster import (
74
- dockerCluster
80
+ CLUSTER_NAME_KEY
75
81
  )
76
82
 
77
83
 
78
- KALAVAI_PLATFORM_URL = os.getenv("KALAVAI_PLATFORM_URL", "https://platform.kalavai.net")
79
84
  LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
80
85
  VERSION = 1
81
86
  RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
82
87
  CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
83
88
  RAY_LABEL = "kalavai.ray.name"
84
89
  PVC_NAME_LABEL = "kalavai.storage.name"
85
- DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
86
90
  VPN_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/vpn-template.yaml")
87
- POOL_CONFIG_TEMPLATE = resource_path("kalavai_client/assets/pool_config_template.yaml")
88
- POOL_CONFIG_DEFAULT_VALUES = resource_path("kalavai_client/assets/pool_config_values.yaml")
89
- USER_WORKSPACE_TEMPLATE = resource_path("kalavai_client/assets/user_workspace.yaml")
90
- DEFAULT_USER_WORKSPACE_VALUES = resource_path("kalavai_client/assets/user_workspace_values.yaml")
91
91
  STORAGE_CLASS_NAME = "local-path"
92
92
  STORAGE_ACCESS_MODE = ["ReadWriteOnce"]
93
- STORAGE_CLASS_LABEL = "kalavai.storage.enabled"
94
93
  DEFAULT_STORAGE_NAME = "pool-cache"
95
94
  DEFAULT_STORAGE_SIZE = 20
96
- DEFAULT_WATCHER_PORT = 30001
97
- USER_NODE_LABEL = "kalavai.cluster.user"
98
- KUBE_VERSION = os.getenv("KALAVAI_KUBE_VERSION", "v1.31.1+k3s1")
99
- DEFAULT_FLANNEL_IFACE = os.getenv("KALAVAI_FLANNEL_IFACE", "netmaker-1")
100
- FORBIDEDEN_IPS = ["127.0.0.1"]
101
- # kalavai templates
102
- HELM_APPS_FILE = resource_path("kalavai_client/assets/apps.yaml")
103
- HELM_APPS_VALUES = resource_path("kalavai_client/assets/apps_values.yaml")
104
- # user specific config files
105
- DEFAULT_CONTAINER_NAME = "kalavai"
106
- DEFAULT_VPN_CONTAINER_NAME = "kalavai-vpn"
107
- CONTAINER_HOST_PATH = user_path("pool/", create_path=True)
108
- USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
109
- USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
110
- USER_HELM_APPS_FILE = user_path("apps.yaml")
111
- USER_KUBECONFIG_FILE = user_path("kubeconfig")
112
- USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
113
-
114
95
 
96
+
115
97
  console = Console()
116
- CLUSTER = dockerCluster(
117
- container_name=DEFAULT_CONTAINER_NAME,
118
- kube_version=KUBE_VERSION,
119
- flannel_iface=DEFAULT_FLANNEL_IFACE,
120
- compose_file=USER_COMPOSE_FILE,
121
- kubeconfig_file=USER_KUBECONFIG_FILE,
122
- poolconfig_file=USER_LOCAL_SERVER_FILE,
123
- dependencies_file=USER_HELM_APPS_FILE
124
- )
125
98
 
126
99
 
127
100
  ######################
128
101
  ## HELPER FUNCTIONS ##
129
102
  ######################
130
103
 
131
- def check_seed_compatibility():
132
- """Check required packages to start pools"""
133
- logs = []
134
- console.log("[white]Checking system requirements...")
135
- # docker
136
- try:
137
- run_cmd("docker version >/dev/null 2>&1")
138
- except:
139
- logs.append("[red]Docker not installed. Install instructions:\n")
140
- logs.append(" Linux: https://docs.docker.com/engine/install/\n")
141
- logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
142
-
143
- if len(logs) == 0:
144
- console.log("[green]System is ready to start a pool")
145
- return True
146
- else:
147
- for log in logs:
148
- console.log(log)
149
- return False
150
-
151
- def check_worker_compatibility():
152
- """Check required packages to join pools"""
153
- logs = []
154
- console.log("[white]Checking system requirements...")
155
- # docker
156
- try:
157
- run_cmd("docker version >/dev/null 2>&1")
158
- except:
159
- logs.append("[red]Docker not installed. Install instructions:\n")
160
- logs.append(" Linux: https://docs.docker.com/engine/install/\n")
161
- logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
162
-
163
- if len(logs) == 0:
164
- console.log("[green]System is ready to join a pool")
165
- return True
166
- else:
167
- for log in logs:
168
- console.log(log)
169
- return False
170
-
171
104
 
172
105
  def cleanup_local():
173
106
  console.log("Removing local cache files...")
@@ -178,6 +111,7 @@ def cleanup_local():
178
111
  safe_remove(USER_KUBECONFIG_FILE)
179
112
  safe_remove(USER_LOCAL_SERVER_FILE)
180
113
  safe_remove(USER_TEMPLATES_FOLDER)
114
+ safe_remove(USER_GUI_COMPOSE_FILE)
181
115
 
182
116
  def pre_join_check(node_name, server_url, server_key):
183
117
  # check with the server that we can connect
@@ -217,75 +151,9 @@ def set_schedulable(schedulable, node_name=load_server_info(data_key=NODE_NAME_K
217
151
  except Exception as e:
218
152
  console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
219
153
 
220
-
221
- def init_user_workspace(force_namespace=None):
222
-
223
- # load template config and populate with values
224
- sidecar_template_yaml = load_template(
225
- template_path=USER_WORKSPACE_TEMPLATE,
226
- values={},
227
- default_values_path=DEFAULT_USER_WORKSPACE_VALUES)
228
-
229
- try:
230
- data = {"config": sidecar_template_yaml}
231
- if force_namespace is not None:
232
- data["force_namespace"] = force_namespace
233
- result = request_to_server(
234
- method="post",
235
- endpoint="/v1/create_user_space",
236
- data=data,
237
- server_creds=USER_LOCAL_SERVER_FILE,
238
- user_cookie=USER_COOKIE
239
- )
240
- console.log(f"Workspace creation (ignore already created warnings): {result}" )
241
- except Exception as e:
242
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
243
-
244
- def pool_init(pool_config_values_path=None):
245
- """Deploy configured objects to initialise pool"""
246
- if pool_config_values_path is None:
247
- return
248
-
249
- # load template config and populate with values
250
- sidecar_template_yaml = load_template(
251
- template_path=POOL_CONFIG_TEMPLATE,
252
- values={},
253
- default_values_path=pool_config_values_path)
254
-
255
- try:
256
- result = request_to_server(
257
- method="post",
258
- endpoint="/v1/deploy_generic_model",
259
- data={"config": sidecar_template_yaml},
260
- server_creds=USER_LOCAL_SERVER_FILE,
261
- user_cookie=USER_COOKIE
262
- )
263
- if 'failed' in result and len(result['failed']) > 0:
264
- console.log(f"[red]Error when deploying pool config\n\n{result['failed']}")
265
- if len(result['successful']) > 0:
266
- console.log(f"[green]Deployed pool config!")
267
- except Exception as e:
268
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
269
-
270
154
  def select_ip_address(subnet=None):
271
- ips = []
272
- retry = 3
273
- while len(ips) == 0:
274
- for iface in ni.interfaces():
275
- try:
276
- ip = ni.ifaddresses(iface)[ni.AF_INET][0]['addr']
277
- if ip in FORBIDEDEN_IPS:
278
- continue
279
- if subnet is None or ipaddress.ip_address(ip) in ipaddress.ip_network(subnet):
280
- ips.append(ip)
281
- except:
282
- pass
283
- if len(ips) == 1:
284
- return ips[0]
285
- time.sleep(2)
286
- retry -= 1
287
- if retry < 0:
288
- raise ValueError(f"No IPs available on subnet {subnet}")
155
+ ips = get_ip_addresses(subnet=subnet)
156
+
289
157
  while True:
290
158
  option = user_confirm(
291
159
  question="Select IP to advertise the node (needs to be visible to other nodes)",
@@ -336,50 +204,50 @@ def select_token_type():
336
204
  break
337
205
  return {"admin": choice == 0, "user": choice == 1, "worker": choice == 2}
338
206
 
339
- def generate_compose_config(role, node_name, is_public, use_gpus=True, node_labels=None, pool_ip=None, vpn_token=None, pool_token=None):
207
+ def input_gpus():
340
208
  num_gpus = 0
341
- if use_gpus:
342
- try:
343
- has_gpus = check_gpu_drivers()
344
- if has_gpus:
345
- max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
346
- num_gpus = user_confirm(
347
- question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
348
- options=range(max_gpus+1)
349
- )
350
- except:
351
- console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
352
- if node_labels is not None:
353
- node_labels = " ".join([f"--node-label {key}={value}" for key, value in node_labels.items()])
354
- compose_values = {
355
- "user_path": user_path(""),
356
- "service_name": DEFAULT_CONTAINER_NAME,
357
- "vpn": is_public,
358
- "vpn_name": DEFAULT_VPN_CONTAINER_NAME,
359
- "pool_ip": pool_ip,
360
- "pool_token": pool_token,
361
- "vpn_token": vpn_token,
362
- "node_name": node_name,
363
- "command": role,
364
- "storage_enabled": "True",
365
- "num_gpus": num_gpus,
366
- "k3s_path": f"{CONTAINER_HOST_PATH}/k3s",
367
- "etc_path": f"{CONTAINER_HOST_PATH}/etc",
368
- "node_labels": node_labels,
369
- "flannel_iface": DEFAULT_FLANNEL_IFACE if is_public else ""
370
- }
371
- # generate local config files
372
- compose_yaml = load_template(
373
- template_path=DOCKER_COMPOSE_TEMPLATE,
374
- values=compose_values)
375
- with open(USER_COMPOSE_FILE, "w") as f:
376
- f.write(compose_yaml)
377
- return compose_yaml
209
+ try:
210
+ has_gpus = check_gpu_drivers()
211
+ if has_gpus:
212
+ max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
213
+ num_gpus = user_confirm(
214
+ question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
215
+ options=range(max_gpus+1)
216
+ )
217
+ except:
218
+ console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
219
+ return num_gpus
378
220
 
379
221
  ##################
380
222
  ## CLI COMMANDS ##
381
223
  ##################
382
224
 
225
+ @arguably.command
226
+ def gui__start(*others, gui_port=3000, backend_port=8000):
227
+ """Run GUI"""
228
+ values = {
229
+ "path": user_path(""),
230
+ "gui_port": gui_port,
231
+ "backend_port": backend_port
232
+ }
233
+ compose_yaml = load_template(
234
+ template_path=DOCKER_COMPOSE_GUI,
235
+ values=values)
236
+ with open(USER_GUI_COMPOSE_FILE, "w") as f:
237
+ f.write(compose_yaml)
238
+
239
+ run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} up -d")
240
+
241
+ console.log(f"[green]Loading GUI, may take a few minutes. It will be available at http://localhost:{gui_port}")
242
+
243
+ @arguably.command
244
+ def gui__stop(*others):
245
+ """Stop GUI"""
246
+ run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} down")
247
+
248
+ console.log("[green]Kalavai GUI has been stopped")
249
+
250
+
383
251
  @arguably.command
384
252
  def login(*others, username: str=None):
385
253
  """
@@ -461,8 +329,9 @@ def pool__publish(*others, description=None):
461
329
  description = description
462
330
 
463
331
  try:
464
- if not pool__check_token(token=token, public=True):
465
- raise ValueError("[red]Cluster must be started with a valid vpn_location to publish")
332
+ valid = check_token(token=token, public=True)
333
+ if "error" in valid:
334
+ raise ValueError(f"[red]Cluster must be started with a valid vpn_location to publish: {valid}")
466
335
  cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
467
336
 
468
337
  register_cluster(
@@ -523,7 +392,7 @@ def pool__list(*others, user_only=False):
523
392
 
524
393
 
525
394
  @arguably.command
526
- def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=HELM_APPS_VALUES, pool_config_values: str=POOL_CONFIG_DEFAULT_VALUES):
395
+ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=None, pool_config_values: str=None):
527
396
  """
528
397
  Start Kalavai pool and start/resume sharing resources.
529
398
 
@@ -531,9 +400,6 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
531
400
  *others: all the other positional arguments go here
532
401
  """
533
402
 
534
- if not check_seed_compatibility():
535
- return
536
-
537
403
  if CLUSTER.is_cluster_init():
538
404
  console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
539
405
  return
@@ -547,127 +413,25 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
547
413
  console.log("Installation was cancelled and did not complete.")
548
414
  return
549
415
 
550
- # if only registered users are allowed, check user has logged in
551
- user = defaultdict(lambda: None)
552
- if only_registered_users or location is not None:
553
- user = user_login(user_cookie=USER_COOKIE)
554
- if user is None:
555
- console.log("[white]--only-registered-users [red]or [white]--location[red] can only be used if the host is authenticated. Run [yellow]kalavai login[red] to authenticate")
556
- exit()
557
-
558
- # join private network if provided
559
- vpn = defaultdict(lambda: None)
560
- node_labels = {
561
- STORAGE_CLASS_LABEL: is_storage_compatible()
562
- }
563
- if location is not None:
564
- console.log("Fetching VPN credentials")
565
- try:
566
- vpn = get_vpn_details(
567
- location=location,
568
- user_cookie=USER_COOKIE)
569
- node_labels[USER_NODE_LABEL] = user["username"]
570
- except Exception as e:
571
- console.log(f"[red]Error when joining network: {str(e)}")
572
- return
573
-
574
- # Generate docker compose recipe
575
- generate_compose_config(
576
- role="server",
577
- vpn_token=vpn["key"],
578
- node_name=socket.gethostname(),
579
- node_labels=node_labels,
580
- is_public=location is not None
581
- )
582
-
583
- # start server
584
- console.log("Deploying seed...")
585
- CLUSTER.start_seed_node()
586
-
587
- while not CLUSTER.is_agent_running():
588
- console.log("Waiting for seed to start...")
589
- time.sleep(10)
590
-
591
416
  # select IP address (for external discovery)
592
417
  if ip_address is None and location is None:
593
418
  # local IP
594
419
  console.log(f"Scanning for valid IPs")
595
420
  ip_address = select_ip_address()
596
- else:
597
- # load VPN ip
598
- ip_address = CLUSTER.get_vpn_ip()
421
+
599
422
  console.log(f"Using {ip_address} address for server")
600
423
 
601
- # populate local cred files
602
- auth_key = str(uuid.uuid4())
603
- write_auth_key = str(uuid.uuid4())
604
- readonly_auth_key = str(uuid.uuid4())
605
-
606
- watcher_service = f"{ip_address}:{DEFAULT_WATCHER_PORT}"
607
- values = {
608
- CLUSTER_NAME_KEY: cluster_name,
609
- CLUSTER_IP_KEY: ip_address,
610
- AUTH_KEY: auth_key,
611
- READONLY_AUTH_KEY: readonly_auth_key,
612
- WRITE_AUTH_KEY: write_auth_key,
613
- WATCHER_PORT_KEY: DEFAULT_WATCHER_PORT,
614
- WATCHER_SERVICE_KEY: watcher_service,
615
- USER_NODE_LABEL_KEY: USER_NODE_LABEL,
616
- ALLOW_UNREGISTERED_USER_KEY: not only_registered_users
617
- }
424
+ console.log(f"[green]Creating {cluster_name} pool, this may take a few minutes...")
618
425
 
619
- store_server_info(
620
- server_ip=ip_address,
621
- auth_key=auth_key,
622
- readonly_auth_key=readonly_auth_key,
623
- write_auth_key=write_auth_key,
624
- file=USER_LOCAL_SERVER_FILE,
625
- watcher_service=watcher_service,
626
- node_name=socket.gethostname(),
426
+ create_pool(
627
427
  cluster_name=cluster_name,
628
- public_location=location,
629
- user_api_key=user["api_key"])
630
-
631
- # Generate helmfile recipe
632
- helm_yaml = load_template(
633
- template_path=HELM_APPS_FILE,
634
- values=values,
635
- default_values_path=app_values,
636
- force_defaults=True)
637
- with open(USER_HELM_APPS_FILE, "w") as f:
638
- f.write(helm_yaml)
639
-
640
- console.log("[green]Config files have been generated in your local machine\n")
641
-
642
- console.log("Setting pool dependencies...")
643
- # set template values in helmfile
644
- try:
645
- CLUSTER.update_dependencies(
646
- dependencies_file=USER_HELM_APPS_FILE
647
- )
648
- except Exception as e:
649
- console.log(f"Error: {str(e)}")
650
- exit()
651
- console.log("[green]Your pool is ready! Grow it by sharing your joining token with others. Run [yellow]kalavai pool token[green] to generate one.")
652
-
653
- if location is not None:
654
- # register with kalavai if it's a public cluster
655
- console.log("Registering public cluster with Kalavai...")
656
- pool__publish()
657
-
658
- # wait until the server is ready to create objects
659
- while True:
660
- console.log("Waiting for core services to be ready, may take a few minutes...")
661
- time.sleep(30)
662
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
663
- break
664
- console.log("Initialise user workspace...")
665
- pool_init(pool_config_values_path=pool_config_values)
666
- # init default namespace
667
- init_user_workspace(force_namespace="default")
668
- if only_registered_users:
669
- # init user namespace
670
- init_user_workspace()
428
+ ip_address=ip_address,
429
+ app_values=app_values,
430
+ pool_config_values=pool_config_values,
431
+ num_gpus=input_gpus(),
432
+ only_registered_users=only_registered_users,
433
+ location=location
434
+ )
671
435
 
672
436
  return None
673
437
 
@@ -720,20 +484,13 @@ def pool__check_token(token, *others, public=False):
720
484
  """
721
485
  Utility to check the validity of a join token
722
486
  """
723
- try:
724
- data = decode_dict(token)
725
- for field in MANDATORY_TOKEN_FIELDS:
726
- assert field in data
727
- if public:
728
- if data[PUBLIC_LOCATION_KEY] is None:
729
- raise ValueError("Token is not valid for public pools. Did you start the cluster with a public_location?")
730
- console.log("[green]Token format is correct")
731
- return True
732
- except Exception as e:
733
- console.log(f"[white]{str(e)}")
734
- console.log("[red]Token is invalid.")
487
+ result = check_token(token=token, public=public)
488
+ if "error" in result:
489
+ console.log(f"[red]Error in token: {result}")
735
490
  return False
736
-
491
+
492
+ console.log("[green]Token format is correct")
493
+ return True
737
494
 
738
495
  @arguably.command
739
496
  def pool__join(token, *others, node_name=None):
@@ -743,9 +500,6 @@ def pool__join(token, *others, node_name=None):
743
500
  Args:
744
501
  *others: all the other positional arguments go here
745
502
  """
746
-
747
- if not check_worker_compatibility():
748
- return
749
503
 
750
504
  # check that k3s is not running already in the host
751
505
  # k3s service running or preinstalled
@@ -763,132 +517,26 @@ def pool__join(token, *others, node_name=None):
763
517
  console.log("[green]Nothing happened.")
764
518
  return
765
519
 
766
- if node_name is None:
767
- node_name = socket.gethostname()
768
-
769
- # check token
770
- if not pool__check_token(token):
771
- return
772
-
773
- try:
774
- data = decode_dict(token)
775
- kalavai_seed_ip = data[CLUSTER_IP_KEY]
776
- kalavai_token = data[CLUSTER_TOKEN_KEY]
777
- cluster_name = data[CLUSTER_NAME_KEY]
778
- auth_key = data[AUTH_KEY]
779
- watcher_service = data[WATCHER_SERVICE_KEY]
780
- public_location = data[PUBLIC_LOCATION_KEY]
781
- vpn = defaultdict(lambda: None)
782
- except Exception as e:
783
- console.log(str(e))
784
- console.log("[red] Invalid token")
785
- return
786
-
787
- # join private network if provided
788
- node_labels = {
789
- STORAGE_CLASS_LABEL: is_storage_compatible()
790
- }
791
- user = defaultdict(lambda: None)
792
- if public_location is not None:
793
- user = user_login(user_cookie=USER_COOKIE)
794
- if user is None:
795
- console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
796
- exit()
797
- console.log("Fetching VPN credentials")
798
- try:
799
- vpn = get_vpn_details(
800
- location=public_location,
801
- user_cookie=USER_COOKIE)
802
- node_labels[USER_NODE_LABEL] = user["username"]
803
- except Exception as e:
804
- console.log(f"[red]Error when joining network: {str(e)}")
805
- console.log("Are you authenticated? Try [yellow]kalavai login")
806
- return
807
- try:
808
- validate_join_public_seed(
809
- cluster_name=cluster_name,
810
- join_key=token,
811
- user_cookie=USER_COOKIE
812
- )
813
- except Exception as e:
814
- console.log(f"[red]Error when joining network: {str(e)}")
815
- return
816
-
817
- # send note to server to let them know the node is coming online
818
- # TODO: won't be able to check for VPN pools...
819
- # if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
820
- # console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with '--node-name'")
821
- # leave_vpn(container_name=DEFAULT_VPN_CONTAINER_NAME)
822
- # return
823
-
824
- # local agent join
825
- # 1. Generate local cache files
826
- console.log("Generating config files...")
827
-
828
- # Generate docker compose recipe
829
- generate_compose_config(
830
- role="agent",
831
- pool_ip=f"https://{kalavai_seed_ip}:6443",
832
- pool_token=kalavai_token,
833
- vpn_token=vpn["key"],
834
- node_name=node_name,
835
- node_labels=node_labels,
836
- is_public=public_location is not None)
837
-
838
- store_server_info(
839
- server_ip=kalavai_seed_ip,
840
- auth_key=auth_key,
841
- file=USER_LOCAL_SERVER_FILE,
842
- watcher_service=watcher_service,
843
- node_name=node_name,
844
- cluster_name=cluster_name,
845
- public_location=public_location,
846
- user_api_key=user["api_key"])
520
+ num_gpus = input_gpus()
847
521
 
848
522
  option = user_confirm(
849
523
  question="Docker compose ready. Would you like Kalavai to deploy it?",
850
524
  options=["no", "yes"]
851
525
  )
852
526
  if option == 0:
853
- console.log("Manually deploy the worker with the following command:\n")
854
- print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
527
+ console.log("[red]Installation aborted")
855
528
  return
856
529
 
857
- console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
858
- try:
859
- CLUSTER.start_worker_node()
860
- except Exception as e:
861
- console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
862
- pool__stop()
863
- exit()
864
-
865
- # ensure we are connected
866
- while True:
867
- console.log("Waiting for core services to be ready, may take a few minutes...")
868
- time.sleep(30)
869
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
870
- break
871
-
872
- # send note to server to let them know the node is coming online
873
- if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
874
- console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with [yellow]--node-name'")
875
- pool__stop()
876
- return
877
-
878
- # check the node has connected successfully
879
- try:
880
- while not CLUSTER.is_agent_running():
881
- console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
882
- time.sleep(30)
883
- except KeyboardInterrupt:
884
- console.log("[red]Installation aborted. Leaving pool.")
885
- pool__stop()
886
- return
887
-
888
- init_user_workspace()
889
-
890
- # set status to schedulable
891
- console.log(f"[green] You are connected to {cluster_name}")
530
+ console.log("Connecting worker to the pool...")
531
+ result = join_pool(
532
+ token=token,
533
+ node_name=node_name,
534
+ num_gpus=num_gpus
535
+ )
536
+ if "error" in result:
537
+ console.log(f"[red]Error when connecting: {result}")
538
+ else:
539
+ console.log(f"[green] You are connected to {result}")
892
540
 
893
541
  @arguably.command
894
542
  def pool__stop(*others, skip_node_deletion=False):
@@ -942,11 +590,12 @@ def pool__pause(*others):
942
590
  """
943
591
  # k3s stop locally
944
592
  console.log("[white] Pausing kalavai app...")
945
- success = CLUSTER.pause_agent()
946
- if success:
947
- console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
593
+ success = pause_agent()
594
+ if "error" in success:
595
+ console.log(f"[red] Error when stopping. {success['error']}")
948
596
  else:
949
- console.log("[red] Error when stopping. Please run [yellow]kalavai pool pause[red] again.")
597
+ console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
598
+
950
599
 
951
600
  @arguably.command
952
601
  def pool__resume(*others):
@@ -961,10 +610,12 @@ def pool__resume(*others):
961
610
  console.log("[red] Kalavai app was not started before, please run [yellow]kalavai pool start[red] to start a pool or [yellow]kalavai pool join[red] to join one first")
962
611
  return
963
612
  console.log("[white] Restarting sharing (may take a few minutes)...")
964
- if CLUSTER.restart_agent():
965
- console.log("[white] Kalava sharing resumed")
613
+ success = resume_agent()
614
+ if "error" in success:
615
+ console.log(f"[red] Error when restarting. {success['error']}")
966
616
  else:
967
- console.log("[red] Error when restarting. Please run [yellow]kalavai pool resume[white] again.")
617
+ console.log("[white] Kalava sharing resumed")
618
+
968
619
 
969
620
 
970
621
  @arguably.command
@@ -1103,7 +754,7 @@ def pool__attach(token, *others, node_name=None):
1103
754
  """
1104
755
 
1105
756
  if node_name is None:
1106
- node_name = socket.gethostname()
757
+ node_name = f"{socket.gethostname()}-{uuid.uuid4().hex[:6]}"
1107
758
 
1108
759
  # check that is not attached to another instance
1109
760
  if os.path.exists(USER_LOCAL_SERVER_FILE):
@@ -1115,70 +766,6 @@ def pool__attach(token, *others, node_name=None):
1115
766
  console.log("[green]Nothing happened.")
1116
767
  return
1117
768
 
1118
- # check token
1119
- if not pool__check_token(token):
1120
- return
1121
-
1122
- try:
1123
- data = decode_dict(token)
1124
- kalavai_seed_ip = data[CLUSTER_IP_KEY]
1125
- cluster_name = data[CLUSTER_NAME_KEY]
1126
- auth_key = data[AUTH_KEY]
1127
- watcher_service = data[WATCHER_SERVICE_KEY]
1128
- public_location = data[PUBLIC_LOCATION_KEY]
1129
- vpn = defaultdict(lambda: None)
1130
- except Exception as e:
1131
- console.log(str(e))
1132
- console.log("[red] Invalid token")
1133
- return
1134
-
1135
- user = defaultdict(lambda: None)
1136
- if public_location is not None:
1137
- user = user_login(user_cookie=USER_COOKIE)
1138
- if user is None:
1139
- console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
1140
- exit()
1141
- console.log("Fetching VPN credentials")
1142
- try:
1143
- vpn = get_vpn_details(
1144
- location=public_location,
1145
- user_cookie=USER_COOKIE)
1146
- except Exception as e:
1147
- console.log(f"[red]Error when joining network: {str(e)}")
1148
- console.log("Are you authenticated? Try [yellow]kalavai login")
1149
- return
1150
- try:
1151
- validate_join_public_seed(
1152
- cluster_name=cluster_name,
1153
- join_key=token,
1154
- user_cookie=USER_COOKIE
1155
- )
1156
- except Exception as e:
1157
- console.log(f"[red]Error when joining network: {str(e)}")
1158
- return
1159
-
1160
- # local agent join
1161
- # 1. Generate local cache files
1162
- console.log("Generating config files...")
1163
-
1164
- # Generate docker compose recipe
1165
- generate_compose_config(
1166
- use_gpus=False,
1167
- role="",
1168
- vpn_token=vpn["key"],
1169
- node_name=node_name,
1170
- is_public=public_location is not None)
1171
-
1172
- store_server_info(
1173
- server_ip=kalavai_seed_ip,
1174
- auth_key=auth_key,
1175
- file=USER_LOCAL_SERVER_FILE,
1176
- watcher_service=watcher_service,
1177
- node_name=node_name,
1178
- cluster_name=cluster_name,
1179
- public_location=public_location,
1180
- user_api_key=user["api_key"])
1181
-
1182
769
  option = user_confirm(
1183
770
  question="Docker compose ready. Would you like Kalavai to deploy it?",
1184
771
  options=["no", "yes"]
@@ -1188,17 +775,13 @@ def pool__attach(token, *others, node_name=None):
1188
775
  print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
1189
776
  return
1190
777
 
1191
- console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
1192
- run_cmd(f"docker compose -f {USER_COMPOSE_FILE} up -d")
1193
- # ensure we are connected
1194
- while True:
1195
- console.log("Waiting for core services to be ready, may take a few minutes...")
1196
- time.sleep(30)
1197
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
1198
- break
778
+ result = attach_to_pool(token=token, node_name=node_name)
1199
779
 
780
+ if "error" in result:
781
+ console.log(f"[red]Error when attaching to pool: {result}")
782
+ return
1200
783
  # set status to schedulable
1201
- console.log(f"[green] You are connected to {cluster_name}")
784
+ console.log(f"[green] You are connected to {result}")
1202
785
 
1203
786
 
1204
787
  @arguably.command
@@ -1422,18 +1005,13 @@ def job__templates(*others):
1422
1005
  console.log(f"[red]Problems with your pool: {str(e)}")
1423
1006
  return
1424
1007
 
1425
- try:
1426
- result = request_to_server(
1427
- method="get",
1428
- endpoint="/v1/get_job_templates",
1429
- server_creds=USER_LOCAL_SERVER_FILE,
1430
- data=None,
1431
- user_cookie=USER_COOKIE
1432
- )
1433
- console.log("Templates available in the pool")
1434
- console.log(result)
1435
- except Exception as e:
1436
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1008
+ templates = fetch_job_templates()
1009
+ if "error" in templates:
1010
+ console.log(f"[red]Error when fetching templates: {str(e)}")
1011
+ return
1012
+
1013
+ console.log("Templates available in the pool")
1014
+ console.log(templates)
1437
1015
 
1438
1016
 
1439
1017
  @arguably.command
@@ -1489,26 +1067,16 @@ def job__run(template_name, *others, values: str=None, force_namespace: str=None
1489
1067
  annotation_key="nvidia.com/nouse-gputype"
1490
1068
  )
1491
1069
 
1492
- # deploy template with kube-watcher
1493
- data = {
1494
- "template": template_name,
1495
- "template_values": values_dict
1496
- }
1497
- if force_namespace is not None:
1498
- data["force_namespace"] = force_namespace
1070
+ result = deploy_job(
1071
+ template_name=template_name,
1072
+ values_dict=values_dict,
1073
+ force_namespace=force_namespace
1074
+ )
1499
1075
 
1500
- try:
1501
- result = request_to_server(
1502
- method="post",
1503
- endpoint="/v1/deploy_job",
1504
- data=data,
1505
- server_creds=USER_LOCAL_SERVER_FILE,
1506
- user_cookie=USER_COOKIE
1507
- )
1076
+ if "error" in result:
1077
+ console.log(f"[red]Error when deploying job: {str(e)}")
1078
+ else:
1508
1079
  console.log(f"[green]{template_name} job deployed")
1509
- except Exception as e:
1510
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1511
- return
1512
1080
 
1513
1081
  @arguably.command
1514
1082
  def job__test(local_template_dir, *others, values, defaults, force_namespace: str=None):
@@ -1581,22 +1149,12 @@ def job__defaults(template_name, *others):
1581
1149
  return
1582
1150
 
1583
1151
  # deploy template with kube-watcher
1584
- data = {
1585
- "template": template_name
1586
- }
1587
- try:
1588
- result = request_to_server(
1589
- method="get",
1590
- endpoint="/v1/job_defaults",
1591
- data=data,
1592
- server_creds=USER_LOCAL_SERVER_FILE,
1593
- user_cookie=USER_COOKIE
1594
- )
1595
- print(
1596
- json.dumps(result,indent=3)
1597
- )
1598
- except Exception as e:
1599
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1152
+ defaults = fetch_job_defaults(name=template_name)
1153
+ if "error" in defaults:
1154
+ console.log(f"[red]Error when fetching job defaults: {defaults}")
1155
+ print(
1156
+ json.dumps(defaults, indent=3)
1157
+ )
1600
1158
 
1601
1159
 
1602
1160
  @arguably.command
@@ -1614,23 +1172,11 @@ def job__delete(name, *others, force_namespace: str=None):
1614
1172
  console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1615
1173
 
1616
1174
  # deploy template with kube-watcher
1617
- data = {
1618
- "label": TEMPLATE_LABEL, # this ensures that both lws template and services are deleted
1619
- "value": name
1620
- }
1621
- if force_namespace is not None:
1622
- data["force_namespace"] = force_namespace
1623
- try:
1624
- result = request_to_server(
1625
- method="post",
1626
- endpoint="/v1/delete_labeled_resources",
1627
- data=data,
1628
- server_creds=USER_LOCAL_SERVER_FILE,
1629
- user_cookie=USER_COOKIE
1630
- )
1175
+ result = delete_job(name=name, force_namespace=force_namespace)
1176
+ if "error" in result:
1177
+ console.log(f"[red]Error when deleting job: {str(e)}")
1178
+ else:
1631
1179
  console.log(f"{result}")
1632
- except Exception as e:
1633
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1634
1180
 
1635
1181
 
1636
1182
  @arguably.command