kalavai-client 0.5.15__py3-none-any.whl → 0.5.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kalavai_client/cli.py CHANGED
@@ -7,19 +7,30 @@ import time
7
7
  import socket
8
8
  from pathlib import Path
9
9
  from getpass import getpass
10
- import ipaddress
11
10
  from sys import exit
12
11
 
13
12
  import yaml
14
- import netifaces as ni
13
+
15
14
  import arguably
16
15
  from rich.console import Console
17
16
 
17
+ from kalavai_client.cluster import CLUSTER
18
18
  from kalavai_client.env import (
19
19
  USER_COOKIE,
20
20
  USER_LOCAL_SERVER_FILE,
21
21
  TEMPLATE_LABEL,
22
- user_path
22
+ KALAVAI_PLATFORM_URL,
23
+ DEFAULT_VPN_CONTAINER_NAME,
24
+ CONTAINER_HOST_PATH,
25
+ USER_COMPOSE_FILE,
26
+ USER_HELM_APPS_FILE,
27
+ USER_KUBECONFIG_FILE,
28
+ USER_VPN_COMPOSE_FILE,
29
+ USER_TEMPLATES_FOLDER,
30
+ DOCKER_COMPOSE_GUI,
31
+ USER_GUI_COMPOSE_FILE,
32
+ user_path,
33
+ resource_path,
23
34
  )
24
35
  from kalavai_client.core import (
25
36
  fetch_resources,
@@ -28,22 +39,29 @@ from kalavai_client.core import (
28
39
  fetch_devices,
29
40
  fetch_job_logs,
30
41
  fetch_gpus,
31
- load_gpu_models
42
+ load_gpu_models,
43
+ fetch_job_templates,
44
+ fetch_job_defaults,
45
+ deploy_job,
46
+ delete_job,
47
+ check_token,
48
+ attach_to_pool,
49
+ join_pool,
50
+ create_pool,
51
+ get_ip_addresses,
52
+ pause_agent,
53
+ resume_agent
32
54
  )
33
55
  from kalavai_client.utils import (
34
56
  check_gpu_drivers,
57
+ load_template,
35
58
  run_cmd,
36
- decode_dict,
37
59
  generate_join_token,
38
60
  user_confirm,
39
- load_template,
40
- store_server_info,
41
61
  generate_table,
42
62
  request_to_server,
43
- resource_path,
44
63
  safe_remove,
45
64
  leave_vpn,
46
- get_vpn_details,
47
65
  load_server_info,
48
66
  user_login,
49
67
  user_logout,
@@ -51,9 +69,6 @@ from kalavai_client.utils import (
51
69
  register_cluster,
52
70
  unregister_cluster,
53
71
  get_public_seeds,
54
- validate_join_public_seed,
55
- is_storage_compatible,
56
- is_watcher_alive,
57
72
  load_user_session,
58
73
  SERVER_IP_KEY,
59
74
  AUTH_KEY,
@@ -62,112 +77,30 @@ from kalavai_client.utils import (
62
77
  WRITE_AUTH_KEY,
63
78
  PUBLIC_LOCATION_KEY,
64
79
  NODE_NAME_KEY,
65
- CLUSTER_NAME_KEY,
66
- CLUSTER_IP_KEY,
67
- CLUSTER_TOKEN_KEY,
68
- WATCHER_PORT_KEY,
69
- MANDATORY_TOKEN_FIELDS,
70
- USER_NODE_LABEL_KEY,
71
- ALLOW_UNREGISTERED_USER_KEY
72
- )
73
- from kalavai_client.cluster import (
74
- dockerCluster
80
+ CLUSTER_NAME_KEY
75
81
  )
76
82
 
77
83
 
78
- KALAVAI_PLATFORM_URL = os.getenv("KALAVAI_PLATFORM_URL", "https://platform.kalavai.net")
79
84
  LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
80
85
  VERSION = 1
81
86
  RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
82
87
  CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
83
88
  RAY_LABEL = "kalavai.ray.name"
84
89
  PVC_NAME_LABEL = "kalavai.storage.name"
85
- DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
86
90
  VPN_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/vpn-template.yaml")
87
- POOL_CONFIG_TEMPLATE = resource_path("kalavai_client/assets/pool_config_template.yaml")
88
- POOL_CONFIG_DEFAULT_VALUES = resource_path("kalavai_client/assets/pool_config_values.yaml")
89
- USER_WORKSPACE_TEMPLATE = resource_path("kalavai_client/assets/user_workspace.yaml")
90
- DEFAULT_USER_WORKSPACE_VALUES = resource_path("kalavai_client/assets/user_workspace_values.yaml")
91
91
  STORAGE_CLASS_NAME = "local-path"
92
92
  STORAGE_ACCESS_MODE = ["ReadWriteOnce"]
93
- STORAGE_CLASS_LABEL = "kalavai.storage.enabled"
94
93
  DEFAULT_STORAGE_NAME = "pool-cache"
95
94
  DEFAULT_STORAGE_SIZE = 20
96
- DEFAULT_WATCHER_PORT = 30001
97
- USER_NODE_LABEL = "kalavai.cluster.user"
98
- KUBE_VERSION = os.getenv("KALAVAI_KUBE_VERSION", "v1.31.1+k3s1")
99
- DEFAULT_FLANNEL_IFACE = os.getenv("KALAVAI_FLANNEL_IFACE", "netmaker-1")
100
- FORBIDEDEN_IPS = ["127.0.0.1"]
101
- # kalavai templates
102
- HELM_APPS_FILE = resource_path("kalavai_client/assets/apps.yaml")
103
- HELM_APPS_VALUES = resource_path("kalavai_client/assets/apps_values.yaml")
104
- # user specific config files
105
- DEFAULT_CONTAINER_NAME = "kalavai"
106
- DEFAULT_VPN_CONTAINER_NAME = "kalavai-vpn"
107
- CONTAINER_HOST_PATH = user_path("pool/", create_path=True)
108
- USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
109
- USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
110
- USER_HELM_APPS_FILE = user_path("apps.yaml")
111
- USER_KUBECONFIG_FILE = user_path("kubeconfig")
112
- USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
113
-
114
95
 
96
+
115
97
  console = Console()
116
- CLUSTER = dockerCluster(
117
- container_name=DEFAULT_CONTAINER_NAME,
118
- kube_version=KUBE_VERSION,
119
- flannel_iface=DEFAULT_FLANNEL_IFACE,
120
- compose_file=USER_COMPOSE_FILE,
121
- kubeconfig_file=USER_KUBECONFIG_FILE,
122
- poolconfig_file=USER_LOCAL_SERVER_FILE,
123
- dependencies_file=USER_HELM_APPS_FILE
124
- )
125
98
 
126
99
 
127
100
  ######################
128
101
  ## HELPER FUNCTIONS ##
129
102
  ######################
130
103
 
131
- def check_seed_compatibility():
132
- """Check required packages to start pools"""
133
- logs = []
134
- console.log("[white]Checking system requirements...")
135
- # docker
136
- try:
137
- run_cmd("docker version >/dev/null 2>&1")
138
- except:
139
- logs.append("[red]Docker not installed. Install instructions:\n")
140
- logs.append(" Linux: https://docs.docker.com/engine/install/\n")
141
- logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
142
-
143
- if len(logs) == 0:
144
- console.log("[green]System is ready to start a pool")
145
- return True
146
- else:
147
- for log in logs:
148
- console.log(log)
149
- return False
150
-
151
- def check_worker_compatibility():
152
- """Check required packages to join pools"""
153
- logs = []
154
- console.log("[white]Checking system requirements...")
155
- # docker
156
- try:
157
- run_cmd("docker version >/dev/null 2>&1")
158
- except:
159
- logs.append("[red]Docker not installed. Install instructions:\n")
160
- logs.append(" Linux: https://docs.docker.com/engine/install/\n")
161
- logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
162
-
163
- if len(logs) == 0:
164
- console.log("[green]System is ready to join a pool")
165
- return True
166
- else:
167
- for log in logs:
168
- console.log(log)
169
- return False
170
-
171
104
 
172
105
  def cleanup_local():
173
106
  console.log("Removing local cache files...")
@@ -178,6 +111,7 @@ def cleanup_local():
178
111
  safe_remove(USER_KUBECONFIG_FILE)
179
112
  safe_remove(USER_LOCAL_SERVER_FILE)
180
113
  safe_remove(USER_TEMPLATES_FOLDER)
114
+ safe_remove(USER_GUI_COMPOSE_FILE)
181
115
 
182
116
  def pre_join_check(node_name, server_url, server_key):
183
117
  # check with the server that we can connect
@@ -217,75 +151,11 @@ def set_schedulable(schedulable, node_name=load_server_info(data_key=NODE_NAME_K
217
151
  except Exception as e:
218
152
  console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
219
153
 
220
-
221
- def init_user_workspace(force_namespace=None):
222
-
223
- # load template config and populate with values
224
- sidecar_template_yaml = load_template(
225
- template_path=USER_WORKSPACE_TEMPLATE,
226
- values={},
227
- default_values_path=DEFAULT_USER_WORKSPACE_VALUES)
228
-
229
- try:
230
- data = {"config": sidecar_template_yaml}
231
- if force_namespace is not None:
232
- data["force_namespace"] = force_namespace
233
- result = request_to_server(
234
- method="post",
235
- endpoint="/v1/create_user_space",
236
- data=data,
237
- server_creds=USER_LOCAL_SERVER_FILE,
238
- user_cookie=USER_COOKIE
239
- )
240
- console.log(f"Workspace creation (ignore already created warnings): {result}" )
241
- except Exception as e:
242
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
243
-
244
- def pool_init(pool_config_values_path=None):
245
- """Deploy configured objects to initialise pool"""
246
- if pool_config_values_path is None:
247
- return
248
-
249
- # load template config and populate with values
250
- sidecar_template_yaml = load_template(
251
- template_path=POOL_CONFIG_TEMPLATE,
252
- values={},
253
- default_values_path=pool_config_values_path)
254
-
255
- try:
256
- result = request_to_server(
257
- method="post",
258
- endpoint="/v1/deploy_generic_model",
259
- data={"config": sidecar_template_yaml},
260
- server_creds=USER_LOCAL_SERVER_FILE,
261
- user_cookie=USER_COOKIE
262
- )
263
- if 'failed' in result and len(result['failed']) > 0:
264
- console.log(f"[red]Error when deploying pool config\n\n{result['failed']}")
265
- if len(result['successful']) > 0:
266
- console.log(f"[green]Deployed pool config!")
267
- except Exception as e:
268
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
269
-
270
154
  def select_ip_address(subnet=None):
271
- ips = []
272
- retry = 3
273
- while len(ips) == 0:
274
- for iface in ni.interfaces():
275
- try:
276
- ip = ni.ifaddresses(iface)[ni.AF_INET][0]['addr']
277
- if ip in FORBIDEDEN_IPS:
278
- continue
279
- if subnet is None or ipaddress.ip_address(ip) in ipaddress.ip_network(subnet):
280
- ips.append(ip)
281
- except:
282
- pass
283
- if len(ips) == 1:
284
- return ips[0]
285
- time.sleep(2)
286
- retry -= 1
287
- if retry < 0:
288
- raise ValueError(f"No IPs available on subnet {subnet}")
155
+ ips = get_ip_addresses(subnet=subnet)
156
+ if len(ips) == 1:
157
+ return ips[0]
158
+
289
159
  while True:
290
160
  option = user_confirm(
291
161
  question="Select IP to advertise the node (needs to be visible to other nodes)",
@@ -336,50 +206,50 @@ def select_token_type():
336
206
  break
337
207
  return {"admin": choice == 0, "user": choice == 1, "worker": choice == 2}
338
208
 
339
- def generate_compose_config(role, node_name, is_public, use_gpus=True, node_labels=None, pool_ip=None, vpn_token=None, pool_token=None):
209
def input_gpus():
    """Ask the user how many local NVIDIA GPUs to include.

    Returns:
        The value returned by ``user_confirm`` when ``check_gpu_drivers()``
        reports GPUs are available; ``0`` when it does not, or when probing
        the GPUs raises an error.
    """
    num_gpus = 0
    try:
        if check_gpu_drivers():
            # The line count of `nvidia-smi -L` is used as the upper bound offered to the user
            max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
            num_gpus = user_confirm(
                question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
                options=range(max_gpus+1)
            )
    except Exception:
        # Best effort: any probing failure falls back to sharing no GPUs.
        # Catching Exception (not a bare except) lets Ctrl+C at the prompt
        # and exit() propagate instead of being reported as a GPU error.
        console.log("[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
    return num_gpus
378
222
 
379
223
  ##################
380
224
  ## CLI COMMANDS ##
381
225
  ##################
382
226
 
227
@arguably.command
def gui__start(*others, gui_port=3000, backend_port=8000):
    """Run GUI"""
    # Values substituted into the GUI docker compose template
    template_values = dict(
        path=user_path(""),
        gui_port=gui_port,
        backend_port=backend_port,
    )
    rendered_compose = load_template(
        template_path=DOCKER_COMPOSE_GUI,
        values=template_values,
    )

    # Persist the rendered compose file so `gui stop` can reference it later
    with open(USER_GUI_COMPOSE_FILE, "w") as compose_file:
        compose_file.write(rendered_compose)

    # Start the GUI services in detached mode
    start_command = f"docker compose --file {USER_GUI_COMPOSE_FILE} up -d"
    run_cmd(start_command)

    console.log(f"[green]Loading GUI, may take a few minutes. It will be available at http://localhost:{gui_port}")
244
+
245
@arguably.command
def gui__stop(*others):
    """Stop GUI"""
    # The compose file is written by `gui start` and removed by cleanup_local();
    # without it there is nothing for docker compose to tear down.
    if not os.path.exists(USER_GUI_COMPOSE_FILE):
        console.log("[red]Kalavai GUI compose file not found. Was the GUI started?")
        return

    run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} down")

    console.log("[green]Kalavai GUI has been stopped")
251
+
252
+
383
253
  @arguably.command
384
254
  def login(*others, username: str=None):
385
255
  """
@@ -461,8 +331,9 @@ def pool__publish(*others, description=None):
461
331
  description = description
462
332
 
463
333
  try:
464
- if not pool__check_token(token=token, public=True):
465
- raise ValueError("[red]Cluster must be started with a valid vpn_location to publish")
334
+ valid = check_token(token=token, public=True)
335
+ if "error" in valid:
336
+ raise ValueError(f"[red]Cluster must be started with a valid vpn_location to publish: {valid}")
466
337
  cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
467
338
 
468
339
  register_cluster(
@@ -523,7 +394,7 @@ def pool__list(*others, user_only=False):
523
394
 
524
395
 
525
396
  @arguably.command
526
- def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=HELM_APPS_VALUES, pool_config_values: str=POOL_CONFIG_DEFAULT_VALUES):
397
+ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=None, pool_config_values: str=None):
527
398
  """
528
399
  Start Kalavai pool and start/resume sharing resources.
529
400
 
@@ -531,9 +402,6 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
531
402
  *others: all the other positional arguments go here
532
403
  """
533
404
 
534
- if not check_seed_compatibility():
535
- return
536
-
537
405
  if CLUSTER.is_cluster_init():
538
406
  console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
539
407
  return
@@ -547,127 +415,25 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
547
415
  console.log("Installation was cancelled and did not complete.")
548
416
  return
549
417
 
550
- # if only registered users are allowed, check user has logged in
551
- user = defaultdict(lambda: None)
552
- if only_registered_users or location is not None:
553
- user = user_login(user_cookie=USER_COOKIE)
554
- if user is None:
555
- console.log("[white]--only-registered-users [red]or [white]--location[red] can only be used if the host is authenticated. Run [yellow]kalavai login[red] to authenticate")
556
- exit()
557
-
558
- # join private network if provided
559
- vpn = defaultdict(lambda: None)
560
- node_labels = {
561
- STORAGE_CLASS_LABEL: is_storage_compatible()
562
- }
563
- if location is not None:
564
- console.log("Fetching VPN credentials")
565
- try:
566
- vpn = get_vpn_details(
567
- location=location,
568
- user_cookie=USER_COOKIE)
569
- node_labels[USER_NODE_LABEL] = user["username"]
570
- except Exception as e:
571
- console.log(f"[red]Error when joining network: {str(e)}")
572
- return
573
-
574
- # Generate docker compose recipe
575
- generate_compose_config(
576
- role="server",
577
- vpn_token=vpn["key"],
578
- node_name=socket.gethostname(),
579
- node_labels=node_labels,
580
- is_public=location is not None
581
- )
582
-
583
- # start server
584
- console.log("Deploying seed...")
585
- CLUSTER.start_seed_node()
586
-
587
- while not CLUSTER.is_agent_running():
588
- console.log("Waiting for seed to start...")
589
- time.sleep(10)
590
-
591
418
  # select IP address (for external discovery)
592
419
  if ip_address is None and location is None:
593
420
  # local IP
594
421
  console.log(f"Scanning for valid IPs")
595
422
  ip_address = select_ip_address()
596
- else:
597
- # load VPN ip
598
- ip_address = CLUSTER.get_vpn_ip()
423
+
599
424
  console.log(f"Using {ip_address} address for server")
600
425
 
601
- # populate local cred files
602
- auth_key = str(uuid.uuid4())
603
- write_auth_key = str(uuid.uuid4())
604
- readonly_auth_key = str(uuid.uuid4())
605
-
606
- watcher_service = f"{ip_address}:{DEFAULT_WATCHER_PORT}"
607
- values = {
608
- CLUSTER_NAME_KEY: cluster_name,
609
- CLUSTER_IP_KEY: ip_address,
610
- AUTH_KEY: auth_key,
611
- READONLY_AUTH_KEY: readonly_auth_key,
612
- WRITE_AUTH_KEY: write_auth_key,
613
- WATCHER_PORT_KEY: DEFAULT_WATCHER_PORT,
614
- WATCHER_SERVICE_KEY: watcher_service,
615
- USER_NODE_LABEL_KEY: USER_NODE_LABEL,
616
- ALLOW_UNREGISTERED_USER_KEY: not only_registered_users
617
- }
426
+ console.log(f"[green]Creating {cluster_name} pool, this may take a few minutes...")
618
427
 
619
- store_server_info(
620
- server_ip=ip_address,
621
- auth_key=auth_key,
622
- readonly_auth_key=readonly_auth_key,
623
- write_auth_key=write_auth_key,
624
- file=USER_LOCAL_SERVER_FILE,
625
- watcher_service=watcher_service,
626
- node_name=socket.gethostname(),
428
+ create_pool(
627
429
  cluster_name=cluster_name,
628
- public_location=location,
629
- user_api_key=user["api_key"])
630
-
631
- # Generate helmfile recipe
632
- helm_yaml = load_template(
633
- template_path=HELM_APPS_FILE,
634
- values=values,
635
- default_values_path=app_values,
636
- force_defaults=True)
637
- with open(USER_HELM_APPS_FILE, "w") as f:
638
- f.write(helm_yaml)
639
-
640
- console.log("[green]Config files have been generated in your local machine\n")
641
-
642
- console.log("Setting pool dependencies...")
643
- # set template values in helmfile
644
- try:
645
- CLUSTER.update_dependencies(
646
- dependencies_file=USER_HELM_APPS_FILE
647
- )
648
- except Exception as e:
649
- console.log(f"Error: {str(e)}")
650
- exit()
651
- console.log("[green]Your pool is ready! Grow it by sharing your joining token with others. Run [yellow]kalavai pool token[green] to generate one.")
652
-
653
- if location is not None:
654
- # register with kalavai if it's a public cluster
655
- console.log("Registering public cluster with Kalavai...")
656
- pool__publish()
657
-
658
- # wait until the server is ready to create objects
659
- while True:
660
- console.log("Waiting for core services to be ready, may take a few minutes...")
661
- time.sleep(30)
662
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
663
- break
664
- console.log("Initialise user workspace...")
665
- pool_init(pool_config_values_path=pool_config_values)
666
- # init default namespace
667
- init_user_workspace(force_namespace="default")
668
- if only_registered_users:
669
- # init user namespace
670
- init_user_workspace()
430
+ ip_address=ip_address,
431
+ app_values=app_values,
432
+ pool_config_values=pool_config_values,
433
+ num_gpus=input_gpus(),
434
+ only_registered_users=only_registered_users,
435
+ location=location
436
+ )
671
437
 
672
438
  return None
673
439
 
@@ -720,20 +486,13 @@ def pool__check_token(token, *others, public=False):
720
486
  """
721
487
  Utility to check the validity of a join token
722
488
  """
723
- try:
724
- data = decode_dict(token)
725
- for field in MANDATORY_TOKEN_FIELDS:
726
- assert field in data
727
- if public:
728
- if data[PUBLIC_LOCATION_KEY] is None:
729
- raise ValueError("Token is not valid for public pools. Did you start the cluster with a public_location?")
730
- console.log("[green]Token format is correct")
731
- return True
732
- except Exception as e:
733
- console.log(f"[white]{str(e)}")
734
- console.log("[red]Token is invalid.")
489
+ result = check_token(token=token, public=public)
490
+ if "error" in result:
491
+ console.log(f"[red]Error in token: {result}")
735
492
  return False
736
-
493
+
494
+ console.log("[green]Token format is correct")
495
+ return True
737
496
 
738
497
  @arguably.command
739
498
  def pool__join(token, *others, node_name=None):
@@ -743,9 +502,6 @@ def pool__join(token, *others, node_name=None):
743
502
  Args:
744
503
  *others: all the other positional arguments go here
745
504
  """
746
-
747
- if not check_worker_compatibility():
748
- return
749
505
 
750
506
  # check that k3s is not running already in the host
751
507
  # k3s service running or preinstalled
@@ -763,119 +519,26 @@ def pool__join(token, *others, node_name=None):
763
519
  console.log("[green]Nothing happened.")
764
520
  return
765
521
 
766
- if node_name is None:
767
- node_name = socket.gethostname()
768
-
769
- # check token
770
- if not pool__check_token(token):
771
- return
772
-
773
- try:
774
- data = decode_dict(token)
775
- kalavai_seed_ip = data[CLUSTER_IP_KEY]
776
- kalavai_token = data[CLUSTER_TOKEN_KEY]
777
- cluster_name = data[CLUSTER_NAME_KEY]
778
- auth_key = data[AUTH_KEY]
779
- watcher_service = data[WATCHER_SERVICE_KEY]
780
- public_location = data[PUBLIC_LOCATION_KEY]
781
- vpn = defaultdict(lambda: None)
782
- except Exception as e:
783
- console.log(str(e))
784
- console.log("[red] Invalid token")
785
- return
786
-
787
- # join private network if provided
788
- node_labels = {
789
- STORAGE_CLASS_LABEL: is_storage_compatible()
790
- }
791
- user = defaultdict(lambda: None)
792
- if public_location is not None:
793
- user = user_login(user_cookie=USER_COOKIE)
794
- if user is None:
795
- console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
796
- exit()
797
- console.log("Fetching VPN credentials")
798
- try:
799
- vpn = get_vpn_details(
800
- location=public_location,
801
- user_cookie=USER_COOKIE)
802
- node_labels[USER_NODE_LABEL] = user["username"]
803
- except Exception as e:
804
- console.log(f"[red]Error when joining network: {str(e)}")
805
- console.log("Are you authenticated? Try [yellow]kalavai login")
806
- return
807
- try:
808
- validate_join_public_seed(
809
- cluster_name=cluster_name,
810
- join_key=token,
811
- user_cookie=USER_COOKIE
812
- )
813
- except Exception as e:
814
- console.log(f"[red]Error when joining network: {str(e)}")
815
- return
816
-
817
- # local agent join
818
- # 1. Generate local cache files
819
- console.log("Generating config files...")
820
-
821
- # Generate docker compose recipe
822
- generate_compose_config(
823
- role="agent",
824
- pool_ip=f"https://{kalavai_seed_ip}:6443",
825
- pool_token=kalavai_token,
826
- vpn_token=vpn["key"],
827
- node_name=node_name,
828
- node_labels=node_labels,
829
- is_public=public_location is not None)
830
-
831
- store_server_info(
832
- server_ip=kalavai_seed_ip,
833
- auth_key=auth_key,
834
- file=USER_LOCAL_SERVER_FILE,
835
- watcher_service=watcher_service,
836
- node_name=node_name,
837
- cluster_name=cluster_name,
838
- public_location=public_location,
839
- user_api_key=user["api_key"])
522
+ num_gpus = input_gpus()
840
523
 
841
524
  option = user_confirm(
842
525
  question="Docker compose ready. Would you like Kalavai to deploy it?",
843
526
  options=["no", "yes"]
844
527
  )
845
528
  if option == 0:
846
- console.log("Manually deploy the worker with the following command:\n")
847
- print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
529
+ console.log("[red]Installation aborted")
848
530
  return
849
531
 
850
- console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
851
- try:
852
- CLUSTER.start_worker_node()
853
- except Exception as e:
854
- console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
855
- pool__stop()
856
- exit()
857
-
858
- # ensure we are connected
859
- while True:
860
- console.log("Waiting for core services to be ready, may take a few minutes...")
861
- time.sleep(30)
862
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
863
- break
864
-
865
- # check the node has connected successfully
866
- try:
867
- while not CLUSTER.is_agent_running():
868
- console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
869
- time.sleep(30)
870
- except KeyboardInterrupt:
871
- console.log("[red]Installation aborted. Leaving pool.")
872
- pool__stop()
873
- return
874
-
875
- init_user_workspace()
876
-
877
- # set status to schedulable
878
- console.log(f"[green] You are connected to {cluster_name}")
532
+ console.log("Connecting worker to the pool...")
533
+ result = join_pool(
534
+ token=token,
535
+ node_name=node_name,
536
+ num_gpus=num_gpus
537
+ )
538
+ if "error" in result:
539
+ console.log(f"[red]Error when connecting: {result}")
540
+ else:
541
+ console.log(f"[green] You are connected to {result}")
879
542
 
880
543
  @arguably.command
881
544
  def pool__stop(*others, skip_node_deletion=False):
@@ -929,11 +592,12 @@ def pool__pause(*others):
929
592
  """
930
593
  # k3s stop locally
931
594
  console.log("[white] Pausing kalavai app...")
932
- success = CLUSTER.pause_agent()
933
- if success:
934
- console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
595
+ success = pause_agent()
596
+ if "error" in success:
597
+ console.log(f"[red] Error when stopping. {success['error']}")
935
598
  else:
936
- console.log("[red] Error when stopping. Please run [yellow]kalavai pool pause[red] again.")
599
+ console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
600
+
937
601
 
938
602
  @arguably.command
939
603
  def pool__resume(*others):
@@ -948,10 +612,12 @@ def pool__resume(*others):
948
612
  console.log("[red] Kalavai app was not started before, please run [yellow]kalavai pool start[red] to start a pool or [yellow]kalavai pool join[red] to join one first")
949
613
  return
950
614
  console.log("[white] Restarting sharing (may take a few minutes)...")
951
- if CLUSTER.restart_agent():
952
- console.log("[white] Kalava sharing resumed")
615
+ success = resume_agent()
616
+ if "error" in success:
617
+ console.log(f"[red] Error when restarting. {success['error']}")
953
618
  else:
954
- console.log("[red] Error when restarting. Please run [yellow]kalavai pool resume[white] again.")
619
+ console.log("[white] Kalava sharing resumed")
620
+
955
621
 
956
622
 
957
623
  @arguably.command
@@ -1090,7 +756,7 @@ def pool__attach(token, *others, node_name=None):
1090
756
  """
1091
757
 
1092
758
  if node_name is None:
1093
- node_name = socket.gethostname()
759
+ node_name = f"{socket.gethostname()}-{uuid.uuid4().hex[:6]}"
1094
760
 
1095
761
  # check that is not attached to another instance
1096
762
  if os.path.exists(USER_LOCAL_SERVER_FILE):
@@ -1102,70 +768,6 @@ def pool__attach(token, *others, node_name=None):
1102
768
  console.log("[green]Nothing happened.")
1103
769
  return
1104
770
 
1105
- # check token
1106
- if not pool__check_token(token):
1107
- return
1108
-
1109
- try:
1110
- data = decode_dict(token)
1111
- kalavai_seed_ip = data[CLUSTER_IP_KEY]
1112
- cluster_name = data[CLUSTER_NAME_KEY]
1113
- auth_key = data[AUTH_KEY]
1114
- watcher_service = data[WATCHER_SERVICE_KEY]
1115
- public_location = data[PUBLIC_LOCATION_KEY]
1116
- vpn = defaultdict(lambda: None)
1117
- except Exception as e:
1118
- console.log(str(e))
1119
- console.log("[red] Invalid token")
1120
- return
1121
-
1122
- user = defaultdict(lambda: None)
1123
- if public_location is not None:
1124
- user = user_login(user_cookie=USER_COOKIE)
1125
- if user is None:
1126
- console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
1127
- exit()
1128
- console.log("Fetching VPN credentials")
1129
- try:
1130
- vpn = get_vpn_details(
1131
- location=public_location,
1132
- user_cookie=USER_COOKIE)
1133
- except Exception as e:
1134
- console.log(f"[red]Error when joining network: {str(e)}")
1135
- console.log("Are you authenticated? Try [yellow]kalavai login")
1136
- return
1137
- try:
1138
- validate_join_public_seed(
1139
- cluster_name=cluster_name,
1140
- join_key=token,
1141
- user_cookie=USER_COOKIE
1142
- )
1143
- except Exception as e:
1144
- console.log(f"[red]Error when joining network: {str(e)}")
1145
- return
1146
-
1147
- # local agent join
1148
- # 1. Generate local cache files
1149
- console.log("Generating config files...")
1150
-
1151
- # Generate docker compose recipe
1152
- generate_compose_config(
1153
- use_gpus=False,
1154
- role="",
1155
- vpn_token=vpn["key"],
1156
- node_name=node_name,
1157
- is_public=public_location is not None)
1158
-
1159
- store_server_info(
1160
- server_ip=kalavai_seed_ip,
1161
- auth_key=auth_key,
1162
- file=USER_LOCAL_SERVER_FILE,
1163
- watcher_service=watcher_service,
1164
- node_name=node_name,
1165
- cluster_name=cluster_name,
1166
- public_location=public_location,
1167
- user_api_key=user["api_key"])
1168
-
1169
771
  option = user_confirm(
1170
772
  question="Docker compose ready. Would you like Kalavai to deploy it?",
1171
773
  options=["no", "yes"]
@@ -1175,17 +777,13 @@ def pool__attach(token, *others, node_name=None):
1175
777
  print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
1176
778
  return
1177
779
 
1178
- console.log(f"[white] Connecting to {cluster_name} @ {kalavai_seed_ip} (this may take a few minutes)...")
1179
- run_cmd(f"docker compose -f {USER_COMPOSE_FILE} up -d")
1180
- # ensure we are connected
1181
- while True:
1182
- console.log("Waiting for core services to be ready, may take a few minutes...")
1183
- time.sleep(30)
1184
- if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
1185
- break
780
+ result = attach_to_pool(token=token, node_name=node_name)
1186
781
 
782
+ if "error" in result:
783
+ console.log(f"[red]Error when attaching to pool: {result}")
784
+ return
1187
785
  # set status to schedulable
1188
- console.log(f"[green] You are connected to {cluster_name}")
786
+ console.log(f"[green] You are connected to {result}")
1189
787
 
1190
788
 
1191
789
  @arguably.command
@@ -1409,18 +1007,13 @@ def job__templates(*others):
1409
1007
  console.log(f"[red]Problems with your pool: {str(e)}")
1410
1008
  return
1411
1009
 
1412
- try:
1413
- result = request_to_server(
1414
- method="get",
1415
- endpoint="/v1/get_job_templates",
1416
- server_creds=USER_LOCAL_SERVER_FILE,
1417
- data=None,
1418
- user_cookie=USER_COOKIE
1419
- )
1420
- console.log("Templates available in the pool")
1421
- console.log(result)
1422
- except Exception as e:
1423
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1010
# Failure is signalled by an "error" entry in the returned value (checked below)
templates = fetch_job_templates()
if "error" in templates:
    # No exception object is bound in this scope; report the returned payload
    console.log(f"[red]Error when fetching templates: {templates}")
else:
    console.log("Templates available in the pool")
    console.log(templates)
1424
1017
 
1425
1018
 
1426
1019
  @arguably.command
@@ -1476,26 +1069,16 @@ def job__run(template_name, *others, values: str=None, force_namespace: str=None
1476
1069
  annotation_key="nvidia.com/nouse-gputype"
1477
1070
  )
1478
1071
 
1479
- # deploy template with kube-watcher
1480
- data = {
1481
- "template": template_name,
1482
- "template_values": values_dict
1483
- }
1484
- if force_namespace is not None:
1485
- data["force_namespace"] = force_namespace
1072
+ result = deploy_job(
1073
+ template_name=template_name,
1074
+ values_dict=values_dict,
1075
+ force_namespace=force_namespace
1076
+ )
1486
1077
 
1487
- try:
1488
- result = request_to_server(
1489
- method="post",
1490
- endpoint="/v1/deploy_job",
1491
- data=data,
1492
- server_creds=USER_LOCAL_SERVER_FILE,
1493
- user_cookie=USER_COOKIE
1494
- )
1078
if "error" in result:
    # No exception object is bound in this scope; report the returned payload
    console.log(f"[red]Error when deploying job: {result}")
else:
    console.log(f"[green]{template_name} job deployed")
1496
- except Exception as e:
1497
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1498
- return
1499
1082
 
1500
1083
  @arguably.command
1501
1084
  def job__test(local_template_dir, *others, values, defaults, force_namespace: str=None):
@@ -1568,22 +1151,12 @@ def job__defaults(template_name, *others):
1568
1151
  return
1569
1152
 
1570
1153
  # deploy template with kube-watcher
1571
- data = {
1572
- "template": template_name
1573
- }
1574
- try:
1575
- result = request_to_server(
1576
- method="get",
1577
- endpoint="/v1/job_defaults",
1578
- data=data,
1579
- server_creds=USER_LOCAL_SERVER_FILE,
1580
- user_cookie=USER_COOKIE
1581
- )
1582
- print(
1583
- json.dumps(result,indent=3)
1584
- )
1585
- except Exception as e:
1586
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1154
# Failure is signalled by an "error" entry in the returned value (checked below)
defaults = fetch_job_defaults(name=template_name)
if "error" in defaults:
    console.log(f"[red]Error when fetching job defaults: {defaults}")
else:
    # Only dump the payload when it is not an error report
    print(
        json.dumps(defaults, indent=3)
    )
+ )
1587
1160
 
1588
1161
 
1589
1162
  @arguably.command
@@ -1601,23 +1174,11 @@ def job__delete(name, *others, force_namespace: str=None):
1601
1174
  console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
1602
1175
 
1603
1176
  # deploy template with kube-watcher
1604
- data = {
1605
- "label": TEMPLATE_LABEL, # this ensures that both lws template and services are deleted
1606
- "value": name
1607
- }
1608
- if force_namespace is not None:
1609
- data["force_namespace"] = force_namespace
1610
- try:
1611
- result = request_to_server(
1612
- method="post",
1613
- endpoint="/v1/delete_labeled_resources",
1614
- data=data,
1615
- server_creds=USER_LOCAL_SERVER_FILE,
1616
- user_cookie=USER_COOKIE
1617
- )
1177
result = delete_job(name=name, force_namespace=force_namespace)
if "error" in result:
    # No exception object is bound in this scope; report the returned payload
    console.log(f"[red]Error when deleting job: {result}")
else:
    console.log(f"{result}")
1619
- except Exception as e:
1620
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
1621
1182
 
1622
1183
 
1623
1184
  @arguably.command