kalavai-client 0.5.15__py3-none-any.whl → 0.5.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kalavai_client/__init__.py +1 -1
- kalavai_client/assets/apps.yaml +1 -1
- kalavai_client/assets/docker-compose-gui.yaml +10 -0
- kalavai_client/assets/docker-compose-template.yaml +5 -3
- kalavai_client/cli.py +145 -584
- kalavai_client/cluster.py +25 -2
- kalavai_client/core.py +653 -4
- kalavai_client/env.py +41 -2
- kalavai_client/utils.py +55 -19
- {kalavai_client-0.5.15.dist-info → kalavai_client-0.5.17.dist-info}/METADATA +5 -4
- kalavai_client-0.5.17.dist-info/RECORD +23 -0
- {kalavai_client-0.5.15.dist-info → kalavai_client-0.5.17.dist-info}/WHEEL +1 -1
- kalavai_client-0.5.15.dist-info/RECORD +0 -22
- {kalavai_client-0.5.15.dist-info → kalavai_client-0.5.17.dist-info}/LICENSE +0 -0
- {kalavai_client-0.5.15.dist-info → kalavai_client-0.5.17.dist-info}/entry_points.txt +0 -0
kalavai_client/cli.py
CHANGED
@@ -7,19 +7,30 @@ import time
 import socket
 from pathlib import Path
 from getpass import getpass
-import ipaddress
 from sys import exit
 
 import yaml
-
+
 import arguably
 from rich.console import Console
 
+from kalavai_client.cluster import CLUSTER
 from kalavai_client.env import (
     USER_COOKIE,
     USER_LOCAL_SERVER_FILE,
     TEMPLATE_LABEL,
-
+    KALAVAI_PLATFORM_URL,
+    DEFAULT_VPN_CONTAINER_NAME,
+    CONTAINER_HOST_PATH,
+    USER_COMPOSE_FILE,
+    USER_HELM_APPS_FILE,
+    USER_KUBECONFIG_FILE,
+    USER_VPN_COMPOSE_FILE,
+    USER_TEMPLATES_FOLDER,
+    DOCKER_COMPOSE_GUI,
+    USER_GUI_COMPOSE_FILE,
+    user_path,
+    resource_path,
 )
 from kalavai_client.core import (
     fetch_resources,
@@ -28,22 +39,29 @@ from kalavai_client.core import (
     fetch_devices,
     fetch_job_logs,
     fetch_gpus,
-    load_gpu_models
+    load_gpu_models,
+    fetch_job_templates,
+    fetch_job_defaults,
+    deploy_job,
+    delete_job,
+    check_token,
+    attach_to_pool,
+    join_pool,
+    create_pool,
+    get_ip_addresses,
+    pause_agent,
+    resume_agent
 )
 from kalavai_client.utils import (
     check_gpu_drivers,
+    load_template,
     run_cmd,
-    decode_dict,
     generate_join_token,
     user_confirm,
-    load_template,
-    store_server_info,
     generate_table,
     request_to_server,
-    resource_path,
     safe_remove,
     leave_vpn,
-    get_vpn_details,
     load_server_info,
     user_login,
     user_logout,
@@ -51,9 +69,6 @@ from kalavai_client.utils import (
     register_cluster,
     unregister_cluster,
     get_public_seeds,
-    validate_join_public_seed,
-    is_storage_compatible,
-    is_watcher_alive,
     load_user_session,
     SERVER_IP_KEY,
     AUTH_KEY,
@@ -62,112 +77,30 @@ from kalavai_client.utils import (
     WRITE_AUTH_KEY,
     PUBLIC_LOCATION_KEY,
     NODE_NAME_KEY,
-    CLUSTER_NAME_KEY
-    CLUSTER_IP_KEY,
-    CLUSTER_TOKEN_KEY,
-    WATCHER_PORT_KEY,
-    MANDATORY_TOKEN_FIELDS,
-    USER_NODE_LABEL_KEY,
-    ALLOW_UNREGISTERED_USER_KEY
-)
-from kalavai_client.cluster import (
-    dockerCluster
+    CLUSTER_NAME_KEY
 )
 
 
-KALAVAI_PLATFORM_URL = os.getenv("KALAVAI_PLATFORM_URL", "https://platform.kalavai.net")
 LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
 VERSION = 1
 RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
 CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
 RAY_LABEL = "kalavai.ray.name"
 PVC_NAME_LABEL = "kalavai.storage.name"
-DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
 VPN_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/vpn-template.yaml")
-POOL_CONFIG_TEMPLATE = resource_path("kalavai_client/assets/pool_config_template.yaml")
-POOL_CONFIG_DEFAULT_VALUES = resource_path("kalavai_client/assets/pool_config_values.yaml")
-USER_WORKSPACE_TEMPLATE = resource_path("kalavai_client/assets/user_workspace.yaml")
-DEFAULT_USER_WORKSPACE_VALUES = resource_path("kalavai_client/assets/user_workspace_values.yaml")
 STORAGE_CLASS_NAME = "local-path"
 STORAGE_ACCESS_MODE = ["ReadWriteOnce"]
-STORAGE_CLASS_LABEL = "kalavai.storage.enabled"
 DEFAULT_STORAGE_NAME = "pool-cache"
 DEFAULT_STORAGE_SIZE = 20
-DEFAULT_WATCHER_PORT = 30001
-USER_NODE_LABEL = "kalavai.cluster.user"
-KUBE_VERSION = os.getenv("KALAVAI_KUBE_VERSION", "v1.31.1+k3s1")
-DEFAULT_FLANNEL_IFACE = os.getenv("KALAVAI_FLANNEL_IFACE", "netmaker-1")
-FORBIDEDEN_IPS = ["127.0.0.1"]
-# kalavai templates
-HELM_APPS_FILE = resource_path("kalavai_client/assets/apps.yaml")
-HELM_APPS_VALUES = resource_path("kalavai_client/assets/apps_values.yaml")
-# user specific config files
-DEFAULT_CONTAINER_NAME = "kalavai"
-DEFAULT_VPN_CONTAINER_NAME = "kalavai-vpn"
-CONTAINER_HOST_PATH = user_path("pool/", create_path=True)
-USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
-USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
-USER_HELM_APPS_FILE = user_path("apps.yaml")
-USER_KUBECONFIG_FILE = user_path("kubeconfig")
-USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
-
 
+
 console = Console()
-CLUSTER = dockerCluster(
-    container_name=DEFAULT_CONTAINER_NAME,
-    kube_version=KUBE_VERSION,
-    flannel_iface=DEFAULT_FLANNEL_IFACE,
-    compose_file=USER_COMPOSE_FILE,
-    kubeconfig_file=USER_KUBECONFIG_FILE,
-    poolconfig_file=USER_LOCAL_SERVER_FILE,
-    dependencies_file=USER_HELM_APPS_FILE
-)
 
 
 ######################
 ## HELPER FUNCTIONS ##
 ######################
 
-def check_seed_compatibility():
-    """Check required packages to start pools"""
-    logs = []
-    console.log("[white]Checking system requirements...")
-    # docker
-    try:
-        run_cmd("docker version >/dev/null 2>&1")
-    except:
-        logs.append("[red]Docker not installed. Install instructions:\n")
-        logs.append(" Linux: https://docs.docker.com/engine/install/\n")
-        logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
-
-    if len(logs) == 0:
-        console.log("[green]System is ready to start a pool")
-        return True
-    else:
-        for log in logs:
-            console.log(log)
-        return False
-
-def check_worker_compatibility():
-    """Check required packages to join pools"""
-    logs = []
-    console.log("[white]Checking system requirements...")
-    # docker
-    try:
-        run_cmd("docker version >/dev/null 2>&1")
-    except:
-        logs.append("[red]Docker not installed. Install instructions:\n")
-        logs.append(" Linux: https://docs.docker.com/engine/install/\n")
-        logs.append(" Windows/MacOS: https://docs.docker.com/desktop/\n")
-
-    if len(logs) == 0:
-        console.log("[green]System is ready to join a pool")
-        return True
-    else:
-        for log in logs:
-            console.log(log)
-        return False
-
 
 def cleanup_local():
     console.log("Removing local cache files...")
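The net effect of the four hunks above: hard-coded constants and the local dockerCluster construction leave cli.py. Path and template constants now come from kalavai_client.env, and a preconfigured CLUSTER singleton is imported from kalavai_client.cluster (the +25/-2 change listed above). A minimal sketch of the new import surface, using only names that appear in this diff (module layout beyond these names is an assumption):

    # sketch: the CLI now consumes a ready-made singleton instead of wiring
    # dockerCluster(...) itself
    from kalavai_client.cluster import CLUSTER
    from kalavai_client.env import (
        CONTAINER_HOST_PATH,   # formerly user_path("pool/", create_path=True) in cli.py
        USER_COMPOSE_FILE,
        DOCKER_COMPOSE_GUI,
        user_path,
        resource_path,
    )

    # CLUSTER already carries the compose/kubeconfig/dependency file paths,
    # so callers just invoke lifecycle methods, e.g. CLUSTER.is_cluster_init()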
@@ -178,6 +111,7 @@ def cleanup_local():
     safe_remove(USER_KUBECONFIG_FILE)
     safe_remove(USER_LOCAL_SERVER_FILE)
     safe_remove(USER_TEMPLATES_FOLDER)
+    safe_remove(USER_GUI_COMPOSE_FILE)
 
 def pre_join_check(node_name, server_url, server_key):
     # check with the server that we can connect
@@ -217,75 +151,11 @@ def set_schedulable(schedulable, node_name=load_server_info(data_key=NODE_NAME_K
     except Exception as e:
         console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
 
-
-def init_user_workspace(force_namespace=None):
-
-    # load template config and populate with values
-    sidecar_template_yaml = load_template(
-        template_path=USER_WORKSPACE_TEMPLATE,
-        values={},
-        default_values_path=DEFAULT_USER_WORKSPACE_VALUES)
-
-    try:
-        data = {"config": sidecar_template_yaml}
-        if force_namespace is not None:
-            data["force_namespace"] = force_namespace
-        result = request_to_server(
-            method="post",
-            endpoint="/v1/create_user_space",
-            data=data,
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        console.log(f"Workspace creation (ignore already created warnings): {result}" )
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-
-def pool_init(pool_config_values_path=None):
-    """Deploy configured objects to initialise pool"""
-    if pool_config_values_path is None:
-        return
-
-    # load template config and populate with values
-    sidecar_template_yaml = load_template(
-        template_path=POOL_CONFIG_TEMPLATE,
-        values={},
-        default_values_path=pool_config_values_path)
-
-    try:
-        result = request_to_server(
-            method="post",
-            endpoint="/v1/deploy_generic_model",
-            data={"config": sidecar_template_yaml},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        if 'failed' in result and len(result['failed']) > 0:
-            console.log(f"[red]Error when deploying pool config\n\n{result['failed']}")
-        if len(result['successful']) > 0:
-            console.log(f"[green]Deployed pool config!")
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-
 def select_ip_address(subnet=None):
-    ips =
-
-
-
-        try:
-            ip = ni.ifaddresses(iface)[ni.AF_INET][0]['addr']
-            if ip in FORBIDEDEN_IPS:
-                continue
-            if subnet is None or ipaddress.ip_address(ip) in ipaddress.ip_network(subnet):
-                ips.append(ip)
-        except:
-            pass
-        if len(ips) == 1:
-            return ips[0]
-        time.sleep(2)
-        retry -= 1
-        if retry < 0:
-            raise ValueError(f"No IPs available on subnet {subnet}")
+    ips = get_ip_addresses(subnet=subnet)
+    if len(ips) == 1:
+        return ips[0]
+
     while True:
         option = user_confirm(
             question="Select IP to advertise the node (needs to be visible to other nodes)",
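Interface scanning (previously netifaces-based, with the FORBIDEDEN_IPS filter) now lives behind kalavai_client.core.get_ip_addresses. A hedged sketch of the slimmed-down selection flow — only the signature get_ip_addresses(subnet=...) appears in the diff; a return type of a list of address strings is an assumption:

    from kalavai_client.core import get_ip_addresses

    ips = get_ip_addresses(subnet=None)   # None assumed to mean "all interfaces"
    if len(ips) == 1:
        ip = ips[0]                       # unambiguous: use it directly
    else:
        ip = choose(ips)                  # choose() is hypothetical; cli.py loops
                                          # over user_confirm() at this point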
@@ -336,50 +206,50 @@ def select_token_type():
         break
     return {"admin": choice == 0, "user": choice == 1, "worker": choice == 2}
 
-def
+def input_gpus():
     num_gpus = 0
-
-
-
-
-
-
-
-
-
-
-
-    if node_labels is not None:
-        node_labels = " ".join([f"--node-label {key}={value}" for key, value in node_labels.items()])
-    compose_values = {
-        "user_path": user_path(""),
-        "service_name": DEFAULT_CONTAINER_NAME,
-        "vpn": is_public,
-        "vpn_name": DEFAULT_VPN_CONTAINER_NAME,
-        "pool_ip": pool_ip,
-        "pool_token": pool_token,
-        "vpn_token": vpn_token,
-        "node_name": node_name,
-        "command": role,
-        "storage_enabled": "True",
-        "num_gpus": num_gpus,
-        "k3s_path": f"{CONTAINER_HOST_PATH}/k3s",
-        "etc_path": f"{CONTAINER_HOST_PATH}/etc",
-        "node_labels": node_labels,
-        "flannel_iface": DEFAULT_FLANNEL_IFACE if is_public else ""
-    }
-    # generate local config files
-    compose_yaml = load_template(
-        template_path=DOCKER_COMPOSE_TEMPLATE,
-        values=compose_values)
-    with open(USER_COMPOSE_FILE, "w") as f:
-        f.write(compose_yaml)
-    return compose_yaml
+    try:
+        has_gpus = check_gpu_drivers()
+        if has_gpus:
+            max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
+            num_gpus = user_confirm(
+                question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
+                options=range(max_gpus+1)
+            )
+    except:
+        console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
+    return num_gpus
 
 ##################
 ## CLI COMMANDS ##
 ##################
 
+@arguably.command
+def gui__start(*others, gui_port=3000, backend_port=8000):
+    """Run GUI"""
+    values = {
+        "path": user_path(""),
+        "gui_port": gui_port,
+        "backend_port": backend_port
+    }
+    compose_yaml = load_template(
+        template_path=DOCKER_COMPOSE_GUI,
+        values=values)
+    with open(USER_GUI_COMPOSE_FILE, "w") as f:
+        f.write(compose_yaml)
+
+    run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} up -d")
+
+    console.log(f"[green]Loading GUI, may take a few minutes. It will be available at http://localhost:{gui_port}")
+
+@arguably.command
+def gui__stop(*others):
+    """Stop GUI"""
+    run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} down")
+
+    console.log("[green]Kalavai GUI has been stopped")
+
+
 @arguably.command
 def login(*others, username: str=None):
     """
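The new gui__start/gui__stop commands above (surfaced by arguably as `kalavai gui start` / `kalavai gui stop`) are thin wrappers: render the bundled compose template, write it under the user path, and shell out to docker compose. A minimal sketch of the same steps driven programmatically — all names come from the diff, but the contents of the GUI compose template are not part of this hunk and are assumed:

    from kalavai_client.env import DOCKER_COMPOSE_GUI, USER_GUI_COMPOSE_FILE, user_path
    from kalavai_client.utils import load_template, run_cmd

    # render the packaged template with the ports the GUI should bind to
    compose_yaml = load_template(
        template_path=DOCKER_COMPOSE_GUI,
        values={"path": user_path(""), "gui_port": 3000, "backend_port": 8000})
    with open(USER_GUI_COMPOSE_FILE, "w") as f:
        f.write(compose_yaml)
    # bring the GUI stack up; swap "up -d" for "down" to stop it
    run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} up -d")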
@@ -461,8 +331,9 @@ def pool__publish(*others, description=None):
         description = description
 
     try:
-
-
+        valid = check_token(token=token, public=True)
+        if "error" in valid:
+            raise ValueError(f"[red]Cluster must be started with a valid vpn_location to publish: {valid}")
         cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
 
         register_cluster(
@@ -523,7 +394,7 @@ def pool__list(*others, user_only=False):
 
 
 @arguably.command
-def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=
+def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=None, pool_config_values: str=None):
     """
     Start Kalavai pool and start/resume sharing resources.
 
@@ -531,9 +402,6 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
     *others: all the other positional arguments go here
     """
 
-    if not check_seed_compatibility():
-        return
-
     if CLUSTER.is_cluster_init():
         console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
         return
@@ -547,127 +415,25 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
         console.log("Installation was cancelled and did not complete.")
         return
 
-    # if only registered users are allowed, check user has logged in
-    user = defaultdict(lambda: None)
-    if only_registered_users or location is not None:
-        user = user_login(user_cookie=USER_COOKIE)
-        if user is None:
-            console.log("[white]--only-registered-users [red]or [white]--location[red] can only be used if the host is authenticated. Run [yellow]kalavai login[red] to authenticate")
-            exit()
-
-    # join private network if provided
-    vpn = defaultdict(lambda: None)
-    node_labels = {
-        STORAGE_CLASS_LABEL: is_storage_compatible()
-    }
-    if location is not None:
-        console.log("Fetching VPN credentials")
-        try:
-            vpn = get_vpn_details(
-                location=location,
-                user_cookie=USER_COOKIE)
-            node_labels[USER_NODE_LABEL] = user["username"]
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            return
-
-    # Generate docker compose recipe
-    generate_compose_config(
-        role="server",
-        vpn_token=vpn["key"],
-        node_name=socket.gethostname(),
-        node_labels=node_labels,
-        is_public=location is not None
-    )
-
-    # start server
-    console.log("Deploying seed...")
-    CLUSTER.start_seed_node()
-
-    while not CLUSTER.is_agent_running():
-        console.log("Waiting for seed to start...")
-        time.sleep(10)
-
     # select IP address (for external discovery)
     if ip_address is None and location is None:
         # local IP
         console.log(f"Scanning for valid IPs")
         ip_address = select_ip_address()
-
-        # load VPN ip
-        ip_address = CLUSTER.get_vpn_ip()
+
     console.log(f"Using {ip_address} address for server")
 
-
-    auth_key = str(uuid.uuid4())
-    write_auth_key = str(uuid.uuid4())
-    readonly_auth_key = str(uuid.uuid4())
-
-    watcher_service = f"{ip_address}:{DEFAULT_WATCHER_PORT}"
-    values = {
-        CLUSTER_NAME_KEY: cluster_name,
-        CLUSTER_IP_KEY: ip_address,
-        AUTH_KEY: auth_key,
-        READONLY_AUTH_KEY: readonly_auth_key,
-        WRITE_AUTH_KEY: write_auth_key,
-        WATCHER_PORT_KEY: DEFAULT_WATCHER_PORT,
-        WATCHER_SERVICE_KEY: watcher_service,
-        USER_NODE_LABEL_KEY: USER_NODE_LABEL,
-        ALLOW_UNREGISTERED_USER_KEY: not only_registered_users
-    }
+    console.log(f"[green]Creating {cluster_name} pool, this may take a few minutes...")
 
-
-        server_ip=ip_address,
-        auth_key=auth_key,
-        readonly_auth_key=readonly_auth_key,
-        write_auth_key=write_auth_key,
-        file=USER_LOCAL_SERVER_FILE,
-        watcher_service=watcher_service,
-        node_name=socket.gethostname(),
+    create_pool(
         cluster_name=cluster_name,
-
-
-
-
-
-
-
-        default_values_path=app_values,
-        force_defaults=True)
-    with open(USER_HELM_APPS_FILE, "w") as f:
-        f.write(helm_yaml)
-
-    console.log("[green]Config files have been generated in your local machine\n")
-
-    console.log("Setting pool dependencies...")
-    # set template values in helmfile
-    try:
-        CLUSTER.update_dependencies(
-            dependencies_file=USER_HELM_APPS_FILE
-        )
-    except Exception as e:
-        console.log(f"Error: {str(e)}")
-        exit()
-    console.log("[green]Your pool is ready! Grow it by sharing your joining token with others. Run [yellow]kalavai pool token[green] to generate one.")
-
-    if location is not None:
-        # register with kalavai if it's a public cluster
-        console.log("Registering public cluster with Kalavai...")
-        pool__publish()
-
-    # wait until the server is ready to create objects
-    while True:
-        console.log("Waiting for core services to be ready, may take a few minutes...")
-        time.sleep(30)
-        if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
-            break
-    console.log("Initialise user workspace...")
-    pool_init(pool_config_values_path=pool_config_values)
-    # init default namespace
-    init_user_workspace(force_namespace="default")
-    if only_registered_users:
-        # init user namespace
-        init_user_workspace()
+        ip_address=ip_address,
+        app_values=app_values,
+        pool_config_values=pool_config_values,
+        num_gpus=input_gpus(),
+        only_registered_users=only_registered_users,
+        location=location
+    )
 
     return None
 
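pool__start loses roughly a hundred lines of seed bootstrapping here (auth keys, watcher values, helmfile dependencies, workspace init); all of it now sits behind kalavai_client.core.create_pool. A sketch of the full call as the CLI now issues it — argument names are verbatim from the diff, while the return value is not shown in this hunk, so the "error"-key convention used by the other core calls is an assumption:

    from kalavai_client.core import create_pool

    result = create_pool(
        cluster_name="my-pool",          # hypothetical pool name
        ip_address=None,                 # None: CLI scans/prompts for one first
        app_values=None,
        pool_config_values=None,
        num_gpus=0,
        only_registered_users=False,
        location=None)                   # set a VPN location to make the pool public
    if isinstance(result, dict) and "error" in result:   # assumed convention
        print(result["error"])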
@@ -720,20 +486,13 @@ def pool__check_token(token, *others, public=False):
     """
     Utility to check the validity of a join token
     """
-
-
-
-        assert field in data
-        if public:
-            if data[PUBLIC_LOCATION_KEY] is None:
-                raise ValueError("Token is not valid for public pools. Did you start the cluster with a public_location?")
-        console.log("[green]Token format is correct")
-        return True
-    except Exception as e:
-        console.log(f"[white]{str(e)}")
-        console.log("[red]Token is invalid.")
+    result = check_token(token=token, public=public)
+    if "error" in result:
+        console.log(f"[red]Error in token: {result}")
         return False
-
+
+    console.log("[green]Token format is correct")
+    return True
 
 @arguably.command
 def pool__join(token, *others, node_name=None):
@@ -743,9 +502,6 @@ def pool__join(token, *others, node_name=None):
     Args:
         *others: all the other positional arguments go here
     """
-
-    if not check_worker_compatibility():
-        return
 
     # check that k3s is not running already in the host
     # k3s service running or preinstalled
@@ -763,119 +519,26 @@ def pool__join(token, *others, node_name=None):
         console.log("[green]Nothing happened.")
         return
 
-
-        node_name = socket.gethostname()
-
-    # check token
-    if not pool__check_token(token):
-        return
-
-    try:
-        data = decode_dict(token)
-        kalavai_seed_ip = data[CLUSTER_IP_KEY]
-        kalavai_token = data[CLUSTER_TOKEN_KEY]
-        cluster_name = data[CLUSTER_NAME_KEY]
-        auth_key = data[AUTH_KEY]
-        watcher_service = data[WATCHER_SERVICE_KEY]
-        public_location = data[PUBLIC_LOCATION_KEY]
-        vpn = defaultdict(lambda: None)
-    except Exception as e:
-        console.log(str(e))
-        console.log("[red] Invalid token")
-        return
-
-    # join private network if provided
-    node_labels = {
-        STORAGE_CLASS_LABEL: is_storage_compatible()
-    }
-    user = defaultdict(lambda: None)
-    if public_location is not None:
-        user = user_login(user_cookie=USER_COOKIE)
-        if user is None:
-            console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
-            exit()
-        console.log("Fetching VPN credentials")
-        try:
-            vpn = get_vpn_details(
-                location=public_location,
-                user_cookie=USER_COOKIE)
-            node_labels[USER_NODE_LABEL] = user["username"]
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            console.log("Are you authenticated? Try [yellow]kalavai login")
-            return
-        try:
-            validate_join_public_seed(
-                cluster_name=cluster_name,
-                join_key=token,
-                user_cookie=USER_COOKIE
-            )
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            return
-
-    # local agent join
-    # 1. Generate local cache files
-    console.log("Generating config files...")
-
-    # Generate docker compose recipe
-    generate_compose_config(
-        role="agent",
-        pool_ip=f"https://{kalavai_seed_ip}:6443",
-        pool_token=kalavai_token,
-        vpn_token=vpn["key"],
-        node_name=node_name,
-        node_labels=node_labels,
-        is_public=public_location is not None)
-
-    store_server_info(
-        server_ip=kalavai_seed_ip,
-        auth_key=auth_key,
-        file=USER_LOCAL_SERVER_FILE,
-        watcher_service=watcher_service,
-        node_name=node_name,
-        cluster_name=cluster_name,
-        public_location=public_location,
-        user_api_key=user["api_key"])
+    num_gpus = input_gpus()
 
     option = user_confirm(
         question="Docker compose ready. Would you like Kalavai to deploy it?",
         options=["no", "yes"]
     )
     if option == 0:
-        console.log("
-        print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
+        console.log("[red]Installation aborted")
         return
 
-    console.log(
-
-
-
-
-
-
-
-
-        console.log("Waiting for core services to be ready, may take a few minutes...")
-        time.sleep(30)
-        if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
-            break
-
-    # check the node has connected successfully
-    try:
-        while not CLUSTER.is_agent_running():
-            console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
-            time.sleep(30)
-    except KeyboardInterrupt:
-        console.log("[red]Installation aborted. Leaving pool.")
-        pool__stop()
-        return
-
-    init_user_workspace()
-
-    # set status to schedulable
-    console.log(f"[green] You are connected to {cluster_name}")
+    console.log("Connecting worker to the pool...")
+    result = join_pool(
+        token=token,
+        node_name=node_name,
+        num_gpus=num_gpus
+    )
+    if "error" in result:
+        console.log(f"[red]Error when connecting: {result}")
+    else:
+        console.log(f"[green] You are connected to {result}")
 
 @arguably.command
 def pool__stop(*others, skip_node_deletion=False):
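This hunk shows the pattern repeated across the rest of the file: token parsing, VPN negotiation and compose generation collapse into a single core call that reports failure by returning a dict with an "error" key rather than raising. A sketch of the convention from a library consumer's point of view (signatures verbatim from the diff; on success, the CLI prints the result as the cluster you are connected to):

    from kalavai_client.core import check_token, join_pool

    valid = check_token(token=token, public=False)   # token: a join token string
    if "error" in valid:
        raise SystemExit(f"invalid token: {valid}")

    result = join_pool(token=token, node_name=None, num_gpus=0)
    if "error" in result:
        raise SystemExit(f"join failed: {result}")
    print(f"connected to {result}")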
@@ -929,11 +592,12 @@ def pool__pause(*others):
     """
     # k3s stop locally
     console.log("[white] Pausing kalavai app...")
-    success =
-    if success:
-        console.log("[
+    success = pause_agent()
+    if "error" in success:
+        console.log(f"[red] Error when stopping. {success['error']}")
     else:
-        console.log("[
+        console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
+
 
 @arguably.command
 def pool__resume(*others):
@@ -948,10 +612,12 @@ def pool__resume(*others):
         console.log("[red] Kalavai app was not started before, please run [yellow]kalavai pool start[red] to start a pool or [yellow]kalavai pool join[red] to join one first")
         return
     console.log("[white] Restarting sharing (may take a few minutes)...")
-
-
+    success = resume_agent()
+    if "error" in success:
+        console.log(f"[red] Error when restarting. {success['error']}")
     else:
-        console.log("[
+        console.log("[white] Kalava sharing resumed")
+
 
 
 @arguably.command
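Pause and resume follow the same shape: pause_agent() and resume_agent() take no arguments and return a dict carrying "error" on failure. A minimal sketch (signatures from the diff; the success payload is not shown in these hunks and is assumed to be an informational dict):

    from kalavai_client.core import pause_agent, resume_agent

    outcome = pause_agent()
    if "error" in outcome:
        print(f"pause failed: {outcome['error']}")
    # ... later, bring sharing back
    outcome = resume_agent()
    if "error" in outcome:
        print(f"resume failed: {outcome['error']}")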
@@ -1090,7 +756,7 @@ def pool__attach(token, *others, node_name=None):
     """
 
     if node_name is None:
-        node_name = socket.gethostname()
+        node_name = f"{socket.gethostname()}-{uuid.uuid4().hex[:6]}"
 
     # check that is not attached to another instance
     if os.path.exists(USER_LOCAL_SERVER_FILE):
@@ -1102,70 +768,6 @@ def pool__attach(token, *others, node_name=None):
         console.log("[green]Nothing happened.")
         return
 
-    # check token
-    if not pool__check_token(token):
-        return
-
-    try:
-        data = decode_dict(token)
-        kalavai_seed_ip = data[CLUSTER_IP_KEY]
-        cluster_name = data[CLUSTER_NAME_KEY]
-        auth_key = data[AUTH_KEY]
-        watcher_service = data[WATCHER_SERVICE_KEY]
-        public_location = data[PUBLIC_LOCATION_KEY]
-        vpn = defaultdict(lambda: None)
-    except Exception as e:
-        console.log(str(e))
-        console.log("[red] Invalid token")
-        return
-
-    user = defaultdict(lambda: None)
-    if public_location is not None:
-        user = user_login(user_cookie=USER_COOKIE)
-        if user is None:
-            console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
-            exit()
-        console.log("Fetching VPN credentials")
-        try:
-            vpn = get_vpn_details(
-                location=public_location,
-                user_cookie=USER_COOKIE)
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            console.log("Are you authenticated? Try [yellow]kalavai login")
-            return
-        try:
-            validate_join_public_seed(
-                cluster_name=cluster_name,
-                join_key=token,
-                user_cookie=USER_COOKIE
-            )
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            return
-
-    # local agent join
-    # 1. Generate local cache files
-    console.log("Generating config files...")
-
-    # Generate docker compose recipe
-    generate_compose_config(
-        use_gpus=False,
-        role="",
-        vpn_token=vpn["key"],
-        node_name=node_name,
-        is_public=public_location is not None)
-
-    store_server_info(
-        server_ip=kalavai_seed_ip,
-        auth_key=auth_key,
-        file=USER_LOCAL_SERVER_FILE,
-        watcher_service=watcher_service,
-        node_name=node_name,
-        cluster_name=cluster_name,
-        public_location=public_location,
-        user_api_key=user["api_key"])
-
     option = user_confirm(
         question="Docker compose ready. Would you like Kalavai to deploy it?",
         options=["no", "yes"]
@@ -1175,17 +777,13 @@ def pool__attach(token, *others, node_name=None):
         print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
         return
 
-
-    run_cmd(f"docker compose -f {USER_COMPOSE_FILE} up -d")
-    # ensure we are connected
-    while True:
-        console.log("Waiting for core services to be ready, may take a few minutes...")
-        time.sleep(30)
-        if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
-            break
+    result = attach_to_pool(token=token, node_name=node_name)
 
+    if "error" in result:
+        console.log(f"[red]Error when attaching to pool: {result}")
+        return
     # set status to schedulable
-    console.log(f"[green] You are connected to {
+    console.log(f"[green] You are connected to {result}")
 
 
 @arguably.command
@@ -1409,18 +1007,13 @@ def job__templates(*others):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-
-
-
-
-
-
-
-        )
-        console.log("Templates available in the pool")
-        console.log(result)
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+    templates = fetch_job_templates()
+    if "error" in templates:
+        console.log(f"[red]Error when fetching templates: {str(e)}")
+        return
+
+    console.log("Templates available in the pool")
+    console.log(templates)
 
 
 @arguably.command
@@ -1476,26 +1069,16 @@ def job__run(template_name, *others, values: str=None, force_namespace: str=None
         annotation_key="nvidia.com/nouse-gputype"
     )
 
-
-
-
-
-
-    if force_namespace is not None:
-        data["force_namespace"] = force_namespace
+    result = deploy_job(
+        template_name=template_name,
+        values_dict=values_dict,
+        force_namespace=force_namespace
+    )
 
-
-
-
-        endpoint="/v1/deploy_job",
-        data=data,
-        server_creds=USER_LOCAL_SERVER_FILE,
-        user_cookie=USER_COOKIE
-    )
+    if "error" in result:
+        console.log(f"[red]Error when deploying job: {str(e)}")
+    else:
         console.log(f"[green]{template_name} job deployed")
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-        return
 
 @arguably.command
 def job__test(local_template_dir, *others, values, defaults, force_namespace: str=None):
@@ -1568,22 +1151,12 @@ def job__defaults(template_name, *others):
         return
 
     # deploy template with kube-watcher
-
-
-
-
-
-
-        endpoint="/v1/job_defaults",
-        data=data,
-        server_creds=USER_LOCAL_SERVER_FILE,
-        user_cookie=USER_COOKIE
-    )
-    print(
-        json.dumps(result,indent=3)
-    )
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+    defaults = fetch_job_defaults(name=template_name)
+    if "error" in defaults:
+        console.log(f"[red]Error when fetching job defaults: {defaults}")
+    print(
+        json.dumps(defaults, indent=3)
+    )
 
 
 @arguably.command
@@ -1601,23 +1174,11 @@ def job__delete(name, *others, force_namespace: str=None):
     console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
 
     # deploy template with kube-watcher
-
-
-    "
-
-    if force_namespace is not None:
-        data["force_namespace"] = force_namespace
-    try:
-        result = request_to_server(
-            method="post",
-            endpoint="/v1/delete_labeled_resources",
-            data=data,
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
+    result = delete_job(name=name, force_namespace=force_namespace)
+    if "error" in result:
+        console.log(f"[red]Error when deleting job: {str(e)}")
+    else:
         console.log(f"{result}")
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
 
 
 @arguably.command
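The job commands end the same way: listing templates, fetching defaults, deploying and deleting are now one-call wrappers over kalavai_client.core. (Note that the new error branches in job__templates, job__run and job__delete interpolate str(e) with no enclosing except — a bug visible in the new code.) A combined sketch under the same assumptions as above — signatures are verbatim from the diff, while the template name and empty values dict are hypothetical:

    from kalavai_client.core import (
        fetch_job_templates, fetch_job_defaults, deploy_job, delete_job)

    templates = fetch_job_templates()                    # dict with "error" on failure
    defaults = fetch_job_defaults(name="vllm")           # "vllm": hypothetical template
    result = deploy_job(template_name="vllm", values_dict={}, force_namespace=None)
    if "error" not in result:
        delete_job(name="vllm", force_namespace=None)    # clean up the deployment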