kalavai-client 0.5.15__py3-none-any.whl → 0.5.16__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
- kalavai_client/__init__.py +1 -1
- kalavai_client/assets/apps.yaml +1 -1
- kalavai_client/assets/docker-compose-gui.yaml +10 -0
- kalavai_client/assets/docker-compose-template.yaml +5 -3
- kalavai_client/cli.py +143 -584
- kalavai_client/cluster.py +25 -2
- kalavai_client/core.py +655 -4
- kalavai_client/env.py +41 -2
- kalavai_client/utils.py +55 -19
- {kalavai_client-0.5.15.dist-info → kalavai_client-0.5.16.dist-info}/METADATA +5 -4
- kalavai_client-0.5.16.dist-info/RECORD +23 -0
- {kalavai_client-0.5.15.dist-info → kalavai_client-0.5.16.dist-info}/WHEEL +1 -1
- kalavai_client-0.5.15.dist-info/RECORD +0 -22
- {kalavai_client-0.5.15.dist-info → kalavai_client-0.5.16.dist-info}/LICENSE +0 -0
- {kalavai_client-0.5.15.dist-info → kalavai_client-0.5.16.dist-info}/entry_points.txt +0 -0
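The substance of this release is a refactor of kalavai_client/cli.py: pool lifecycle logic that previously lived inline in the CLI moves into kalavai_client.core, and the commands become thin wrappers. The new core functions signal failure by returning a dict that carries an "error" key instead of printing and exiting. A minimal sketch of the resulting programmatic surface, assuming only the signatures visible at the call sites in the diff below (the token value is a placeholder):

    # Sketch: names and keyword arguments come from the call sites in the diff
    # below; nothing here is a documented public API.
    from kalavai_client.core import check_token, pause_agent, resume_agent

    valid = check_token(token="<join token>", public=False)  # placeholder token
    if "error" in valid:
        raise SystemExit(f"Invalid token: {valid}")

    paused = pause_agent()   # returns a dict; failure carries an "error" key
    if "error" in paused:
        print(paused["error"])
    resume_agent()           # same convention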
kalavai_client/cli.py
CHANGED
@@ -7,19 +7,30 @@ import time
 import socket
 from pathlib import Path
 from getpass import getpass
-import ipaddress
 from sys import exit
 
 import yaml
-
+
 import arguably
 from rich.console import Console
 
+from kalavai_client.cluster import CLUSTER
 from kalavai_client.env import (
     USER_COOKIE,
     USER_LOCAL_SERVER_FILE,
     TEMPLATE_LABEL,
-
+    KALAVAI_PLATFORM_URL,
+    DEFAULT_VPN_CONTAINER_NAME,
+    CONTAINER_HOST_PATH,
+    USER_COMPOSE_FILE,
+    USER_HELM_APPS_FILE,
+    USER_KUBECONFIG_FILE,
+    USER_VPN_COMPOSE_FILE,
+    USER_TEMPLATES_FOLDER,
+    DOCKER_COMPOSE_GUI,
+    USER_GUI_COMPOSE_FILE,
+    user_path,
+    resource_path,
 )
 from kalavai_client.core import (
     fetch_resources,
@@ -28,22 +39,29 @@ from kalavai_client.core import (
     fetch_devices,
     fetch_job_logs,
     fetch_gpus,
-    load_gpu_models
+    load_gpu_models,
+    fetch_job_templates,
+    fetch_job_defaults,
+    deploy_job,
+    delete_job,
+    check_token,
+    attach_to_pool,
+    join_pool,
+    create_pool,
+    get_ip_addresses,
+    pause_agent,
+    resume_agent
 )
 from kalavai_client.utils import (
     check_gpu_drivers,
+    load_template,
     run_cmd,
-    decode_dict,
     generate_join_token,
     user_confirm,
-    load_template,
-    store_server_info,
     generate_table,
     request_to_server,
-    resource_path,
     safe_remove,
     leave_vpn,
-    get_vpn_details,
     load_server_info,
     user_login,
     user_logout,
@@ -51,9 +69,6 @@ from kalavai_client.utils import (
     register_cluster,
     unregister_cluster,
     get_public_seeds,
-    validate_join_public_seed,
-    is_storage_compatible,
-    is_watcher_alive,
     load_user_session,
     SERVER_IP_KEY,
     AUTH_KEY,
@@ -62,112 +77,30 @@ from kalavai_client.utils import (
     WRITE_AUTH_KEY,
     PUBLIC_LOCATION_KEY,
     NODE_NAME_KEY,
-    CLUSTER_NAME_KEY,
-    CLUSTER_IP_KEY,
-    CLUSTER_TOKEN_KEY,
-    WATCHER_PORT_KEY,
-    MANDATORY_TOKEN_FIELDS,
-    USER_NODE_LABEL_KEY,
-    ALLOW_UNREGISTERED_USER_KEY
-)
-from kalavai_client.cluster import (
-    dockerCluster
+    CLUSTER_NAME_KEY
 )
 
 
-KALAVAI_PLATFORM_URL = os.getenv("KALAVAI_PLATFORM_URL", "https://platform.kalavai.net")
 LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
 VERSION = 1
 RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
 CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
 RAY_LABEL = "kalavai.ray.name"
 PVC_NAME_LABEL = "kalavai.storage.name"
-DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
 VPN_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/vpn-template.yaml")
-POOL_CONFIG_TEMPLATE = resource_path("kalavai_client/assets/pool_config_template.yaml")
-POOL_CONFIG_DEFAULT_VALUES = resource_path("kalavai_client/assets/pool_config_values.yaml")
-USER_WORKSPACE_TEMPLATE = resource_path("kalavai_client/assets/user_workspace.yaml")
-DEFAULT_USER_WORKSPACE_VALUES = resource_path("kalavai_client/assets/user_workspace_values.yaml")
 STORAGE_CLASS_NAME = "local-path"
 STORAGE_ACCESS_MODE = ["ReadWriteOnce"]
-STORAGE_CLASS_LABEL = "kalavai.storage.enabled"
 DEFAULT_STORAGE_NAME = "pool-cache"
 DEFAULT_STORAGE_SIZE = 20
-DEFAULT_WATCHER_PORT = 30001
-USER_NODE_LABEL = "kalavai.cluster.user"
-KUBE_VERSION = os.getenv("KALAVAI_KUBE_VERSION", "v1.31.1+k3s1")
-DEFAULT_FLANNEL_IFACE = os.getenv("KALAVAI_FLANNEL_IFACE", "netmaker-1")
-FORBIDEDEN_IPS = ["127.0.0.1"]
-# kalavai templates
-HELM_APPS_FILE = resource_path("kalavai_client/assets/apps.yaml")
-HELM_APPS_VALUES = resource_path("kalavai_client/assets/apps_values.yaml")
-# user specific config files
-DEFAULT_CONTAINER_NAME = "kalavai"
-DEFAULT_VPN_CONTAINER_NAME = "kalavai-vpn"
-CONTAINER_HOST_PATH = user_path("pool/", create_path=True)
-USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
-USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
-USER_HELM_APPS_FILE = user_path("apps.yaml")
-USER_KUBECONFIG_FILE = user_path("kubeconfig")
-USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
-
 
+
 console = Console()
-CLUSTER = dockerCluster(
-    container_name=DEFAULT_CONTAINER_NAME,
-    kube_version=KUBE_VERSION,
-    flannel_iface=DEFAULT_FLANNEL_IFACE,
-    compose_file=USER_COMPOSE_FILE,
-    kubeconfig_file=USER_KUBECONFIG_FILE,
-    poolconfig_file=USER_LOCAL_SERVER_FILE,
-    dependencies_file=USER_HELM_APPS_FILE
-)
 
 
 ######################
 ## HELPER FUNCTIONS ##
 ######################
 
-def check_seed_compatibility():
-    """Check required packages to start pools"""
-    logs = []
-    console.log("[white]Checking system requirements...")
-    # docker
-    try:
-        run_cmd("docker version >/dev/null 2>&1")
-    except:
-        logs.append("[red]Docker not installed. Install instructions:\n")
-        logs.append("   Linux: https://docs.docker.com/engine/install/\n")
-        logs.append("   Windows/MacOS: https://docs.docker.com/desktop/\n")
-
-    if len(logs) == 0:
-        console.log("[green]System is ready to start a pool")
-        return True
-    else:
-        for log in logs:
-            console.log(log)
-        return False
-
-def check_worker_compatibility():
-    """Check required packages to join pools"""
-    logs = []
-    console.log("[white]Checking system requirements...")
-    # docker
-    try:
-        run_cmd("docker version >/dev/null 2>&1")
-    except:
-        logs.append("[red]Docker not installed. Install instructions:\n")
-        logs.append("   Linux: https://docs.docker.com/engine/install/\n")
-        logs.append("   Windows/MacOS: https://docs.docker.com/desktop/\n")
-
-    if len(logs) == 0:
-        console.log("[green]System is ready to join a pool")
-        return True
-    else:
-        for log in logs:
-            console.log(log)
-        return False
-
 
 def cleanup_local():
     console.log("Removing local cache files...")
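Net effect of the hunks above: module-level constants move out of cli.py, with asset and user-config paths now imported from kalavai_client.env, and the preconfigured dockerCluster instance now built inside kalavai_client.cluster and imported as CLUSTER. A sketch of the new import surface (only names appearing in the import hunks are assumed; the printed values are illustrative):

    # Sketch: these names are taken from the new import block at the top of cli.py.
    from kalavai_client.env import USER_COMPOSE_FILE, USER_GUI_COMPOSE_FILE, user_path
    from kalavai_client.cluster import CLUSTER

    print(user_path(""))              # base directory for user config files
    print(USER_COMPOSE_FILE)          # worker docker-compose path
    print(CLUSTER.is_cluster_init())  # method name as used by pool__start below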
@@ -178,6 +111,7 @@ def cleanup_local():
     safe_remove(USER_KUBECONFIG_FILE)
     safe_remove(USER_LOCAL_SERVER_FILE)
     safe_remove(USER_TEMPLATES_FOLDER)
+    safe_remove(USER_GUI_COMPOSE_FILE)
 
 def pre_join_check(node_name, server_url, server_key):
     # check with the server that we can connect
@@ -217,75 +151,9 @@ def set_schedulable(schedulable, node_name=load_server_info(data_key=NODE_NAME_K
     except Exception as e:
         console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
 
-
-def init_user_workspace(force_namespace=None):
-
-    # load template config and populate with values
-    sidecar_template_yaml = load_template(
-        template_path=USER_WORKSPACE_TEMPLATE,
-        values={},
-        default_values_path=DEFAULT_USER_WORKSPACE_VALUES)
-
-    try:
-        data = {"config": sidecar_template_yaml}
-        if force_namespace is not None:
-            data["force_namespace"] = force_namespace
-        result = request_to_server(
-            method="post",
-            endpoint="/v1/create_user_space",
-            data=data,
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        console.log(f"Workspace creation (ignore already created warnings): {result}" )
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-
-def pool_init(pool_config_values_path=None):
-    """Deploy configured objects to initialise pool"""
-    if pool_config_values_path is None:
-        return
-
-    # load template config and populate with values
-    sidecar_template_yaml = load_template(
-        template_path=POOL_CONFIG_TEMPLATE,
-        values={},
-        default_values_path=pool_config_values_path)
-
-    try:
-        result = request_to_server(
-            method="post",
-            endpoint="/v1/deploy_generic_model",
-            data={"config": sidecar_template_yaml},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        if 'failed' in result and len(result['failed']) > 0:
-            console.log(f"[red]Error when deploying pool config\n\n{result['failed']}")
-        if len(result['successful']) > 0:
-            console.log(f"[green]Deployed pool config!")
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-
 def select_ip_address(subnet=None):
-    ips = []
-
-    while len(ips) == 0:
-        for iface in ni.interfaces():
-            try:
-                ip = ni.ifaddresses(iface)[ni.AF_INET][0]['addr']
-                if ip in FORBIDEDEN_IPS:
-                    continue
-                if subnet is None or ipaddress.ip_address(ip) in ipaddress.ip_network(subnet):
-                    ips.append(ip)
-            except:
-                pass
-        if len(ips) == 1:
-            return ips[0]
-        time.sleep(2)
-        retry -= 1
-        if retry < 0:
-            raise ValueError(f"No IPs available on subnet {subnet}")
+    ips = get_ip_addresses(subnet=subnet)
+
     while True:
         option = user_confirm(
             question="Select IP to advertise the node (needs to be visible to other nodes)",
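select_ip_address keeps only the interactive prompt; interface scanning, loopback filtering and subnet matching now sit behind core.get_ip_addresses. A sketch of the call, assuming (from the surrounding code) that it returns a list of candidate IP strings:

    # Sketch: the subnet keyword matches the call site above; the list return
    # type is inferred from how the result feeds user_confirm, not documented.
    from kalavai_client.core import get_ip_addresses

    candidates = get_ip_addresses()                       # all non-loopback IPs
    lan_only = get_ip_addresses(subnet="192.168.0.0/24")  # restricted to a subnet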
@@ -336,50 +204,50 @@ def select_token_type():
         break
     return {"admin": choice == 0, "user": choice == 1, "worker": choice == 2}
 
-def generate_compose_config(...):
+def input_gpus():
     num_gpus = 0
-
-
-
-
-
-
-
-
-
-
-
-    if node_labels is not None:
-        node_labels = " ".join([f"--node-label {key}={value}" for key, value in node_labels.items()])
-    compose_values = {
-        "user_path": user_path(""),
-        "service_name": DEFAULT_CONTAINER_NAME,
-        "vpn": is_public,
-        "vpn_name": DEFAULT_VPN_CONTAINER_NAME,
-        "pool_ip": pool_ip,
-        "pool_token": pool_token,
-        "vpn_token": vpn_token,
-        "node_name": node_name,
-        "command": role,
-        "storage_enabled": "True",
-        "num_gpus": num_gpus,
-        "k3s_path": f"{CONTAINER_HOST_PATH}/k3s",
-        "etc_path": f"{CONTAINER_HOST_PATH}/etc",
-        "node_labels": node_labels,
-        "flannel_iface": DEFAULT_FLANNEL_IFACE if is_public else ""
-    }
-    # generate local config files
-    compose_yaml = load_template(
-        template_path=DOCKER_COMPOSE_TEMPLATE,
-        values=compose_values)
-    with open(USER_COMPOSE_FILE, "w") as f:
-        f.write(compose_yaml)
-    return compose_yaml
+    try:
+        has_gpus = check_gpu_drivers()
+        if has_gpus:
+            max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
+            num_gpus = user_confirm(
+                question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
+                options=range(max_gpus+1)
+            )
+    except:
+        console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
+    return num_gpus
 
 ##################
 ## CLI COMMANDS ##
 ##################
 
+@arguably.command
+def gui__start(*others, gui_port=3000, backend_port=8000):
+    """Run GUI"""
+    values = {
+        "path": user_path(""),
+        "gui_port": gui_port,
+        "backend_port": backend_port
+    }
+    compose_yaml = load_template(
+        template_path=DOCKER_COMPOSE_GUI,
+        values=values)
+    with open(USER_GUI_COMPOSE_FILE, "w") as f:
+        f.write(compose_yaml)
+
+    run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} up -d")
+
+    console.log(f"[green]Loading GUI, may take a few minutes. It will be available at http://localhost:{gui_port}")
+
+@arguably.command
+def gui__stop(*others):
+    """Stop GUI"""
+    run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} down")
+
+    console.log("[green]Kalavai GUI has been stopped")
+
+
 @arguably.command
 def login(*others, username: str=None):
     """
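The new gui__start command renders the packaged docker-compose-gui.yaml template (the +10 lines in the file list) into USER_GUI_COMPOSE_FILE and hands it to docker compose; gui__stop tears it down. With arguably's double-underscore convention these should surface as kalavai gui start and kalavai gui stop. The equivalent steps outside the CLI, using only names shown in this diff:

    # Sketch: replicates the body of gui__start line for line.
    from kalavai_client.env import DOCKER_COMPOSE_GUI, USER_GUI_COMPOSE_FILE, user_path
    from kalavai_client.utils import load_template, run_cmd

    compose_yaml = load_template(
        template_path=DOCKER_COMPOSE_GUI,
        values={"path": user_path(""), "gui_port": 3000, "backend_port": 8000})
    with open(USER_GUI_COMPOSE_FILE, "w") as f:
        f.write(compose_yaml)
    run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} up -d")
    # the GUI is then served on http://localhost:3000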
@@ -461,8 +329,9 @@ def pool__publish(*others, description=None):
     description = description
 
     try:
-
-
+        valid = check_token(token=token, public=True)
+        if "error" in valid:
+            raise ValueError(f"[red]Cluster must be started with a valid vpn_location to publish: {valid}")
         cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
 
         register_cluster(
@@ -523,7 +392,7 @@ def pool__list(*others, user_only=False):
 
 
 @arguably.command
-def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=
+def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_address: str=None, location: str=None, app_values: str=None, pool_config_values: str=None):
     """
     Start Kalavai pool and start/resume sharing resources.
 
@@ -531,9 +400,6 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
         *others: all the other positional arguments go here
     """
 
-    if not check_seed_compatibility():
-        return
-
     if CLUSTER.is_cluster_init():
         console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
         return
@@ -547,127 +413,25 @@ def pool__start(cluster_name, *others, only_registered_users: bool=False, ip_ad
         console.log("Installation was cancelled and did not complete.")
         return
 
-    # if only registered users are allowed, check user has logged in
-    user = defaultdict(lambda: None)
-    if only_registered_users or location is not None:
-        user = user_login(user_cookie=USER_COOKIE)
-        if user is None:
-            console.log("[white]--only-registered-users [red]or [white]--location[red] can only be used if the host is authenticated. Run [yellow]kalavai login[red] to authenticate")
-            exit()
-
-    # join private network if provided
-    vpn = defaultdict(lambda: None)
-    node_labels = {
-        STORAGE_CLASS_LABEL: is_storage_compatible()
-    }
-    if location is not None:
-        console.log("Fetching VPN credentials")
-        try:
-            vpn = get_vpn_details(
-                location=location,
-                user_cookie=USER_COOKIE)
-            node_labels[USER_NODE_LABEL] = user["username"]
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            return
-
-    # Generate docker compose recipe
-    generate_compose_config(
-        role="server",
-        vpn_token=vpn["key"],
-        node_name=socket.gethostname(),
-        node_labels=node_labels,
-        is_public=location is not None
-    )
-
-    # start server
-    console.log("Deploying seed...")
-    CLUSTER.start_seed_node()
-
-    while not CLUSTER.is_agent_running():
-        console.log("Waiting for seed to start...")
-        time.sleep(10)
-
     # select IP address (for external discovery)
     if ip_address is None and location is None:
         # local IP
         console.log(f"Scanning for valid IPs")
         ip_address = select_ip_address()
-    else:
-        # load VPN ip
-        ip_address = CLUSTER.get_vpn_ip()
+
     console.log(f"Using {ip_address} address for server")
 
-
-    auth_key = str(uuid.uuid4())
-    write_auth_key = str(uuid.uuid4())
-    readonly_auth_key = str(uuid.uuid4())
-
-    watcher_service = f"{ip_address}:{DEFAULT_WATCHER_PORT}"
-    values = {
-        CLUSTER_NAME_KEY: cluster_name,
-        CLUSTER_IP_KEY: ip_address,
-        AUTH_KEY: auth_key,
-        READONLY_AUTH_KEY: readonly_auth_key,
-        WRITE_AUTH_KEY: write_auth_key,
-        WATCHER_PORT_KEY: DEFAULT_WATCHER_PORT,
-        WATCHER_SERVICE_KEY: watcher_service,
-        USER_NODE_LABEL_KEY: USER_NODE_LABEL,
-        ALLOW_UNREGISTERED_USER_KEY: not only_registered_users
-    }
+    console.log(f"[green]Creating {cluster_name} pool, this may take a few minutes...")
 
-    store_server_info(
-        server_ip=ip_address,
-        auth_key=auth_key,
-        readonly_auth_key=readonly_auth_key,
-        write_auth_key=write_auth_key,
-        file=USER_LOCAL_SERVER_FILE,
-        watcher_service=watcher_service,
-        node_name=socket.gethostname(),
+    create_pool(
         cluster_name=cluster_name,
-
-
-
-
-
-
-
-        default_values_path=app_values,
-        force_defaults=True)
-    with open(USER_HELM_APPS_FILE, "w") as f:
-        f.write(helm_yaml)
-
-    console.log("[green]Config files have been generated in your local machine\n")
-
-    console.log("Setting pool dependencies...")
-    # set template values in helmfile
-    try:
-        CLUSTER.update_dependencies(
-            dependencies_file=USER_HELM_APPS_FILE
-        )
-    except Exception as e:
-        console.log(f"Error: {str(e)}")
-        exit()
-    console.log("[green]Your pool is ready! Grow it by sharing your joining token with others. Run [yellow]kalavai pool token[green] to generate one.")
-
-    if location is not None:
-        # register with kalavai if it's a public cluster
-        console.log("Registering public cluster with Kalavai...")
-        pool__publish()
-
-    # wait until the server is ready to create objects
-    while True:
-        console.log("Waiting for core services to be ready, may take a few minutes...")
-        time.sleep(30)
-        if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
-            break
-    console.log("Initialise user workspace...")
-    pool_init(pool_config_values_path=pool_config_values)
-    # init default namespace
-    init_user_workspace(force_namespace="default")
-    if only_registered_users:
-        # init user namespace
-        init_user_workspace()
+        ip_address=ip_address,
+        app_values=app_values,
+        pool_config_values=pool_config_values,
+        num_gpus=input_gpus(),
+        only_registered_users=only_registered_users,
+        location=location
+    )
 
     return None
 
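pool__start now delegates roughly a hundred lines of setup (login checks, VPN credentials, compose generation, seed deployment, watcher polling, workspace initialisation) to core.create_pool. The keyword arguments below are exactly those of the new call site; their defaults and the return value are not visible in this diff:

    # Sketch: kwargs mirror the new pool__start body; values are illustrative.
    from kalavai_client.core import create_pool

    create_pool(
        cluster_name="my-pool",
        ip_address="192.168.0.10",   # the CLI picks this via select_ip_address()
        app_values=None,
        pool_config_values=None,
        num_gpus=0,
        only_registered_users=False,
        location=None)               # a VPN location string makes the pool public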
@@ -720,20 +484,13 @@ def pool__check_token(token, *others, public=False):
     """
     Utility to check the validity of a join token
     """
-    try:
-        data = decode_dict(token)
-        for field in MANDATORY_TOKEN_FIELDS:
-            assert field in data
-        if public:
-            if data[PUBLIC_LOCATION_KEY] is None:
-                raise ValueError("Token is not valid for public pools. Did you start the cluster with a public_location?")
-        console.log("[green]Token format is correct")
-        return True
-    except Exception as e:
-        console.log(f"[white]{str(e)}")
-        console.log("[red]Token is invalid.")
+    result = check_token(token=token, public=public)
+    if "error" in result:
+        console.log(f"[red]Error in token: {result}")
         return False
-
+
+    console.log("[green]Token format is correct")
+    return True
 
 @arguably.command
 def pool__join(token, *others, node_name=None):
@@ -743,9 +500,6 @@ def pool__join(token, *others, node_name=None):
     Args:
         *others: all the other positional arguments go here
     """
-
-    if not check_worker_compatibility():
-        return
 
     # check that k3s is not running already in the host
     # k3s service running or preinstalled
@@ -763,119 +517,26 @@ def pool__join(token, *others, node_name=None):
             console.log("[green]Nothing happened.")
             return
 
-    if node_name is None:
-        node_name = socket.gethostname()
-
-    # check token
-    if not pool__check_token(token):
-        return
-
-    try:
-        data = decode_dict(token)
-        kalavai_seed_ip = data[CLUSTER_IP_KEY]
-        kalavai_token = data[CLUSTER_TOKEN_KEY]
-        cluster_name = data[CLUSTER_NAME_KEY]
-        auth_key = data[AUTH_KEY]
-        watcher_service = data[WATCHER_SERVICE_KEY]
-        public_location = data[PUBLIC_LOCATION_KEY]
-        vpn = defaultdict(lambda: None)
-    except Exception as e:
-        console.log(str(e))
-        console.log("[red] Invalid token")
-        return
-
-    # join private network if provided
-    node_labels = {
-        STORAGE_CLASS_LABEL: is_storage_compatible()
-    }
-    user = defaultdict(lambda: None)
-    if public_location is not None:
-        user = user_login(user_cookie=USER_COOKIE)
-        if user is None:
-            console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
-            exit()
-        console.log("Fetching VPN credentials")
-        try:
-            vpn = get_vpn_details(
-                location=public_location,
-                user_cookie=USER_COOKIE)
-            node_labels[USER_NODE_LABEL] = user["username"]
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            console.log("Are you authenticated? Try [yellow]kalavai login")
-            return
-        try:
-            validate_join_public_seed(
-                cluster_name=cluster_name,
-                join_key=token,
-                user_cookie=USER_COOKIE
-            )
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            return
-
-    # local agent join
-    # 1. Generate local cache files
-    console.log("Generating config files...")
-
-    # Generate docker compose recipe
-    generate_compose_config(
-        role="agent",
-        pool_ip=f"https://{kalavai_seed_ip}:6443",
-        pool_token=kalavai_token,
-        vpn_token=vpn["key"],
-        node_name=node_name,
-        node_labels=node_labels,
-        is_public=public_location is not None)
-
-    store_server_info(
-        server_ip=kalavai_seed_ip,
-        auth_key=auth_key,
-        file=USER_LOCAL_SERVER_FILE,
-        watcher_service=watcher_service,
-        node_name=node_name,
-        cluster_name=cluster_name,
-        public_location=public_location,
-        user_api_key=user["api_key"])
+    num_gpus = input_gpus()
 
     option = user_confirm(
         question="Docker compose ready. Would you like Kalavai to deploy it?",
         options=["no", "yes"]
     )
     if option == 0:
-        console.log("
-        print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
+        console.log("[red]Installation aborted")
         return
 
-    console.log(
-
-
-
-
-
-
-
-
-
-        console.log("Waiting for core services to be ready, may take a few minutes...")
-        time.sleep(30)
-        if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
-            break
-
-    # check the node has connected successfully
-    try:
-        while not CLUSTER.is_agent_running():
-            console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
-            time.sleep(30)
-    except KeyboardInterrupt:
-        console.log("[red]Installation aborted. Leaving pool.")
-        pool__stop()
-        return
-
-    init_user_workspace()
-
-    # set status to schedulable
-    console.log(f"[green] You are connected to {cluster_name}")
+    console.log("Connecting worker to the pool...")
+    result = join_pool(
+        token=token,
+        node_name=node_name,
+        num_gpus=num_gpus
+    )
+    if "error" in result:
+        console.log(f"[red]Error when connecting: {result}")
+    else:
+        console.log(f"[green] You are connected to {result}")
 
 @arguably.command
 def pool__stop(*others, skip_node_deletion=False):
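pool__join follows the same pattern: token validation, VPN setup, compose generation and the connection wait loop move into core.join_pool, with the GPU count gathered up front by the new input_gpus helper. On success the returned value is interpolated into the "You are connected to" message, so it is presumably the pool name; a sketch:

    # Sketch: argument names match the new call site in pool__join; the success
    # value being the pool name is an inference from the log message.
    from kalavai_client.core import join_pool

    result = join_pool(token="<join token>", node_name=None, num_gpus=1)
    if "error" in result:
        print(f"Join failed: {result}")
    else:
        print(f"Connected to {result}")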
@@ -929,11 +590,12 @@ def pool__pause(*others):
     """
     # k3s stop locally
    console.log("[white] Pausing kalavai app...")
-    success =
-    if success:
-        console.log("[
+    success = pause_agent()
+    if "error" in success:
+        console.log(f"[red] Error when stopping. {success['error']}")
     else:
-        console.log("[
+        console.log("[white] Kalava sharing paused. Resume with [yellow]kalavai pool resume")
+
 
 @arguably.command
 def pool__resume(*others):
@@ -948,10 +610,12 @@ def pool__resume(*others):
         console.log("[red] Kalavai app was not started before, please run [yellow]kalavai pool start[red] to start a pool or [yellow]kalavai pool join[red] to join one first")
         return
     console.log("[white] Restarting sharing (may take a few minutes)...")
-
-
+    success = resume_agent()
+    if "error" in success:
+        console.log(f"[red] Error when restarting. {success['error']}")
     else:
-        console.log("[
+        console.log("[white] Kalava sharing resumed")
+
 
 
 @arguably.command
@@ -1090,7 +754,7 @@ def pool__attach(token, *others, node_name=None):
     """
 
     if node_name is None:
-        node_name = socket.gethostname()
+        node_name = f"{socket.gethostname()}-{uuid.uuid4().hex[:6]}"
 
     # check that is not attached to another instance
     if os.path.exists(USER_LOCAL_SERVER_FILE):
@@ -1102,70 +766,6 @@ def pool__attach(token, *others, node_name=None):
             console.log("[green]Nothing happened.")
             return
 
-    # check token
-    if not pool__check_token(token):
-        return
-
-    try:
-        data = decode_dict(token)
-        kalavai_seed_ip = data[CLUSTER_IP_KEY]
-        cluster_name = data[CLUSTER_NAME_KEY]
-        auth_key = data[AUTH_KEY]
-        watcher_service = data[WATCHER_SERVICE_KEY]
-        public_location = data[PUBLIC_LOCATION_KEY]
-        vpn = defaultdict(lambda: None)
-    except Exception as e:
-        console.log(str(e))
-        console.log("[red] Invalid token")
-        return
-
-    user = defaultdict(lambda: None)
-    if public_location is not None:
-        user = user_login(user_cookie=USER_COOKIE)
-        if user is None:
-            console.log("[red]Must be logged in to join public pools. Run [yellow]kalavai login[red] to authenticate")
-            exit()
-        console.log("Fetching VPN credentials")
-        try:
-            vpn = get_vpn_details(
-                location=public_location,
-                user_cookie=USER_COOKIE)
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            console.log("Are you authenticated? Try [yellow]kalavai login")
-            return
-        try:
-            validate_join_public_seed(
-                cluster_name=cluster_name,
-                join_key=token,
-                user_cookie=USER_COOKIE
-            )
-        except Exception as e:
-            console.log(f"[red]Error when joining network: {str(e)}")
-            return
-
-    # local agent join
-    # 1. Generate local cache files
-    console.log("Generating config files...")
-
-    # Generate docker compose recipe
-    generate_compose_config(
-        use_gpus=False,
-        role="",
-        vpn_token=vpn["key"],
-        node_name=node_name,
-        is_public=public_location is not None)
-
-    store_server_info(
-        server_ip=kalavai_seed_ip,
-        auth_key=auth_key,
-        file=USER_LOCAL_SERVER_FILE,
-        watcher_service=watcher_service,
-        node_name=node_name,
-        cluster_name=cluster_name,
-        public_location=public_location,
-        user_api_key=user["api_key"])
-
     option = user_confirm(
         question="Docker compose ready. Would you like Kalavai to deploy it?",
         options=["no", "yes"]
@@ -1175,17 +775,13 @@ def pool__attach(token, *others, node_name=None):
         print(f"docker compose -f {USER_COMPOSE_FILE} up -d")
         return
 
-
-    run_cmd(f"docker compose -f {USER_COMPOSE_FILE} up -d")
-    # ensure we are connected
-    while True:
-        console.log("Waiting for core services to be ready, may take a few minutes...")
-        time.sleep(30)
-        if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
-            break
+    result = attach_to_pool(token=token, node_name=node_name)
 
+    if "error" in result:
+        console.log(f"[red]Error when attaching to pool: {result}")
+        return
     # set status to schedulable
-    console.log(f"[green] You are connected to {
+    console.log(f"[green] You are connected to {result}")
 
 
 @arguably.command
@@ -1409,18 +1005,13 @@ def job__templates(*others):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-
-
-
-
-
-
-
-        )
-        console.log("Templates available in the pool")
-        console.log(result)
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+    templates = fetch_job_templates()
+    if "error" in templates:
+        console.log(f"[red]Error when fetching templates: {str(e)}")
+        return
+
+    console.log("Templates available in the pool")
+    console.log(templates)
 
 
 @arguably.command
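One rough edge in the replacement code here (and in the job__run and job__delete hunks below): the new error branches log str(e) although no exception object is in scope, so taking that branch would itself raise a NameError. A caller-side sketch that avoids the undefined name by logging the returned dict, as the other new handlers do:

    # Sketch: fetch_job_templates comes from the new core import; printing the
    # returned dict mirrors how pool__check_token reports errors.
    from kalavai_client.core import fetch_job_templates

    templates = fetch_job_templates()
    if "error" in templates:
        print(f"Error when fetching templates: {templates}")
    else:
        print(templates)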
@@ -1476,26 +1067,16 @@ def job__run(template_name, *others, values: str=None, force_namespace: str=None
         annotation_key="nvidia.com/nouse-gputype"
     )
 
-
-
-
-
-
-    if force_namespace is not None:
-        data["force_namespace"] = force_namespace
+    result = deploy_job(
+        template_name=template_name,
+        values_dict=values_dict,
+        force_namespace=force_namespace
+    )
 
-
-
-
-            endpoint="/v1/deploy_job",
-            data=data,
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
+    if "error" in result:
+        console.log(f"[red]Error when deploying job: {str(e)}")
+    else:
         console.log(f"[green]{template_name} job deployed")
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-        return
 
 
 @arguably.command
@@ -1568,22 +1149,12 @@ def job__defaults(template_name, *others):
         return
 
     # deploy template with kube-watcher
-
-
-
-
-
-
-            endpoint="/v1/job_defaults",
-            data=data,
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        print(
-            json.dumps(result,indent=3)
-        )
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+    defaults = fetch_job_defaults(name=template_name)
+    if "error" in defaults:
+        console.log(f"[red]Error when fetching job defaults: {defaults}")
+    print(
+        json.dumps(defaults, indent=3)
+    )
 
 
 @arguably.command
@@ -1601,23 +1172,11 @@ def job__delete(name, *others, force_namespace: str=None):
         console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
 
     # deploy template with kube-watcher
-
-
-        "
-
-    if force_namespace is not None:
-        data["force_namespace"] = force_namespace
-    try:
-        result = request_to_server(
-            method="post",
-            endpoint="/v1/delete_labeled_resources",
-            data=data,
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
+    result = delete_job(name=name, force_namespace=force_namespace)
+    if "error" in result:
+        console.log(f"[red]Error when deleting job: {str(e)}")
+    else:
         console.log(f"{result}")
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
 
 
 @arguably.command