kalavai-client 0.5.19__py3-none-any.whl → 0.5.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kalavai_client/__init__.py +1 -1
- kalavai_client/bridge_api.py +216 -0
- kalavai_client/bridge_models.py +37 -0
- kalavai_client/cli.py +42 -70
- kalavai_client/core.py +85 -9
- {kalavai_client-0.5.19.dist-info → kalavai_client-0.5.21.dist-info}/METADATA +53 -46
- {kalavai_client-0.5.19.dist-info → kalavai_client-0.5.21.dist-info}/RECORD +10 -8
- {kalavai_client-0.5.19.dist-info → kalavai_client-0.5.21.dist-info}/LICENSE +0 -0
- {kalavai_client-0.5.19.dist-info → kalavai_client-0.5.21.dist-info}/WHEEL +0 -0
- {kalavai_client-0.5.19.dist-info → kalavai_client-0.5.21.dist-info}/entry_points.txt +0 -0
kalavai_client/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
|
2
|
-
__version__ = "0.5.19"
|
2
|
+
__version__ = "0.5.21"
|
kalavai_client/bridge_api.py
ADDED
@@ -0,0 +1,216 @@
|
|
1
|
+
"""
|
2
|
+
Core kalavai service.
|
3
|
+
Used as a bridge between the kalavai-client app and the reflex frontend
|
4
|
+
"""
|
5
|
+
from fastapi import FastAPI
|
6
|
+
import uvicorn
|
7
|
+
|
8
|
+
from kalavai_client.bridge_models import (
|
9
|
+
CreatePoolRequest,
|
10
|
+
JoinPoolRequest,
|
11
|
+
StopPoolRequest,
|
12
|
+
DeployJobRequest,
|
13
|
+
DeleteJobRequest,
|
14
|
+
JobDetailsRequest,
|
15
|
+
DeleteNodesRequest
|
16
|
+
)
|
17
|
+
from kalavai_client.core import (
|
18
|
+
create_pool,
|
19
|
+
join_pool,
|
20
|
+
attach_to_pool,
|
21
|
+
stop_pool,
|
22
|
+
fetch_devices,
|
23
|
+
fetch_resources,
|
24
|
+
fetch_job_names,
|
25
|
+
fetch_gpus,
|
26
|
+
fetch_job_details,
|
27
|
+
fetch_job_logs,
|
28
|
+
fetch_job_templates,
|
29
|
+
fetch_job_defaults,
|
30
|
+
deploy_job,
|
31
|
+
delete_job,
|
32
|
+
authenticate_user,
|
33
|
+
load_user_session,
|
34
|
+
user_logout,
|
35
|
+
is_connected,
|
36
|
+
list_available_pools,
|
37
|
+
is_agent_running,
|
38
|
+
is_server,
|
39
|
+
pause_agent,
|
40
|
+
resume_agent,
|
41
|
+
get_ip_addresses,
|
42
|
+
get_pool_token,
|
43
|
+
delete_nodes,
|
44
|
+
TokenType
|
45
|
+
)
|
46
|
+
|
47
|
+
app = FastAPI()
|
48
|
+
|
49
|
+
@app.post("/create_pool")
|
50
|
+
def pool_create(request: CreatePoolRequest):
|
51
|
+
result = create_pool(
|
52
|
+
cluster_name=request.cluster_name,
|
53
|
+
ip_address=request.ip_address,
|
54
|
+
app_values=request.app_values,
|
55
|
+
num_gpus=request.num_gpus,
|
56
|
+
node_name=request.node_name,
|
57
|
+
only_registered_users=request.only_registered_users,
|
58
|
+
location=request.location
|
59
|
+
)
|
60
|
+
return result
|
61
|
+
|
62
|
+
@app.post("/join_pool")
|
63
|
+
def pool_join(request: JoinPoolRequest):
|
64
|
+
result = join_pool(
|
65
|
+
token=request.token,
|
66
|
+
num_gpus=request.num_gpus,
|
67
|
+
node_name=request.node_name
|
68
|
+
)
|
69
|
+
return result
|
70
|
+
|
71
|
+
@app.post("/attach_to_pool")
|
72
|
+
def pool_attach(request: JoinPoolRequest):
|
73
|
+
result = attach_to_pool(
|
74
|
+
token=request.token,
|
75
|
+
node_name=request.node_name
|
76
|
+
)
|
77
|
+
return result
|
78
|
+
|
79
|
+
@app.post("/stop_pool")
|
80
|
+
def pool_stop(request: StopPoolRequest):
|
81
|
+
result = stop_pool(
|
82
|
+
skip_node_deletion=request.skip_node_deletion
|
83
|
+
)
|
84
|
+
return result
|
85
|
+
|
86
|
+
@app.post("/delete_nodes")
|
87
|
+
def device_delete(request: DeleteNodesRequest):
|
88
|
+
result = delete_nodes(
|
89
|
+
nodes=request.nodes
|
90
|
+
)
|
91
|
+
return result
|
92
|
+
|
93
|
+
@app.get("/get_pool_token")
|
94
|
+
def devices(mode: int):
|
95
|
+
|
96
|
+
return get_pool_token(mode=TokenType(mode))
|
97
|
+
|
98
|
+
@app.get("/fetch_devices")
|
99
|
+
def devices():
|
100
|
+
return fetch_devices()
|
101
|
+
|
102
|
+
@app.get("/fetch_resources")
|
103
|
+
def resources():
|
104
|
+
return fetch_resources()
|
105
|
+
|
106
|
+
@app.get("/fetch_job_names")
|
107
|
+
def job_names():
|
108
|
+
return fetch_job_names()
|
109
|
+
|
110
|
+
@app.get("/fetch_gpus")
|
111
|
+
def gpus(available: bool = False):
|
112
|
+
return fetch_gpus(available=available)
|
113
|
+
|
114
|
+
@app.post("/fetch_job_details")
|
115
|
+
def job_details(request: JobDetailsRequest):
|
116
|
+
return fetch_job_details(jobs=request.jobs)
|
117
|
+
|
118
|
+
@app.get("/fetch_job_logs")
|
119
|
+
def job_logs(job_name: str, force_namespace: str=None, pod_name: str=None, tail: int=100):
|
120
|
+
return fetch_job_logs(
|
121
|
+
job_name=job_name,
|
122
|
+
force_namespace=force_namespace,
|
123
|
+
pod_name=pod_name,
|
124
|
+
tail=tail
|
125
|
+
)
|
126
|
+
|
127
|
+
@app.get("/fetch_job_templates")
|
128
|
+
def job_templates():
|
129
|
+
return fetch_job_templates()
|
130
|
+
|
131
|
+
@app.get("/fetch_job_defaults")
|
132
|
+
def job_templates(name: str):
|
133
|
+
return fetch_job_defaults(name=name)
|
134
|
+
|
135
|
+
@app.post("/deploy_job")
|
136
|
+
def job_deploy(request: DeployJobRequest):
|
137
|
+
result = deploy_job(
|
138
|
+
template_name=request.template_name,
|
139
|
+
values_dict=request.values,
|
140
|
+
force_namespace=request.force_namespace
|
141
|
+
)
|
142
|
+
return result
|
143
|
+
|
144
|
+
@app.post("/delete_job")
|
145
|
+
def job_delete(request: DeleteJobRequest):
|
146
|
+
result = delete_job(
|
147
|
+
name=request.name,
|
148
|
+
force_namespace=request.force_namespace
|
149
|
+
)
|
150
|
+
return result
|
151
|
+
|
152
|
+
@app.get("/authenticate_user")
|
153
|
+
def user_authenticate(username: str, password: str):
|
154
|
+
result = authenticate_user(
|
155
|
+
username=username,
|
156
|
+
password=password
|
157
|
+
)
|
158
|
+
return result
|
159
|
+
|
160
|
+
@app.get("/load_user_session")
|
161
|
+
def user_session():
|
162
|
+
result = load_user_session()
|
163
|
+
return result
|
164
|
+
|
165
|
+
@app.get("/user_logout")
|
166
|
+
def logout_user():
|
167
|
+
result = user_logout()
|
168
|
+
return result
|
169
|
+
|
170
|
+
@app.get("/is_connected")
|
171
|
+
def pool_connected():
|
172
|
+
result = is_connected()
|
173
|
+
return result
|
174
|
+
|
175
|
+
@app.get("/is_agent_running")
|
176
|
+
def agent_running():
|
177
|
+
result = is_agent_running()
|
178
|
+
return result
|
179
|
+
|
180
|
+
@app.get("/is_server")
|
181
|
+
def server():
|
182
|
+
result = is_server()
|
183
|
+
return result
|
184
|
+
|
185
|
+
@app.post("/pause_agent")
|
186
|
+
def agent_pause():
|
187
|
+
result = pause_agent()
|
188
|
+
return result
|
189
|
+
|
190
|
+
@app.post("/resume_agent")
|
191
|
+
def agent_resume():
|
192
|
+
result = resume_agent()
|
193
|
+
return result
|
194
|
+
|
195
|
+
@app.get("/get_ip_addresses")
|
196
|
+
def ip_addresses(subnet: str=None):
|
197
|
+
result = get_ip_addresses(subnet=subnet)
|
198
|
+
return result
|
199
|
+
|
200
|
+
@app.get("/list_available_pools")
|
201
|
+
def pool_connected(user_only: bool=False):
|
202
|
+
result = list_available_pools(user_only=user_only)
|
203
|
+
return result
|
204
|
+
|
205
|
+
|
206
|
+
def run_api(host="0.0.0.0", port=8001, log_level="critical"):
|
207
|
+
uvicorn.run(
|
208
|
+
app,
|
209
|
+
host=host,
|
210
|
+
port=port,
|
211
|
+
log_level=log_level
|
212
|
+
)
|
213
|
+
|
214
|
+
if __name__ == "__main__":
|
215
|
+
run_api()
|
216
|
+
|
kalavai_client/bridge_models.py
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
from pydantic import BaseModel
|
2
|
+
|
3
|
+
from kalavai_client.core import Job
|
4
|
+
|
5
|
+
|
6
|
+
class CreatePoolRequest(BaseModel):
|
7
|
+
cluster_name: str
|
8
|
+
ip_address: str
|
9
|
+
app_values: dict = None
|
10
|
+
num_gpus: int = None
|
11
|
+
node_name: str = None
|
12
|
+
only_registered_users: bool = False
|
13
|
+
location: str = None
|
14
|
+
|
15
|
+
class DeleteNodesRequest(BaseModel):
|
16
|
+
nodes: list[str]
|
17
|
+
|
18
|
+
class JoinPoolRequest(BaseModel):
|
19
|
+
token: str
|
20
|
+
node_name: str = None
|
21
|
+
num_gpus: int = None
|
22
|
+
|
23
|
+
class JobDetailsRequest(BaseModel):
|
24
|
+
jobs: list[Job]
|
25
|
+
|
26
|
+
|
27
|
+
class StopPoolRequest(BaseModel):
|
28
|
+
skip_node_deletion: bool = False
|
29
|
+
|
30
|
+
class DeployJobRequest(BaseModel):
|
31
|
+
template_name: str
|
32
|
+
values: dict
|
33
|
+
force_namespace: str = None
|
34
|
+
|
35
|
+
class DeleteJobRequest(BaseModel):
|
36
|
+
name: str
|
37
|
+
force_namespace: str = None
|
kalavai_client/cli.py
CHANGED
@@ -15,6 +15,7 @@ import arguably
|
|
15
15
|
from rich.console import Console
|
16
16
|
|
17
17
|
from kalavai_client.cluster import CLUSTER
|
18
|
+
from kalavai_client.bridge_api import run_api
|
18
19
|
from kalavai_client.env import (
|
19
20
|
USER_COOKIE,
|
20
21
|
USER_LOCAL_SERVER_FILE,
|
@@ -50,13 +51,15 @@ from kalavai_client.core import (
|
|
50
51
|
create_pool,
|
51
52
|
get_ip_addresses,
|
52
53
|
pause_agent,
|
53
|
-
resume_agent
|
54
|
+
resume_agent,
|
55
|
+
get_pool_token,
|
56
|
+
delete_nodes,
|
57
|
+
TokenType
|
54
58
|
)
|
55
59
|
from kalavai_client.utils import (
|
56
60
|
check_gpu_drivers,
|
57
61
|
load_template,
|
58
62
|
run_cmd,
|
59
|
-
generate_join_token,
|
60
63
|
user_confirm,
|
61
64
|
generate_table,
|
62
65
|
request_to_server,
|
@@ -71,11 +74,6 @@ from kalavai_client.utils import (
|
|
71
74
|
get_public_seeds,
|
72
75
|
load_user_session,
|
73
76
|
SERVER_IP_KEY,
|
74
|
-
AUTH_KEY,
|
75
|
-
WATCHER_SERVICE_KEY,
|
76
|
-
READONLY_AUTH_KEY,
|
77
|
-
WRITE_AUTH_KEY,
|
78
|
-
PUBLIC_LOCATION_KEY,
|
79
77
|
NODE_NAME_KEY,
|
80
78
|
CLUSTER_NAME_KEY
|
81
79
|
)
|
@@ -225,29 +223,30 @@ def input_gpus():
|
|
225
223
|
##################
|
226
224
|
|
227
225
|
@arguably.command
|
228
|
-
def gui__start(*others):
|
229
|
-
"""Run GUI"""
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
def gui__stop(*others):
|
245
|
-
"""Stop GUI"""
|
246
|
-
run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} down")
|
226
|
+
def gui__start(*others, backend_only=False, gui_frontend_port=3000, gui_backend_port=8000, bridge_port=8001):
|
227
|
+
"""Run GUI (docker) and kalavai core backend (api)"""
|
228
|
+
|
229
|
+
if not backend_only:
|
230
|
+
values = {
|
231
|
+
"gui_frontend_port": gui_frontend_port,
|
232
|
+
"gui_backend_port": gui_backend_port,
|
233
|
+
"path": user_path("")
|
234
|
+
}
|
235
|
+
compose_yaml = load_template(
|
236
|
+
template_path=DOCKER_COMPOSE_GUI,
|
237
|
+
values=values)
|
238
|
+
with open(USER_GUI_COMPOSE_FILE, "w") as f:
|
239
|
+
f.write(compose_yaml)
|
240
|
+
|
241
|
+
run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} up -d")
|
247
242
|
|
243
|
+
console.log(f"[green]Loading GUI, may take a few minutes. It will be available at http://localhost:{gui_frontend_port}")
|
244
|
+
run_api(port=bridge_port)
|
245
|
+
|
246
|
+
if not backend_only:
|
247
|
+
run_cmd(f"docker compose --file {USER_GUI_COMPOSE_FILE} down")
|
248
248
|
console.log("[green]Kalavai GUI has been stopped")
|
249
249
|
|
250
|
-
|
251
250
|
@arguably.command
|
252
251
|
def login(*others, username: str=None):
|
253
252
|
"""
|
@@ -320,14 +319,12 @@ def pool__publish(*others, description=None):
|
|
320
319
|
console.log(f"[red]Problems with your pool: {str(e)}")
|
321
320
|
return
|
322
321
|
choices = select_token_type()
|
323
|
-
token = pool__token(**choices)
|
322
|
+
token = pool__token(**choices)["token"]
|
324
323
|
|
325
324
|
if description is None:
|
326
325
|
console.log("[yellow] [Markdown] In a few words (max 500 chars), describe your goals with this cluster. Remember, this is what other users will see to decide whether to share their resources with you, [blue]so inspire them!")
|
327
326
|
description = input(f"(You can edit this later in {KALAVAI_PLATFORM_URL}\n")
|
328
327
|
|
329
|
-
description = description
|
330
|
-
|
331
328
|
try:
|
332
329
|
valid = check_token(token=token, public=True)
|
333
330
|
if "error" in valid:
|
@@ -451,32 +448,19 @@ def pool__token(*others, admin=False, user=False, worker=False):
|
|
451
448
|
return
|
452
449
|
|
453
450
|
if admin:
|
454
|
-
|
451
|
+
mode = TokenType.ADMIN
|
455
452
|
elif user:
|
456
|
-
|
453
|
+
mode = TokenType.USER
|
457
454
|
else:
|
458
|
-
|
459
|
-
|
460
|
-
watcher_service = load_server_info(data_key=WATCHER_SERVICE_KEY, file=USER_LOCAL_SERVER_FILE)
|
461
|
-
public_location = load_server_info(data_key=PUBLIC_LOCATION_KEY, file=USER_LOCAL_SERVER_FILE)
|
462
|
-
|
463
|
-
cluster_token = CLUSTER.get_cluster_token()
|
464
|
-
|
465
|
-
ip_address = load_server_info(SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)
|
466
|
-
cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
|
467
|
-
|
468
|
-
join_token = generate_join_token(
|
469
|
-
cluster_ip=ip_address,
|
470
|
-
cluster_name=cluster_name,
|
471
|
-
cluster_token=cluster_token,
|
472
|
-
auth_key=auth_key,
|
473
|
-
watcher_service=watcher_service,
|
474
|
-
public_location=public_location
|
475
|
-
)
|
455
|
+
mode = TokenType.WORKER
|
476
456
|
|
477
|
-
|
478
|
-
print(join_token)
|
457
|
+
join_token = get_pool_token(mode=mode)
|
479
458
|
|
459
|
+
if "error" in join_token:
|
460
|
+
console.log(f"[red]{join_token}")
|
461
|
+
else:
|
462
|
+
console.log("[green]Join token:")
|
463
|
+
print(join_token["token"])
|
480
464
|
return join_token
|
481
465
|
|
482
466
|
@arguably.command
|
@@ -949,24 +933,12 @@ def node__delete(name, *others):
|
|
949
933
|
console.log(f"[red]Problems with your pool: {str(e)}")
|
950
934
|
return
|
951
935
|
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
endpoint="/v1/delete_nodes",
|
959
|
-
data=data,
|
960
|
-
server_creds=USER_LOCAL_SERVER_FILE,
|
961
|
-
user_cookie=USER_COOKIE
|
962
|
-
)
|
963
|
-
if result is None or result is True:
|
964
|
-
console.log(f"Node {name} deleted successfully")
|
965
|
-
else:
|
966
|
-
console.log(f"{result}")
|
967
|
-
except Exception as e:
|
968
|
-
console.log(f"[yellow](ignore if stopping worker from dead server). Error when removing node {name}: {str(e)}")
|
969
|
-
|
936
|
+
result = delete_nodes(nodes=[name])
|
937
|
+
|
938
|
+
if "error" in result:
|
939
|
+
console.log(f"[red]{result}")
|
940
|
+
else:
|
941
|
+
console.log(f"[green]{result}")
|
970
942
|
|
971
943
|
@arguably.command
|
972
944
|
def node__cordon(node_name, *others):
|
kalavai_client/core.py
CHANGED
@@ -6,11 +6,14 @@ import uuid
|
|
6
6
|
import socket
|
7
7
|
import ipaddress
|
8
8
|
import netifaces as ni
|
9
|
-
|
9
|
+
from typing import Optional
|
10
10
|
from pydantic import BaseModel
|
11
|
+
from enum import Enum
|
11
12
|
|
12
13
|
from kalavai_client.cluster import CLUSTER
|
13
14
|
from kalavai_client.utils import (
|
15
|
+
check_gpu_drivers,
|
16
|
+
generate_join_token,
|
14
17
|
request_to_server,
|
15
18
|
load_server_info,
|
16
19
|
decode_dict,
|
@@ -68,11 +71,11 @@ from kalavai_client.env import (
|
|
68
71
|
)
|
69
72
|
|
70
73
|
class Job(BaseModel):
|
71
|
-
owner: str = None
|
72
|
-
name: str = None
|
73
|
-
workers: str = None
|
74
|
-
endpoint: str = None
|
75
|
-
status: str = None
|
74
|
+
owner: Optional[str] = None
|
75
|
+
name: Optional[str] = None
|
76
|
+
workers: Optional[str] = None
|
77
|
+
endpoint: Optional[str] = None
|
78
|
+
status: Optional[str] = None
|
76
79
|
|
77
80
|
class DeviceStatus(BaseModel):
|
78
81
|
name: str
|
@@ -89,6 +92,11 @@ class GPU(BaseModel):
|
|
89
92
|
ready: bool
|
90
93
|
model: str
|
91
94
|
|
95
|
+
class TokenType(Enum):
|
96
|
+
ADMIN = 0
|
97
|
+
USER = 1
|
98
|
+
WORKER = 2
|
99
|
+
|
92
100
|
|
93
101
|
def init_user_workspace(force_namespace=None):
|
94
102
|
|
@@ -281,8 +289,10 @@ def fetch_job_details(jobs: list[Job]):
|
|
281
289
|
status = "running"
|
282
290
|
elif any([st in workers_status for st in ["Failed", "Completed"]]):
|
283
291
|
status = "error"
|
284
|
-
|
292
|
+
elif any([st in workers_status for st in ["Pending"]]):
|
285
293
|
status = "pending"
|
294
|
+
else:
|
295
|
+
status = "working"
|
286
296
|
job_details.append(
|
287
297
|
Job(owner=namespace,
|
288
298
|
name=deployment,
|
@@ -461,6 +471,25 @@ def check_token(token, public=False):
|
|
461
471
|
return {"status": True}
|
462
472
|
except Exception as e:
|
463
473
|
return {"error": str(e)}
|
474
|
+
|
475
|
+
def delete_nodes(nodes):
|
476
|
+
data = {
|
477
|
+
"node_names": nodes
|
478
|
+
}
|
479
|
+
try:
|
480
|
+
result = request_to_server(
|
481
|
+
method="post",
|
482
|
+
endpoint="/v1/delete_nodes",
|
483
|
+
data=data,
|
484
|
+
server_creds=USER_LOCAL_SERVER_FILE,
|
485
|
+
user_cookie=USER_COOKIE
|
486
|
+
)
|
487
|
+
if result is None or result is True:
|
488
|
+
return {"success": nodes}
|
489
|
+
else:
|
490
|
+
return {"error": result}
|
491
|
+
except Exception as e:
|
492
|
+
return {"error": f"Error when removing nodes {nodes}: {str(e)}"}
|
464
493
|
|
465
494
|
def attach_to_pool(token, node_name=None):
|
466
495
|
if node_name is None:
|
@@ -530,11 +559,24 @@ def attach_to_pool(token, node_name=None):
|
|
530
559
|
|
531
560
|
return cluster_name
|
532
561
|
|
533
|
-
def
|
562
|
+
def get_max_gpus():
|
563
|
+
try:
|
564
|
+
has_gpus = check_gpu_drivers()
|
565
|
+
if has_gpus:
|
566
|
+
return int(run_cmd("nvidia-smi -L | wc -l").decode())
|
567
|
+
else:
|
568
|
+
return 0
|
569
|
+
except:
|
570
|
+
return 0
|
571
|
+
|
572
|
+
def join_pool(token, num_gpus=None, node_name=None):
|
534
573
|
compatibility = check_worker_compatibility()
|
535
574
|
if len(compatibility["issues"]) > 0:
|
536
575
|
return {"error": compatibility["issues"]}
|
537
|
-
|
576
|
+
|
577
|
+
if num_gpus is None:
|
578
|
+
num_gpus = get_max_gpus()
|
579
|
+
|
538
580
|
if node_name is None:
|
539
581
|
node_name = f"{socket.gethostname()}-{uuid.uuid4().hex[:6]}"
|
540
582
|
|
@@ -751,6 +793,40 @@ def create_pool(cluster_name: str, ip_address: str, app_values: str=None, pool_c
|
|
751
793
|
|
752
794
|
return {"success"}
|
753
795
|
|
796
|
+
def get_pool_token(mode: TokenType):
|
797
|
+
|
798
|
+
try:
|
799
|
+
match mode:
|
800
|
+
case TokenType.ADMIN:
|
801
|
+
auth_key = load_server_info(data_key=AUTH_KEY, file=USER_LOCAL_SERVER_FILE)
|
802
|
+
case TokenType.USER:
|
803
|
+
auth_key = load_server_info(data_key=WRITE_AUTH_KEY, file=USER_LOCAL_SERVER_FILE)
|
804
|
+
case _:
|
805
|
+
auth_key = load_server_info(data_key=READONLY_AUTH_KEY, file=USER_LOCAL_SERVER_FILE)
|
806
|
+
if auth_key is None:
|
807
|
+
return {"error": "Cannot generate selected token mode. Are you the seed node?"}
|
808
|
+
|
809
|
+
watcher_service = load_server_info(data_key=WATCHER_SERVICE_KEY, file=USER_LOCAL_SERVER_FILE)
|
810
|
+
public_location = load_server_info(data_key=PUBLIC_LOCATION_KEY, file=USER_LOCAL_SERVER_FILE)
|
811
|
+
|
812
|
+
cluster_token = CLUSTER.get_cluster_token()
|
813
|
+
|
814
|
+
ip_address = load_server_info(SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)
|
815
|
+
cluster_name = load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)
|
816
|
+
|
817
|
+
join_token = generate_join_token(
|
818
|
+
cluster_ip=ip_address,
|
819
|
+
cluster_name=cluster_name,
|
820
|
+
cluster_token=cluster_token,
|
821
|
+
auth_key=auth_key,
|
822
|
+
watcher_service=watcher_service,
|
823
|
+
public_location=public_location
|
824
|
+
)
|
825
|
+
|
826
|
+
return {"token": join_token}
|
827
|
+
except Exception as e:
|
828
|
+
return {"error": f"Error when generating token: {str(e)}"}
|
829
|
+
|
754
830
|
def pool_init(pool_config_values_path=None):
|
755
831
|
"""Deploy configured objects to initialise pool"""
|
756
832
|
if pool_config_values_path is None:
|
{kalavai_client-0.5.19.dist-info → kalavai_client-0.5.21.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: kalavai-client
|
3
|
-
Version: 0.5.19
|
3
|
+
Version: 0.5.21
|
4
4
|
Summary: Client app for kalavai platform
|
5
5
|
License: Apache-2.0
|
6
6
|
Keywords: LLM,platform
|
@@ -26,6 +26,7 @@ Requires-Dist: Pillow (==10.3.0)
|
|
26
26
|
Requires-Dist: anvil-uplink (==0.5.1)
|
27
27
|
Requires-Dist: arguably (>=1.2.5)
|
28
28
|
Requires-Dist: build ; extra == "dev"
|
29
|
+
Requires-Dist: fastapi (==0.115.8)
|
29
30
|
Requires-Dist: importlib_resources (==6.5.2)
|
30
31
|
Requires-Dist: jinja2 (==3.1.4)
|
31
32
|
Requires-Dist: netifaces (==0.11.0)
|
@@ -38,6 +39,7 @@ Requires-Dist: requests (>=2.25)
|
|
38
39
|
Requires-Dist: rich (==13.7.1)
|
39
40
|
Requires-Dist: setuptools (>75.0.0)
|
40
41
|
Requires-Dist: twine ; extra == "dev"
|
42
|
+
Requires-Dist: uvicorn (==0.34.0)
|
41
43
|
Project-URL: Homepage, https://platform.kalavai.net
|
42
44
|
Project-URL: Website, https://kalavai.net
|
43
45
|
Description-Content-Type: text/markdown
|
@@ -71,6 +73,19 @@ Description-Content-Type: text/markdown
|
|
71
73
|
|
72
74
|
Kalavai's goal is to make using LLMs in real applications accessible and affordable to all. It's a _magic box_ that **integrates all the components required to make LLM useful in the age of massive computing**, from sourcing computing power, managing distributed infrastructure and storage, using industry-standard model engines and orchestration of LLMs.
|
73
75
|
|
76
|
+
### Core features
|
77
|
+
|
78
|
+
- Manage **multiple devices resources as one**. One pool of RAM, CPUs and GPUs
|
79
|
+
- **Deploy Large Language Models seamlessly across devices**, wherever they are (multiple clouds, on premises, personal devices)
|
80
|
+
- Auto-discovery: all **models are automatically exposed** through a single OpenAI-like API and a ChatGPT-like UI playground
|
81
|
+
- Compatible with [most popular model engines](#support-for-llm-engines)
|
82
|
+
- [Easy to expand](https://github.com/kalavai-net/kube-watcher/tree/main/templates) to custom workloads
|
83
|
+
|
84
|
+
|
85
|
+
<details>
|
86
|
+
|
87
|
+
**<summary>Video tutorials</summary>**
|
88
|
+
|
74
89
|
### Aggregate multiple devices in an LLM pool
|
75
90
|
|
76
91
|
https://github.com/user-attachments/assets/4be59886-1b76-4400-ab5c-c803e3e414ec
|
@@ -88,12 +103,16 @@ https://github.com/user-attachments/assets/7df73bbc-d129-46aa-8ce5-0735177dedeb
|
|
88
103
|
https://github.com/user-attachments/assets/0d2316f3-79ea-46ac-b41e-8ef720f52672
|
89
104
|
|
90
105
|
|
91
|
-
|
106
|
+
</details>
|
92
107
|
|
93
|
-
|
108
|
+
### Latest updates
|
94
109
|
|
110
|
+
- 20 February 2025: New shiny GUI interface to control LLM pools and deploy models
|
95
111
|
- 6 February 2025: 🔥🔥🔥 Access **DeepSeek R1 model for free** when you join our [public LLM pool](https://kalavai-net.github.io/kalavai-client/public_llm_pool/)
|
96
112
|
- 31 January 2025: `kalavai-client` is now a [PyPI package](https://pypi.org/project/kalavai-client/), easier to install than ever!
|
113
|
+
<details>
|
114
|
+
<summary>More news</summary>
|
115
|
+
|
97
116
|
- 27 January 2025: Support for accessing pools from remote computers
|
98
117
|
- 9 January 2025: Added support for [Aphrodite Engine](https://github.com/aphrodite-engine/aphrodite-engine) models
|
99
118
|
- 8 January 2025: Release of [a free, public, shared pool](/docs/docs/public_llm_pool.md) for community LLM deployment
|
@@ -102,6 +121,7 @@ https://github.com/user-attachments/assets/0d2316f3-79ea-46ac-b41e-8ef720f52672
|
|
102
121
|
- 24 November 2024: Common pools with private user spaces
|
103
122
|
- 30 October 2024: Release of our [public pool platform](https://platform.kalavai.net)
|
104
123
|
|
124
|
+
</details>
|
105
125
|
|
106
126
|
### Support for LLM engines
|
107
127
|
|
@@ -136,8 +156,10 @@ Not what you were looking for? [Tell us](https://github.com/kalavai-net/kalavai-
|
|
136
156
|
|
137
157
|
The `kalavai-client` is the main tool to interact with the Kalavai platform, to create and manage both local and public pools and also to interact with them (e.g. deploy models). Let's go over its installation.
|
138
158
|
|
139
|
-
From release **v0.5.0, you can now install `kalavai-client` in non-worker computers**. You can run a pool on a set of machines and have the client on a remote computer from which you access the LLM pool. Because the client only requires having python installed, this means more computers are now supported to run it.
|
140
159
|
|
160
|
+
<details>
|
161
|
+
|
162
|
+
<summary>Requirements</summary>
|
141
163
|
|
142
164
|
### Requirements
|
143
165
|
|
@@ -150,8 +172,11 @@ For workers sharing resources with the pool:
|
|
150
172
|
|
151
173
|
Any system that runs python 3.6+ is able to run the `kalavai-client` and therefore connect and operate an LLM pool, [without sharing with the pool](). Your computer won't be adding its capacity to the pool, but it will be able to deploy jobs and interact with models.
|
152
174
|
|
175
|
+
</details>
|
176
|
+
|
177
|
+
<details>
|
153
178
|
|
154
|
-
|
179
|
+
<summary> Common issues</summary>
|
155
180
|
|
156
181
|
If you see the following error:
|
157
182
|
|
@@ -175,6 +200,7 @@ Upgrade your setuptools:
|
|
175
200
|
```bash
|
176
201
|
pip install -U setuptools
|
177
202
|
```
|
203
|
+
</details>
|
178
204
|
|
179
205
|
### Install the client
|
180
206
|
|
@@ -184,66 +210,44 @@ The client is a python package and can be installed with one command:
|
|
184
210
|
pip install kalavai-client
|
185
211
|
```
|
186
212
|
|
187
|
-
## Public LLM pools: crowdsource community resources
|
188
|
-
|
189
|
-
This is the **easiest and most powerful** way to experience Kalavai. It affords users the full resource capabilities of the community and access to all its deployed LLMs, via an [OpenAI-compatible endpoint](https://kalavai-net.github.io/kalavai-client/public_llm_pool/#single-api-endpoint) as well as a [UI-based playground](https://kalavai-net.github.io/kalavai-client/public_llm_pool/#ui-playground).
|
190
|
-
|
191
|
-
Check out [our guide](https://kalavai-net.github.io/kalavai-client/public_llm_pool/) on how to join and start deploying LLMs.
|
192
|
-
|
193
|
-
|
194
|
-
## Createa a local, private LLM pool
|
195
|
-
|
196
|
-
Kalavai is **free to use, no caps, for both commercial and non-commercial purposes**. All you need to get started is one or more computers that can see each other (i.e. within the same network), and you are good to go. If you wish to join computers in different locations / networks, check [managed kalavai](#public-pools-crowdsource-community-resources).
|
197
213
|
|
198
|
-
|
214
|
+
## Create a local, private LLM pool
|
199
215
|
|
200
|
-
|
216
|
+
> Kalavai is **free to use, no caps, for both commercial and non-commercial purposes**. All you need to get started is one or more computers that can see each other (i.e. within the same network), and you are good to go. If you are interested in joining computers in different locations / networks, [contact us](mailto:info@kalavai.net) or [book a demo](https://app.onecal.io/b/kalavai/book-a-demo) with the founders.
|
201
217
|
|
202
|
-
|
203
|
-
kalavai pool start <pool-name>
|
204
|
-
```
|
218
|
+
You can create and manage your pools with the new kalavai GUI, which can be started with:
|
205
219
|
|
206
|
-
Now you are ready to add worker nodes to this seed. To do so, generate a joining token:
|
207
220
|
```bash
|
208
|
-
|
209
|
-
|
210
|
-
Join token: <token>
|
221
|
+
kalavai gui start
|
211
222
|
```
|
212
223
|
|
213
|
-
|
214
|
-
|
215
|
-
Increase the power of your AI pool by inviting others to join.
|
224
|
+
This will expose the GUI and the backend services in localhost. By default, the GUI is accessible via [http://localhost:3000](http://localhost:3000). In the UI users can create and join LLM pools, monitor devices, deploy LLMs and more.
|
216
225
|
|
217
|
-
|
226
|
+

|
218
227
|
|
219
|
-
|
220
|
-
kalavai pool join <token>
|
221
|
-
```
|
228
|
+
Check out our [getting started guide](https://kalavai-net.github.io/kalavai-client/getting_started/) for next steps.
|
222
229
|
|
223
|
-
### 3. Attach more clients
|
224
230
|
|
225
|
-
|
231
|
+
## Public LLM pools: crowdsource community resources
|
226
232
|
|
227
|
-
|
228
|
-
kalavai pool attach <token>
|
229
|
-
```
|
233
|
+
This is the **easiest and most powerful** way to experience Kalavai. It affords users the full resource capabilities of the community and access to all its deployed LLMs, via an [OpenAI-compatible endpoint](https://kalavai-net.github.io/kalavai-client/public_llm_pool/#single-api-endpoint) as well as a [UI-based playground](https://kalavai-net.github.io/kalavai-client/public_llm_pool/#ui-playground).
|
230
234
|
|
231
|
-
|
235
|
+
Check out [our guide](https://kalavai-net.github.io/kalavai-client/public_llm_pool/) on how to join and start deploying LLMs.
|
232
236
|
|
233
237
|
|
234
|
-
|
238
|
+
## Enough already, let's run stuff!
|
235
239
|
|
236
|
-
Check our [examples](examples/) to put your new AI pool to good use!
|
237
|
-
- [Single node vLLM GPU LLM](examples/singlenode_gpu_vllm.md) deployment
|
238
|
-
- [Multi node vLLM GPU LLM](examples/multinode_gpu_vllm.md) deployment
|
239
|
-
- [Aphrodite-engine quantized LLM](examples/quantized_gpu_llm.md) deployment, including Kobold interface
|
240
|
-
- [Ray cluster](examples/ray_cluster.md) for distributed computation.
|
240
|
+
Check our [examples](examples/) to put your new AI pool to good use! For an end to end tour, check our [self-hosted](https://kalavai-net.github.io/kalavai-client/self_hosted_llm_pool/) and [public LLM pools](https://kalavai-net.github.io/kalavai-client/public_llm_pool/) guides.
|
241
241
|
|
242
242
|
|
243
243
|
## Compatibility matrix
|
244
244
|
|
245
245
|
If your system is not currently supported, [open an issue](https://github.com/kalavai-net/kalavai-client/issues) and request it. We are expanding this list constantly.
|
246
246
|
|
247
|
+
<details>
|
248
|
+
|
249
|
+
**<summary>Hardware and OS compatibility </summary>**
|
250
|
+
|
247
251
|
### OS compatibility
|
248
252
|
|
249
253
|
Since **worker nodes** run inside docker, any machine that can run docker **should** be compatible with Kalavai. Here are instructions for [linux](https://docs.docker.com/engine/install/), [Windows](https://docs.docker.com/desktop/setup/install/windows-install/) and [MacOS](https://docs.docker.com/desktop/setup/install/mac-install/).
|
@@ -257,6 +261,7 @@ The kalavai client, which controls and access pools, can be installed on any mac
|
|
257
261
|
- NVIDIA GPU
|
258
262
|
- AMD and Intel GPUs are currently not supported ([interested in helping us test it?](https://kalavai-net.github.io/kalavai-client/compatibility/#help-testing-amd-gpus))
|
259
263
|
|
264
|
+
</details>
|
260
265
|
|
261
266
|
## Roadmap
|
262
267
|
|
@@ -268,6 +273,7 @@ The kalavai client, which controls and access pools, can be installed on any mac
|
|
268
273
|
- [x] Collaborative LLM deployment
|
269
274
|
- [x] Ray cluster support
|
270
275
|
- [x] Kalavai client on Mac
|
276
|
+
- [x] Kalavai pools UI
|
271
277
|
- [ ] [TEMPLATE] [GPUStack](https://github.com/gpustack/gpustack) support
|
272
278
|
- [ ] [TEMPLATE] [exo](https://github.com/exo-explore/exo) support
|
273
279
|
- [ ] Support for AMD GPUs
|
@@ -293,7 +299,9 @@ Anything missing here? Give us a shout in the [discussion board](https://github.
|
|
293
299
|
|
294
300
|
## Build from source
|
295
301
|
|
296
|
-
|
302
|
+
<details>
|
303
|
+
|
304
|
+
<summary>Expand</summary>
|
297
305
|
|
298
306
|
Python version >= 3.6.
|
299
307
|
|
@@ -313,6 +321,7 @@ Build python wheels:
|
|
313
321
|
bash publish.sh build
|
314
322
|
```
|
315
323
|
|
324
|
+
</details>
|
316
325
|
|
317
326
|
### Unit tests
|
318
327
|
|
@@ -322,5 +331,3 @@ To run the unit tests, use:
|
|
322
331
|
python -m unittest
|
323
332
|
```
|
324
333
|
|
325
|
-
docker run --rm --net=host -v /root/.cache/kalavai/:/root/.cache/kalavai/ ghcr.io/helmfile/helmfile:v0.169.2 helmfile sync --file /root/.cache/kalavai/apps.yaml --kubeconfig /root/.cache/kalavai/kubeconfig
|
326
|
-
|
{kalavai_client-0.5.19.dist-info → kalavai_client-0.5.21.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
kalavai_client/__init__.py,sha256=
|
1
|
+
kalavai_client/__init__.py,sha256=LIT9uJV0JSGoxjwoHAvYM7EHKvgVBbXXas273l5C2UM,23
|
2
2
|
kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
|
3
3
|
kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
kalavai_client/assets/apps.yaml,sha256=V1x1FY-fyYsYrXvcIMv3QrBCgJ7jNunluRyJh67eWB0,5983
|
@@ -11,13 +11,15 @@ kalavai_client/assets/pool_config_values.yaml,sha256=VrM3XHQfQo6QLZ68qvagooUptaY
|
|
11
11
|
kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
|
12
12
|
kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
|
13
13
|
kalavai_client/auth.py,sha256=QsBh28L2LwjBBK6pTUE4Xu36lLDTyetyU1YfS1Hbb6g,1717
|
14
|
-
kalavai_client/
|
14
|
+
kalavai_client/bridge_api.py,sha256=hp5YjMu0HBI9VGMx6hahXfMIGPLwNtSd09UKxmKnGXc,4852
|
15
|
+
kalavai_client/bridge_models.py,sha256=rXBnE5r6Oe9GxGkk1ITkvp6YQqahp72Rrzf-QM2quH8,771
|
16
|
+
kalavai_client/cli.py,sha256=ef4ZZJcRVP5PWS77XpsYZgwYK3CEny5o3Z7_JlLooN4,47845
|
15
17
|
kalavai_client/cluster.py,sha256=gwjmdsd--YrffT0BmZDOEpbrdm3lPskUuN5jdgcrOR0,12947
|
16
|
-
kalavai_client/core.py,sha256=
|
18
|
+
kalavai_client/core.py,sha256=V-WfeI3QLo82GpNC-UZ_MtvdsE4njkT4ic1Pxe3HHWo,29756
|
17
19
|
kalavai_client/env.py,sha256=Zg2pP-xGJpQumo56KMBxBLgIsBmcNN0S9R-ZP2-s630,2604
|
18
20
|
kalavai_client/utils.py,sha256=rz5W9PRZrTpgdmOs6yeqUi4f_q_L-3BJ5g1o7Asgnyo,13386
|
19
|
-
kalavai_client-0.5.
|
20
|
-
kalavai_client-0.5.
|
21
|
-
kalavai_client-0.5.
|
22
|
-
kalavai_client-0.5.
|
23
|
-
kalavai_client-0.5.
|
21
|
+
kalavai_client-0.5.21.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
22
|
+
kalavai_client-0.5.21.dist-info/METADATA,sha256=UBVRAEyPpQF2-f6sGiEHiu81oL2VyjW2InPUWltbI_8,14443
|
23
|
+
kalavai_client-0.5.21.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
24
|
+
kalavai_client-0.5.21.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
|
25
|
+
kalavai_client-0.5.21.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|