kalavai-client 0.6.12__tar.gz → 0.6.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/PKG-INFO +1 -1
- kalavai_client-0.6.14/kalavai_client/__init__.py +2 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/apps.yaml +1 -1
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/docker-compose-template.yaml +4 -0
- kalavai_client-0.6.14/kalavai_client/bridge_api.py +518 -0
- kalavai_client-0.6.14/kalavai_client/bridge_models.py +53 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/cli.py +3 -1
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/cluster.py +36 -26
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/core.py +4 -14
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/utils.py +28 -13
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/pyproject.toml +1 -1
- kalavai_client-0.6.12/kalavai_client/__init__.py +0 -2
- kalavai_client-0.6.12/kalavai_client/bridge_api.py +0 -276
- kalavai_client-0.6.12/kalavai_client/bridge_models.py +0 -53
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/LICENSE +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/README.md +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/__main__.py +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/__init__.py +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/apps_values.yaml +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/docker-compose-gui.yaml +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/nginx.conf +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/pool_config_template.yaml +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/pool_config_values.yaml +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/user_workspace.yaml +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/user_workspace_values.yaml +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/auth.py +0 -0
- {kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/env.py +0 -0
{kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/docker-compose-template.yaml
RENAMED
@@ -18,6 +18,7 @@ services:
|
|
18
18
|
{%if command %}
|
19
19
|
{{service_name}}:
|
20
20
|
image: docker.io/bundenth/kalavai-runner:gpu-latest
|
21
|
+
pull_policy: always
|
21
22
|
container_name: {{service_name}}
|
22
23
|
{% if vpn %}
|
23
24
|
depends_on:
|
@@ -35,6 +36,9 @@ services:
|
|
35
36
|
{% endif %}
|
36
37
|
--node_name="{{node_name}}"
|
37
38
|
--node_ip="{{node_ip_address}}"
|
39
|
+
{% if random_suffix %}
|
40
|
+
--random_suffix="{{random_suffix}}"
|
41
|
+
{% endif %}
|
38
42
|
{% if command == "server" %}
|
39
43
|
--port_range="30000-32767"
|
40
44
|
{% else %}
|
@@ -0,0 +1,518 @@
|
|
1
|
+
"""
|
2
|
+
Core kalavai service.
|
3
|
+
Used as a bridge between the kalavai-client app and the reflex frontend
|
4
|
+
"""
|
5
|
+
from fastapi import FastAPI, HTTPException, Depends
|
6
|
+
from starlette.requests import Request
|
7
|
+
import uvicorn
|
8
|
+
|
9
|
+
from kalavai_client.bridge_models import (
|
10
|
+
CreatePoolRequest,
|
11
|
+
InvitesRequest,
|
12
|
+
JoinPoolRequest,
|
13
|
+
StopPoolRequest,
|
14
|
+
DeployJobRequest,
|
15
|
+
DeleteJobRequest,
|
16
|
+
JobDetailsRequest,
|
17
|
+
NodesActionRequest,
|
18
|
+
NodeLabelsRequest,
|
19
|
+
GetNodeLabelsRequest
|
20
|
+
)
|
21
|
+
from kalavai_client.core import (
|
22
|
+
create_pool,
|
23
|
+
join_pool,
|
24
|
+
attach_to_pool,
|
25
|
+
send_invites,
|
26
|
+
stop_pool,
|
27
|
+
fetch_devices,
|
28
|
+
fetch_resources,
|
29
|
+
fetch_job_names,
|
30
|
+
fetch_gpus,
|
31
|
+
fetch_job_details,
|
32
|
+
fetch_job_logs,
|
33
|
+
fetch_job_templates,
|
34
|
+
fetch_job_defaults,
|
35
|
+
deploy_job,
|
36
|
+
delete_job,
|
37
|
+
authenticate_user,
|
38
|
+
load_user_session,
|
39
|
+
user_logout,
|
40
|
+
is_connected,
|
41
|
+
list_available_pools,
|
42
|
+
is_agent_running,
|
43
|
+
is_server,
|
44
|
+
pause_agent,
|
45
|
+
resume_agent,
|
46
|
+
get_ip_addresses,
|
47
|
+
get_pool_token,
|
48
|
+
delete_nodes,
|
49
|
+
cordon_nodes,
|
50
|
+
uncordon_nodes,
|
51
|
+
add_node_labels,
|
52
|
+
get_node_labels,
|
53
|
+
TokenType
|
54
|
+
)
|
55
|
+
from kalavai_client.utils import load_user_id
|
56
|
+
|
57
|
+
# FastAPI application object that every route in this module is registered on.
# Interactive documentation is exposed at /docs (Swagger UI) and /redoc.
app = FastAPI(
    title="Kalavai Bridge API",
    description="API for managing Kalavai pools, jobs, and nodes",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)
|
64
|
+
|
65
|
+
################################
|
66
|
+
## API Key Validation methods ##
|
67
|
+
################################
|
68
|
+
async def verify_api_key(request: Request):
    """
    Verify the API key sent in the ``X-API-KEY`` request header.

    The expected key is the value returned by `load_user_id()`. When that
    value is None, no check is performed and None is returned.

    Args:
        request: Incoming request whose headers are inspected.

    Returns:
        The API key taken from the header, or None when `load_user_id()`
        returned None.

    Raises:
        HTTPException: 401 when the header is missing or does not match.
    """
    # Local import so this block does not depend on the module import list
    import hmac

    user_id = load_user_id()
    if user_id is None:
        return None
    api_key = request.headers.get("X-API-KEY")
    # compare_digest takes time independent of where the values differ, so
    # the key cannot be recovered by timing repeated guesses. Bytes are used
    # because the str form of compare_digest only accepts ASCII.
    # NOTE(review): assumes load_user_id() returns a str - confirm.
    if api_key is None or not hmac.compare_digest(
        api_key.encode("utf-8"), str(user_id).encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Request requires API Key")
    return api_key
|
80
|
+
|
81
|
+
@app.post(
    "/create_pool",
    summary="Create a new pool",
    description="Creates a new pool with the specified configuration",
    response_description="Result of pool creation",
)
def pool_create(request: CreatePoolRequest, api_key: str = Depends(verify_api_key)):
    """
    Create a pool from the request body.

    Each field below is forwarded by keyword to `create_pool`, whose return
    value is sent back unchanged:

    - **cluster_name**: Name of the cluster
    - **ip_address**: IP address for the pool
    - **app_values**: Application configuration values
    - **num_gpus**: Number of GPUs to allocate
    - **node_name**: Name of the node
    - **only_registered_users**: Whether to restrict to registered users
    - **location**: Location of the pool
    - **description**: Pool description
    - **token_mode**: Token type for authentication
    - **frontend**: Whether this is a frontend request
    """
    return create_pool(
        cluster_name=request.cluster_name,
        ip_address=request.ip_address,
        app_values=request.app_values,
        num_gpus=request.num_gpus,
        node_name=request.node_name,
        only_registered_users=request.only_registered_users,
        location=request.location,
        description=request.description,
        token_mode=request.token_mode,
        frontend=request.frontend,
    )
|
113
|
+
|
114
|
+
@app.post(
    "/join_pool",
    summary="Join an existing pool",
    description="Join a pool using a token",
    response_description="Result of joining the pool",
)
def pool_join(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
    """
    Join a pool using the request body.

    The fields below are forwarded by keyword to `join_pool`, whose return
    value is sent back unchanged:

    - **token**: Pool join token
    - **ip_address**: IP address for the node
    - **node_name**: Name of the node
    - **num_gpus**: Number of GPUs to allocate
    - **frontend**: Whether this is a frontend request
    """
    return join_pool(
        token=request.token,
        num_gpus=request.num_gpus,
        node_name=request.node_name,
        ip_address=request.ip_address,
        frontend=request.frontend,
    )
|
136
|
+
|
137
|
+
@app.post(
    "/attach_to_pool",
    summary="Attach to an existing pool",
    description="Attach to a pool using a token",
    response_description="Result of attaching to the pool",
)
def pool_attach(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
    """
    Attach to a pool using the request body.

    Only these fields of the request are read; they are forwarded by keyword
    to `attach_to_pool`, whose return value is sent back unchanged:

    - **token**: Pool token
    - **node_name**: Name of the node
    - **frontend**: Whether this is a frontend request
    """
    return attach_to_pool(
        token=request.token,
        node_name=request.node_name,
        frontend=request.frontend,
    )
|
155
|
+
|
156
|
+
@app.post(
    "/stop_pool",
    summary="Stop a pool",
    description="Stop the current pool",
    response_description="Result of stopping the pool",
)
def pool_stop(request: StopPoolRequest, api_key: str = Depends(verify_api_key)):
    """
    Stop the pool and return whatever `stop_pool` returns.

    - **skip_node_deletion**: Whether to skip node deletion
    """
    return stop_pool(skip_node_deletion=request.skip_node_deletion)
|
170
|
+
|
171
|
+
@app.post(
    "/delete_nodes",
    summary="Delete nodes",
    description="Delete specified nodes from the pool",
    response_description="Result of node deletion",
)
def device_delete(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
    """
    Pass the requested node names to `delete_nodes` and return its result.

    - **nodes**: List of node names to delete
    """
    return delete_nodes(nodes=request.nodes)
|
185
|
+
|
186
|
+
@app.post(
    "/cordon_nodes",
    summary="Cordon nodes",
    description="Mark nodes as unschedulable",
    response_description="Result of cordoning nodes",
)
def device_cordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
    """
    Pass the requested node names to `cordon_nodes` and return its result.

    - **nodes**: List of node names to cordon
    """
    return cordon_nodes(nodes=request.nodes)
|
200
|
+
|
201
|
+
@app.post(
    "/uncordon_nodes",
    summary="Uncordon nodes",
    description="Mark nodes as schedulable",
    response_description="Result of uncordoning nodes",
)
def device_uncordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
    """
    Pass the requested node names to `uncordon_nodes` and return its result.

    - **nodes**: List of node names to uncordon
    """
    return uncordon_nodes(nodes=request.nodes)
|
215
|
+
|
216
|
+
@app.get(
    "/get_pool_token",
    summary="Get pool token",
    description="Get a token for the pool",
    response_description="Pool token",
)
def get_token(mode: int, api_key: str = Depends(verify_api_key)):
    """
    Return the result of `get_pool_token` for the requested token type.

    - **mode**: Token type mode; converted with `TokenType(mode)`

    Raises:
        HTTPException: 400 when `mode` cannot be converted to a `TokenType`.
    """
    try:
        # presumably TokenType is an Enum, whose constructor raises ValueError
        # for values it does not define - confirm against kalavai_client.core
        token_type = TokenType(mode)
    except ValueError as err:
        # Report a bad query parameter as a client error instead of letting
        # the exception surface as an internal server error
        raise HTTPException(
            status_code=400,
            detail=f"Invalid token mode: {mode}",
        ) from err
    return get_pool_token(mode=token_type)
|
227
|
+
|
228
|
+
@app.get(
    "/fetch_devices",
    summary="Fetch devices",
    description="Get list of available devices",
    response_description="List of devices",
)
def get_devices(api_key: str = Depends(verify_api_key)):
    """Return the result of `fetch_devices` unchanged."""
    devices = fetch_devices()
    return devices
|
235
|
+
|
236
|
+
@app.post(
    "/send_pool_invites",
    summary="Send pool invites",
    description="Send invites to join the pool",
    response_description="Result of sending invites",
)
def send_pool_invites(request: InvitesRequest, api_key: str = Depends(verify_api_key)):
    """
    Pass the invitee list to `send_invites` and return its result.

    - **invitees**: List of invitee identifiers
    """
    outcome = send_invites(invitees=request.invitees)
    return outcome
|
247
|
+
|
248
|
+
@app.get(
    "/fetch_resources",
    summary="Fetch resources",
    description="Get available resources",
    response_description="Resource information",
)
def resources(api_key: str = Depends(verify_api_key)):
    """Return the result of `fetch_resources` unchanged."""
    available = fetch_resources()
    return available
|
255
|
+
|
256
|
+
@app.get(
    "/fetch_job_names",
    summary="Fetch job names",
    description="Get list of job names",
    response_description="List of job names",
)
def job_names(api_key: str = Depends(verify_api_key)):
    """Return the result of `fetch_job_names` unchanged."""
    names = fetch_job_names()
    return names
|
263
|
+
|
264
|
+
@app.get(
    "/fetch_gpus",
    summary="Fetch GPUs",
    description="Get list of available GPUs",
    response_description="List of GPUs",
)
def gpus(available: bool = False, api_key: str = Depends(verify_api_key)):
    """
    Return the result of `fetch_gpus` for the given filter.

    - **available**: Whether to show only available GPUs
    """
    gpu_list = fetch_gpus(available=available)
    return gpu_list
|
275
|
+
|
276
|
+
@app.post(
    "/fetch_job_details",
    summary="Fetch job details",
    description="Get details for specified jobs",
    response_description="Job details",
)
def job_details(request: JobDetailsRequest, api_key: str = Depends(verify_api_key)):
    """
    Pass the requested jobs to `fetch_job_details` and return its result.

    - **jobs**: List of jobs to get details for
    """
    details = fetch_job_details(jobs=request.jobs)
    return details
|
287
|
+
|
288
|
+
@app.get(
    "/fetch_job_logs",
    summary="Fetch job logs",
    description="Get logs for a specific job",
    response_description="Job logs",
)
def job_logs(
    job_name: str,
    force_namespace: str = None,
    pod_name: str = None,
    tail: int = 100,
    api_key: str = Depends(verify_api_key),
):
    """
    Forward the query parameters to `fetch_job_logs` and return its result.

    - **job_name**: Name of the job
    - **force_namespace**: Optional namespace override
    - **pod_name**: Optional pod name
    - **tail**: Number of log lines to return
    """
    logs = fetch_job_logs(
        job_name=job_name,
        force_namespace=force_namespace,
        pod_name=pod_name,
        tail=tail,
    )
    return logs
|
313
|
+
|
314
|
+
@app.get(
    "/fetch_job_templates",
    summary="Fetch job templates",
    description="Get available job templates",
    response_description="List of job templates",
)
def job_templates(api_key: str = Depends(verify_api_key)):
    """Return the result of `fetch_job_templates` unchanged."""
    templates = fetch_job_templates()
    return templates
|
321
|
+
|
322
|
+
@app.get(
    "/fetch_job_defaults",
    summary="Fetch job defaults",
    description="Get default values for a job template",
    response_description="Job default values",
)
def job_defaults(name: str, api_key: str = Depends(verify_api_key)):
    """
    Return the result of `fetch_job_defaults` for the named template.

    - **name**: Name of the job template
    """
    # This handler was previously also called `job_templates`, which rebound
    # the module-level name of the /fetch_job_templates handler. The route
    # path is unchanged; only the Python function name differs.
    return fetch_job_defaults(name=name)
|
333
|
+
|
334
|
+
@app.post(
    "/deploy_job",
    summary="Deploy job",
    description="Deploy a new job",
    response_description="Result of job deployment",
)
def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)):
    """
    Deploy a job described by the request body.

    The fields below are forwarded to `deploy_job` (``values`` is passed as
    ``values_dict``) and its return value is sent back unchanged:

    - **template_name**: Name of the job template
    - **values**: Job configuration values
    - **force_namespace**: Optional namespace override
    - **target_labels**: Optional target node labels
    """
    return deploy_job(
        template_name=request.template_name,
        values_dict=request.values,
        force_namespace=request.force_namespace,
        target_labels=request.target_labels,
    )
|
354
|
+
|
355
|
+
@app.post(
    "/delete_job",
    summary="Delete job",
    description="Delete a job",
    response_description="Result of job deletion",
)
def job_delete(request: DeleteJobRequest, api_key: str = Depends(verify_api_key)):
    """
    Forward the request fields to `delete_job` and return its result.

    - **name**: Name of the job to delete
    - **force_namespace**: Optional namespace override
    """
    return delete_job(
        name=request.name,
        force_namespace=request.force_namespace,
    )
|
371
|
+
|
372
|
+
@app.get(
    "/authenticate_user",
    summary="Authenticate user",
    description="Authenticate a user",
    response_description="Authentication result",
)
def user_authenticate(user_id: str, api_key: str = Depends(verify_api_key)):
    """
    Pass the user id to `authenticate_user` and return its result.

    - **user_id**: User identifier
    """
    return authenticate_user(user_id=user_id)
|
386
|
+
|
387
|
+
@app.get(
    "/load_user_session",
    summary="Load user session",
    description="Load the current user session",
    response_description="User session information",
)
def user_session(api_key: str = Depends(verify_api_key)):
    """Return the result of `load_user_session` unchanged."""
    return load_user_session()
|
395
|
+
|
396
|
+
@app.get(
    "/user_logout",
    summary="User logout",
    description="Log out the current user",
    response_description="Logout result",
)
def logout_user():
    """Return the result of `user_logout` unchanged."""
    # NOTE(review): this route declares no verify_api_key dependency, unlike
    # most routes in this module - confirm that is intentional.
    return user_logout()
|
404
|
+
|
405
|
+
@app.get(
    "/is_connected",
    summary="Check connection",
    description="Check if connected to a pool",
    response_description="Connection status",
)
def pool_connected():
    """Return the result of `is_connected` unchanged."""
    return is_connected()
|
413
|
+
|
414
|
+
@app.get(
    "/is_agent_running",
    summary="Check agent status",
    description="Check if the agent is running",
    response_description="Agent status",
)
def agent_running():
    """Return the result of `is_agent_running` unchanged."""
    return is_agent_running()
|
422
|
+
|
423
|
+
@app.get(
    "/is_server",
    summary="Check server status",
    description="Check if running as server",
    response_description="Server status",
)
def server():
    """Return the result of `is_server` unchanged."""
    return is_server()
|
431
|
+
|
432
|
+
@app.post(
    "/pause_agent",
    summary="Pause agent",
    description="Pause the agent",
    response_description="Result of pausing agent",
)
def agent_pause():
    """Call `pause_agent` and return its result unchanged."""
    # NOTE(review): this route declares no verify_api_key dependency, unlike
    # the other POST routes in this module - confirm that is intentional.
    return pause_agent()
|
440
|
+
|
441
|
+
@app.post(
    "/resume_agent",
    summary="Resume agent",
    description="Resume the agent",
    response_description="Result of resuming agent",
)
def agent_resume():
    """Call `resume_agent` and return its result unchanged."""
    # NOTE(review): this route declares no verify_api_key dependency, unlike
    # the other POST routes in this module - confirm that is intentional.
    return resume_agent()
|
449
|
+
|
450
|
+
@app.get(
    "/get_ip_addresses",
    summary="Get IP addresses",
    description="Get available IP addresses",
    response_description="List of IP addresses",
)
def ip_addresses(subnet: str = None, api_key: str = Depends(verify_api_key)):
    """
    Return the result of `get_ip_addresses` for the given subnet.

    - **subnet**: Optional subnet to filter by
    """
    return get_ip_addresses(subnet=subnet)
|
462
|
+
|
463
|
+
@app.get(
    "/list_available_pools",
    summary="List available pools",
    description="Get list of available pools",
    response_description="List of available pools",
)
def available_pools(user_only: bool = False, api_key: str = Depends(verify_api_key)):
    """
    Return the result of `list_available_pools` for the given filter.

    - **user_only**: Whether to show only user's pools
    """
    # This handler was previously also called `pool_connected`, which rebound
    # the module-level name of the /is_connected handler. The route path is
    # unchanged; only the Python function name differs.
    return list_available_pools(user_only=user_only)
|
475
|
+
|
476
|
+
@app.post(
    "/add_node_labels",
    summary="Add node labels",
    description="Add labels to a node",
    response_description="Result of adding labels",
)
def node_labels(request: NodeLabelsRequest, api_key: str = Depends(verify_api_key)):
    """
    Forward the request fields to `add_node_labels` and return its result.

    - **node_name**: Name of the node
    - **labels**: Dictionary of labels to add
    """
    return add_node_labels(
        node_name=request.node_name,
        labels=request.labels,
    )
|
492
|
+
|
493
|
+
@app.post(
    "/get_node_labels",
    summary="Get node labels",
    description="Get labels for specified nodes",
    response_description="Node labels",
)
def node_labels_get(request: GetNodeLabelsRequest, api_key: str = Depends(verify_api_key)):
    """
    Pass the requested node names to `get_node_labels` and return its result.

    - **node_names**: List of node names to get labels for
    """
    return get_node_labels(node_names=request.node_names)
|
507
|
+
|
508
|
+
def run_api(host="0.0.0.0", port=8001, log_level="critical"):
    """
    Serve `app` with uvicorn.

    Args:
        host: Interface to bind to; the default binds to all interfaces.
        port: TCP port to listen on.
        log_level: Log level handed to uvicorn.
    """
    server_options = {"host": host, "port": port, "log_level": log_level}
    uvicorn.run(app, **server_options)


if __name__ == "__main__":
    # Start the API with the default settings when run as a script
    run_api()
|
518
|
+
|
@@ -0,0 +1,53 @@
|
|
1
|
+
from pydantic import BaseModel, Field
|
2
|
+
from typing import List, Dict, Optional
|
3
|
+
|
4
|
+
from kalavai_client.core import Job, TokenType
|
5
|
+
|
6
|
+
|
7
|
+
class InvitesRequest(BaseModel):
    # Request body carrying the invitee list read by the /send_pool_invites route
    invitees: list[str] = Field(
        description="List of user identifiers to invite to the pool",
    )
|
9
|
+
|
10
|
+
class CreatePoolRequest(BaseModel):
    # Request body for pool creation. Fields whose default is None are
    # annotated Optional so that the annotation matches the default and an
    # explicit null in the payload is accepted as well as an omitted field.
    cluster_name: str = Field(description="Name of the cluster to create")
    ip_address: str = Field(description="IP address for the pool")
    app_values: Optional[dict] = Field(None, description="Application configuration values")
    num_gpus: Optional[int] = Field(None, description="Number of GPUs to allocate")
    node_name: Optional[str] = Field(None, description="Name of the node")
    only_registered_users: bool = Field(False, description="Whether to restrict access to registered users only")
    location: Optional[str] = Field(None, description="Geographic location of the pool")
    token_mode: TokenType = Field(TokenType.USER, description="Token type for authentication")
    description: str = Field("", description="Description of the pool")
    frontend: bool = Field(False, description="Whether this is a frontend request")
|
21
|
+
|
22
|
+
class NodesActionRequest(BaseModel):
    # Request body shared by the delete / cordon / uncordon node routes
    nodes: list[str] = Field(
        description="List of node names to perform the action on",
    )
|
24
|
+
|
25
|
+
class JoinPoolRequest(BaseModel):
    # Request body used when joining or attaching to a pool. Fields whose
    # default is None are annotated Optional so that the annotation matches
    # the default and an explicit null in the payload is accepted.
    token: str = Field(description="Token to join the pool")
    ip_address: Optional[str] = Field(None, description="IP address for the node")
    node_name: Optional[str] = Field(None, description="Name of the node")
    num_gpus: Optional[int] = Field(None, description="Number of GPUs to allocate")
    frontend: bool = Field(False, description="Whether this is a frontend request")

|
31
|
+
class JobDetailsRequest(BaseModel):
    # Request body carrying the jobs read by the /fetch_job_details route
    jobs: list[Job] = Field(
        description="List of jobs to get details for",
    )
|
33
|
+
|
34
|
+
|
35
|
+
class StopPoolRequest(BaseModel):
    # Request body for the /stop_pool route; the flag defaults to False
    skip_node_deletion: bool = Field(
        False,
        description="Whether to skip node deletion when stopping the pool",
    )
|
37
|
+
|
38
|
+
class DeployJobRequest(BaseModel):
    # Request body for job deployment. Fields whose default is None are
    # annotated Optional so that the annotation matches the default and an
    # explicit null in the payload is accepted.
    template_name: str = Field(description="Name of the job template to use")
    values: dict = Field(description="Job configuration values")
    force_namespace: Optional[str] = Field(None, description="Optional namespace override")
    target_labels: Optional[dict[str, str]] = Field(None, description="Optional target node labels")
|
43
|
+
|
44
|
+
class DeleteJobRequest(BaseModel):
    # Request body for job deletion. force_namespace defaults to None, so it
    # is annotated Optional to match and to accept an explicit null.
    name: str = Field(description="Name of the job to delete")
    force_namespace: Optional[str] = Field(None, description="Optional namespace override")
|
47
|
+
|
48
|
+
class NodeLabelsRequest(BaseModel):
    # Request body for the /add_node_labels route: one node and its labels
    node_name: str = Field(
        description="Name of the node to add labels to",
    )
    labels: dict[str, str] = Field(
        description="Dictionary of labels to add to the node",
    )
|
51
|
+
|
52
|
+
class GetNodeLabelsRequest(BaseModel):
    # Request body for the /get_node_labels route
    node_names: list[str] = Field(
        description="List of node names to get labels for",
    )
|
@@ -186,7 +186,9 @@ def input_gpus(non_interactive=False):
|
|
186
186
|
try:
|
187
187
|
has_gpus = check_gpu_drivers()
|
188
188
|
if has_gpus:
|
189
|
-
max_gpus =
|
189
|
+
max_gpus = len(
|
190
|
+
[r for r in run_cmd("nvidia-smi -L").decode().split("\n") if len(r.strip())>0]
|
191
|
+
)
|
190
192
|
if non_interactive:
|
191
193
|
num_gpus = max_gpus
|
192
194
|
else:
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import os
|
2
|
+
import platform
|
2
3
|
import time
|
3
4
|
from pathlib import Path
|
4
5
|
from abc import ABC, abstractmethod
|
@@ -96,7 +97,7 @@ class dockerCluster(Cluster):
|
|
96
97
|
# wait for container to be setup
|
97
98
|
while True:
|
98
99
|
try:
|
99
|
-
run_cmd(f"docker cp {self.container_name}:/etc/rancher/k3s/k3s.yaml {self.kubeconfig_file}
|
100
|
+
run_cmd(f"docker cp {self.container_name}:/etc/rancher/k3s/k3s.yaml {self.kubeconfig_file}", hide_output=True)
|
100
101
|
break
|
101
102
|
except:
|
102
103
|
pass
|
@@ -115,15 +116,15 @@ class dockerCluster(Cluster):
|
|
115
116
|
def update_dependencies(self, dependencies_file=None, debug=False, retries=3):
|
116
117
|
if dependencies_file is not None:
|
117
118
|
self.dependencies_file = dependencies_file
|
118
|
-
if debug:
|
119
|
-
output = ""
|
120
|
-
else:
|
121
|
-
output = " >/dev/null 2>&1"
|
122
119
|
while True:
|
123
120
|
try:
|
124
121
|
home = user_path("")
|
125
|
-
|
126
|
-
|
122
|
+
# convert path on host to path on container (will be different in windows os)
|
123
|
+
target_path = "/cache/kalavai"
|
124
|
+
kubeconfig_path = f"{target_path}/{Path(self.kubeconfig_file).name}"
|
125
|
+
dependencies_path = f"{target_path}/{Path(self.dependencies_file).name}"
|
126
|
+
|
127
|
+
run_cmd(f"docker run --rm --net=host -v {home}:{target_path} ghcr.io/helmfile/helmfile:v0.169.2 helmfile sync --file {dependencies_path} --kubeconfig {kubeconfig_path}", hide_output=not debug)
|
127
128
|
break
|
128
129
|
except Exception as e:
|
129
130
|
if retries > 0:
|
@@ -142,11 +143,18 @@ class dockerCluster(Cluster):
|
|
142
143
|
def is_agent_running(self):
|
143
144
|
if not os.path.isfile(self.compose_file):
|
144
145
|
return False
|
145
|
-
|
146
|
-
|
146
|
+
try:
|
147
|
+
status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()
|
148
|
+
if not status:
|
149
|
+
return False
|
150
|
+
if "windows" in platform.system().lower():
|
151
|
+
status = (0 == os.system(f'docker exec {self.container_name} ps aux | findstr /n /c:"k3s server" /c:"k3s agent"'))
|
152
|
+
else:
|
153
|
+
status = (0 == os.system(f'docker exec {self.container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'))
|
154
|
+
return status
|
155
|
+
except Exception as e:
|
156
|
+
print(f"Error when checking agent. Is Docker installed and running?\n\n{str(e)}")
|
147
157
|
return False
|
148
|
-
status = (0 == os.system(f'docker exec {self.container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'))
|
149
|
-
return status
|
150
158
|
|
151
159
|
def is_seed_node(self):
|
152
160
|
if not os.path.isfile(self.compose_file):
|
@@ -154,7 +162,7 @@ class dockerCluster(Cluster):
|
|
154
162
|
if not self.is_agent_running():
|
155
163
|
return False
|
156
164
|
try:
|
157
|
-
run_cmd(f"docker container exec {self.container_name} cat /var/lib/rancher/k3s/server/node-token
|
165
|
+
run_cmd(f"docker container exec {self.container_name} cat /var/lib/rancher/k3s/server/node-token", hide_output=True)
|
158
166
|
return True
|
159
167
|
except:
|
160
168
|
return False
|
@@ -162,8 +170,12 @@ class dockerCluster(Cluster):
|
|
162
170
|
def is_cluster_init(self):
|
163
171
|
if not os.path.isfile(self.compose_file):
|
164
172
|
return False
|
165
|
-
|
166
|
-
|
173
|
+
try:
|
174
|
+
status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --all").decode()
|
175
|
+
return status
|
176
|
+
except Exception as e:
|
177
|
+
print(f"Error when checking cluster. Is Docker installed and running?\n\n{str(e)}")
|
178
|
+
return False
|
167
179
|
|
168
180
|
def pause_agent(self):
|
169
181
|
status = False
|
@@ -177,7 +189,6 @@ class dockerCluster(Cluster):
|
|
177
189
|
def restart_agent(self):
|
178
190
|
try:
|
179
191
|
run_cmd(f'docker compose -f {self.compose_file} start')
|
180
|
-
|
181
192
|
except:
|
182
193
|
pass
|
183
194
|
time.sleep(5)
|
@@ -186,7 +197,6 @@ class dockerCluster(Cluster):
|
|
186
197
|
def get_cluster_token(self):
|
187
198
|
if self.is_seed_node():
|
188
199
|
return run_cmd(f"docker container exec {self.container_name} cat /var/lib/rancher/k3s/server/node-token").decode()
|
189
|
-
#return run_cmd("sudo k3s token create --kubeconfig /etc/rancher/k3s/k3s.yaml --ttl 0").decode()
|
190
200
|
else:
|
191
201
|
return None
|
192
202
|
|
@@ -231,7 +241,7 @@ class k3sCluster(Cluster):
|
|
231
241
|
flannel_iface = f"--flannel-iface {self.default_flannel_iface}"
|
232
242
|
else:
|
233
243
|
flannel_iface = ""
|
234
|
-
run_cmd(f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="server --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} --flannel-backend wireguard-native {node_labels}" sh -
|
244
|
+
run_cmd(f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="server --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} --flannel-backend wireguard-native {node_labels}" sh - ', hide_output=True)
|
235
245
|
run_cmd(f"sudo cp /etc/rancher/k3s/k3s.yaml {self.kubeconfig_file}")
|
236
246
|
run_cmd(f"sudo chown $USER {self.kubeconfig_file}")
|
237
247
|
|
@@ -245,8 +255,8 @@ class k3sCluster(Cluster):
|
|
245
255
|
flannel_iface = f"--flannel-iface {self.default_flannel_iface}"
|
246
256
|
else:
|
247
257
|
flannel_iface = ""
|
248
|
-
command = f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="agent --token {token} --server https://{url}:6443 --node-name {node_name} --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} {node_labels}" sh -
|
249
|
-
run_cmd(command)
|
258
|
+
command = f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="agent --token {token} --server https://{url}:6443 --node-name {node_name} --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} {node_labels}" sh - '
|
259
|
+
run_cmd(command, hide_output=True)
|
250
260
|
|
251
261
|
|
252
262
|
def update_dependencies(self, dependencies_file=None, debug=False, retries=3):
|
@@ -270,13 +280,13 @@ class k3sCluster(Cluster):
|
|
270
280
|
|
271
281
|
def remove_agent(self):
|
272
282
|
try:
|
273
|
-
run_cmd('/usr/local/bin/k3s-uninstall.sh
|
274
|
-
run_cmd('sudo rm -r /etc/rancher/node/
|
283
|
+
run_cmd('/usr/local/bin/k3s-uninstall.sh', hide_output=True)
|
284
|
+
run_cmd('sudo rm -r /etc/rancher/node/', hide_output=True)
|
275
285
|
return True
|
276
286
|
except:
|
277
287
|
pass
|
278
288
|
try:
|
279
|
-
run_cmd('/usr/local/bin/k3s-agent-uninstall.sh
|
289
|
+
run_cmd('/usr/local/bin/k3s-agent-uninstall.sh', hide_output=True)
|
280
290
|
return True
|
281
291
|
except:
|
282
292
|
pass
|
@@ -296,12 +306,12 @@ class k3sCluster(Cluster):
|
|
296
306
|
def pause_agent(self):
|
297
307
|
status = False
|
298
308
|
try:
|
299
|
-
run_cmd('sudo systemctl stop k3s
|
309
|
+
run_cmd('sudo systemctl stop k3s', hide_output=True)
|
300
310
|
status = True
|
301
311
|
except:
|
302
312
|
pass
|
303
313
|
try:
|
304
|
-
run_cmd('sudo systemctl stop k3s-agent
|
314
|
+
run_cmd('sudo systemctl stop k3s-agent', hide_output=True)
|
305
315
|
status = True
|
306
316
|
except:
|
307
317
|
pass
|
@@ -309,11 +319,11 @@ class k3sCluster(Cluster):
|
|
309
319
|
|
310
320
|
def restart_agent(self):
|
311
321
|
try:
|
312
|
-
run_cmd('sudo systemctl start k3s
|
322
|
+
run_cmd('sudo systemctl start k3s', hide_output=True)
|
313
323
|
except:
|
314
324
|
pass
|
315
325
|
try:
|
316
|
-
run_cmd('sudo systemctl start k3s-agent
|
326
|
+
run_cmd('sudo systemctl start k3s-agent', hide_output=True)
|
317
327
|
except:
|
318
328
|
pass
|
319
329
|
return self.is_agent_running()
|
@@ -33,6 +33,7 @@ from kalavai_client.utils import (
|
|
33
33
|
get_public_seeds,
|
34
34
|
load_template,
|
35
35
|
is_storage_compatible,
|
36
|
+
get_max_gpus,
|
36
37
|
NODE_NAME_KEY,
|
37
38
|
MANDATORY_TOKEN_FIELDS,
|
38
39
|
PUBLIC_LOCATION_KEY,
|
@@ -157,7 +158,7 @@ def check_seed_compatibility():
|
|
157
158
|
logs = []
|
158
159
|
# docker
|
159
160
|
try:
|
160
|
-
run_cmd("docker
|
161
|
+
run_cmd("docker ps", hide_output=True)
|
161
162
|
except:
|
162
163
|
logs.append("[red]Docker not installed. Install instructions:\n")
|
163
164
|
logs.append(" Linux: https://docs.docker.com/engine/install/\n")
|
@@ -170,7 +171,7 @@ def check_worker_compatibility():
|
|
170
171
|
logs = []
|
171
172
|
# docker
|
172
173
|
try:
|
173
|
-
run_cmd("docker
|
174
|
+
run_cmd("docker ps", hide_output=True)
|
174
175
|
except:
|
175
176
|
logs.append("[red]Docker not installed. Install instructions:\n")
|
176
177
|
logs.append(" Linux: https://docs.docker.com/engine/install/\n")
|
@@ -594,16 +595,6 @@ def attach_to_pool(token, node_name=None):
|
|
594
595
|
|
595
596
|
return cluster_name
|
596
597
|
|
597
|
-
def get_max_gpus():
|
598
|
-
try:
|
599
|
-
has_gpus = check_gpu_drivers()
|
600
|
-
if has_gpus:
|
601
|
-
return int(run_cmd("nvidia-smi -L | wc -l").decode())
|
602
|
-
else:
|
603
|
-
return 0
|
604
|
-
except:
|
605
|
-
return 0
|
606
|
-
|
607
598
|
def generate_worker_package(num_gpus=0, node_name=None, ip_address="0.0.0.0", storage_compatible=True):
|
608
599
|
# get pool data from token
|
609
600
|
token = get_pool_token(mode=TokenType.WORKER)
|
@@ -772,10 +763,9 @@ def create_pool(
|
|
772
763
|
node_name=node_name,
|
773
764
|
node_labels=node_labels
|
774
765
|
)
|
775
|
-
|
766
|
+
|
776
767
|
# start server
|
777
768
|
CLUSTER.start_seed_node()
|
778
|
-
|
779
769
|
while not CLUSTER.is_agent_running():
|
780
770
|
time.sleep(10)
|
781
771
|
|
@@ -73,17 +73,20 @@ KALAVAI_AUTH = KalavaiAuth(
|
|
73
73
|
|
74
74
|
####### Methods to check OS compatibility ########
|
75
75
|
def check_gpu_drivers():
|
76
|
-
value = run_cmd("
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
76
|
+
value = run_cmd("nvidia-smi", hide_output=True)
|
77
|
+
return len(value.decode("utf-8")) == 0
|
78
|
+
|
79
|
+
def get_max_gpus():
|
80
|
+
try:
|
81
|
+
has_gpus = check_gpu_drivers()
|
82
|
+
if has_gpus:
|
83
|
+
return len(
|
84
|
+
[r for r in run_cmd("nvidia-smi -L").decode().split("\n") if len(r.strip())>0]
|
85
|
+
)
|
86
|
+
else:
|
87
|
+
return 0
|
88
|
+
except:
|
89
|
+
return 0
|
87
90
|
|
88
91
|
def is_storage_compatible():
|
89
92
|
"""
|
@@ -92,6 +95,9 @@ def is_storage_compatible():
|
|
92
95
|
Exclude: WSL
|
93
96
|
"""
|
94
97
|
try:
|
98
|
+
import platform
|
99
|
+
if "windows" in platform.system().lower():
|
100
|
+
return True
|
95
101
|
flagged = any([
|
96
102
|
"microsoft" in run_cmd("cat /proc/version").decode().lower()
|
97
103
|
])
|
@@ -120,6 +126,7 @@ def generate_compose_config(role, node_name, write_to_file=True, node_ip_address
|
|
120
126
|
"num_gpus": num_gpus,
|
121
127
|
"k3s_path": f"{CONTAINER_HOST_PATH}/{rand_suffix}/k3s",
|
122
128
|
"etc_path": f"{CONTAINER_HOST_PATH}/{rand_suffix}/etc",
|
129
|
+
"random_suffix": rand_suffix,
|
123
130
|
"node_labels": node_labels,
|
124
131
|
"flannel_iface": DEFAULT_FLANNEL_IFACE if vpn_token is not None else "",
|
125
132
|
"user_id": load_user_id()
|
@@ -231,9 +238,17 @@ def validate_poolconfig(poolconfig_file):
|
|
231
238
|
return False
|
232
239
|
return True
|
233
240
|
|
234
|
-
def run_cmd(command):
|
241
|
+
def run_cmd(command, hide_output=False):
|
235
242
|
try:
|
236
|
-
|
243
|
+
import platform
|
244
|
+
if "windows" in platform.system().lower():
|
245
|
+
if hide_output:
|
246
|
+
command = command + " > $nul 2>&1"
|
247
|
+
return_value = subprocess.check_output(command, shell=True)
|
248
|
+
else:
|
249
|
+
if hide_output:
|
250
|
+
command = command + " >/dev/null 2>&1"
|
251
|
+
return_value = subprocess.check_output(command, shell=True, executable="/bin/bash")
|
237
252
|
return return_value
|
238
253
|
except OSError as error:
|
239
254
|
return error # for exit code
|
@@ -1,276 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Core kalavai service.
|
3
|
-
Used as a bridge between the kalavai-client app and the reflex frontend
|
4
|
-
"""
|
5
|
-
from fastapi import FastAPI, HTTPException, Depends
|
6
|
-
from starlette.requests import Request
|
7
|
-
import uvicorn
|
8
|
-
|
9
|
-
from kalavai_client.bridge_models import (
|
10
|
-
CreatePoolRequest,
|
11
|
-
InvitesRequest,
|
12
|
-
JoinPoolRequest,
|
13
|
-
StopPoolRequest,
|
14
|
-
DeployJobRequest,
|
15
|
-
DeleteJobRequest,
|
16
|
-
JobDetailsRequest,
|
17
|
-
NodesActionRequest,
|
18
|
-
NodeLabelsRequest,
|
19
|
-
GetNodeLabelsRequest
|
20
|
-
)
|
21
|
-
from kalavai_client.core import (
|
22
|
-
create_pool,
|
23
|
-
join_pool,
|
24
|
-
attach_to_pool,
|
25
|
-
send_invites,
|
26
|
-
stop_pool,
|
27
|
-
fetch_devices,
|
28
|
-
fetch_resources,
|
29
|
-
fetch_job_names,
|
30
|
-
fetch_gpus,
|
31
|
-
fetch_job_details,
|
32
|
-
fetch_job_logs,
|
33
|
-
fetch_job_templates,
|
34
|
-
fetch_job_defaults,
|
35
|
-
deploy_job,
|
36
|
-
delete_job,
|
37
|
-
authenticate_user,
|
38
|
-
load_user_session,
|
39
|
-
user_logout,
|
40
|
-
is_connected,
|
41
|
-
list_available_pools,
|
42
|
-
is_agent_running,
|
43
|
-
is_server,
|
44
|
-
pause_agent,
|
45
|
-
resume_agent,
|
46
|
-
get_ip_addresses,
|
47
|
-
get_pool_token,
|
48
|
-
delete_nodes,
|
49
|
-
cordon_nodes,
|
50
|
-
uncordon_nodes,
|
51
|
-
add_node_labels,
|
52
|
-
get_node_labels,
|
53
|
-
TokenType
|
54
|
-
)
|
55
|
-
from kalavai_client.utils import load_user_id
|
56
|
-
|
57
|
-
app = FastAPI()
|
58
|
-
|
59
|
-
################################
|
60
|
-
## API Key Validation methods ##
|
61
|
-
################################
|
62
|
-
async def verify_api_key(request: Request):
|
63
|
-
user_id = load_user_id()
|
64
|
-
if user_id is None:
|
65
|
-
return None
|
66
|
-
api_key = request.headers.get("X-API-KEY")
|
67
|
-
if api_key != user_id:
|
68
|
-
raise HTTPException(status_code=401, detail="Request requires API Key")
|
69
|
-
return api_key
|
70
|
-
|
71
|
-
@app.post("/create_pool")
|
72
|
-
def pool_create(request: CreatePoolRequest, api_key: str = Depends(verify_api_key)):
|
73
|
-
result = create_pool(
|
74
|
-
cluster_name=request.cluster_name,
|
75
|
-
ip_address=request.ip_address,
|
76
|
-
app_values=request.app_values,
|
77
|
-
num_gpus=request.num_gpus,
|
78
|
-
node_name=request.node_name,
|
79
|
-
only_registered_users=request.only_registered_users,
|
80
|
-
location=request.location,
|
81
|
-
description=request.description,
|
82
|
-
token_mode=request.token_mode,
|
83
|
-
frontend=request.frontend
|
84
|
-
)
|
85
|
-
return result
|
86
|
-
|
87
|
-
@app.post("/join_pool")
|
88
|
-
def pool_join(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
|
89
|
-
result = join_pool(
|
90
|
-
token=request.token,
|
91
|
-
num_gpus=request.num_gpus,
|
92
|
-
node_name=request.node_name,
|
93
|
-
ip_address=request.ip_address,
|
94
|
-
frontend=request.frontend
|
95
|
-
)
|
96
|
-
return result
|
97
|
-
|
98
|
-
@app.post("/attach_to_pool")
|
99
|
-
def pool_attach(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
|
100
|
-
result = attach_to_pool(
|
101
|
-
token=request.token,
|
102
|
-
node_name=request.node_name,
|
103
|
-
frontend=request.frontend
|
104
|
-
)
|
105
|
-
return result
|
106
|
-
|
107
|
-
@app.post("/stop_pool")
|
108
|
-
def pool_stop(request: StopPoolRequest, api_key: str = Depends(verify_api_key)):
|
109
|
-
result = stop_pool(
|
110
|
-
skip_node_deletion=request.skip_node_deletion
|
111
|
-
)
|
112
|
-
return result
|
113
|
-
|
114
|
-
@app.post("/delete_nodes")
|
115
|
-
def device_delete(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
|
116
|
-
result = delete_nodes(
|
117
|
-
nodes=request.nodes
|
118
|
-
)
|
119
|
-
return result
|
120
|
-
|
121
|
-
@app.post("/cordon_nodes")
|
122
|
-
def device_cordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
|
123
|
-
result = cordon_nodes(
|
124
|
-
nodes=request.nodes
|
125
|
-
)
|
126
|
-
return result
|
127
|
-
|
128
|
-
@app.post("/uncordon_nodes")
|
129
|
-
def device_uncordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
|
130
|
-
result = uncordon_nodes(
|
131
|
-
nodes=request.nodes
|
132
|
-
)
|
133
|
-
return result
|
134
|
-
|
135
|
-
@app.get("/get_pool_token")
|
136
|
-
def get_token(mode: int, api_key: str = Depends(verify_api_key)):
|
137
|
-
|
138
|
-
return get_pool_token(mode=TokenType(mode))
|
139
|
-
|
140
|
-
@app.get("/fetch_devices")
|
141
|
-
def get_devices(api_key: str = Depends(verify_api_key)):
|
142
|
-
return fetch_devices()
|
143
|
-
|
144
|
-
@app.post("/send_pool_invites")
|
145
|
-
def send_pool_invites(request: InvitesRequest, api_key: str = Depends(verify_api_key)):
|
146
|
-
return send_invites(invitees=request.invitees)
|
147
|
-
|
148
|
-
@app.get("/fetch_resources")
|
149
|
-
def resources(api_key: str = Depends(verify_api_key)):
|
150
|
-
return fetch_resources()
|
151
|
-
|
152
|
-
@app.get("/fetch_job_names")
|
153
|
-
def job_names(api_key: str = Depends(verify_api_key)):
|
154
|
-
return fetch_job_names()
|
155
|
-
|
156
|
-
@app.get("/fetch_gpus")
|
157
|
-
def gpus(available: bool = False, api_key: str = Depends(verify_api_key)):
|
158
|
-
return fetch_gpus(available=available)
|
159
|
-
|
160
|
-
@app.post("/fetch_job_details")
|
161
|
-
def job_details(request: JobDetailsRequest, api_key: str = Depends(verify_api_key)):
|
162
|
-
return fetch_job_details(jobs=request.jobs)
|
163
|
-
|
164
|
-
@app.get("/fetch_job_logs")
|
165
|
-
def job_logs(job_name: str, force_namespace: str=None, pod_name: str=None, tail: int=100, api_key: str = Depends(verify_api_key)):
|
166
|
-
return fetch_job_logs(
|
167
|
-
job_name=job_name,
|
168
|
-
force_namespace=force_namespace,
|
169
|
-
pod_name=pod_name,
|
170
|
-
tail=tail
|
171
|
-
)
|
172
|
-
|
173
|
-
@app.get("/fetch_job_templates")
|
174
|
-
def job_templates(api_key: str = Depends(verify_api_key)):
|
175
|
-
return fetch_job_templates()
|
176
|
-
|
177
|
-
@app.get("/fetch_job_defaults")
|
178
|
-
def job_templates(name: str, api_key: str = Depends(verify_api_key)):
|
179
|
-
return fetch_job_defaults(name=name)
|
180
|
-
|
181
|
-
@app.post("/deploy_job")
|
182
|
-
def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)):
|
183
|
-
result = deploy_job(
|
184
|
-
template_name=request.template_name,
|
185
|
-
values_dict=request.values,
|
186
|
-
force_namespace=request.force_namespace,
|
187
|
-
target_labels=request.target_labels
|
188
|
-
)
|
189
|
-
return result
|
190
|
-
|
191
|
-
@app.post("/delete_job")
|
192
|
-
def job_delete(request: DeleteJobRequest, api_key: str = Depends(verify_api_key)):
|
193
|
-
result = delete_job(
|
194
|
-
name=request.name,
|
195
|
-
force_namespace=request.force_namespace
|
196
|
-
)
|
197
|
-
return result
|
198
|
-
|
199
|
-
@app.get("/authenticate_user")
|
200
|
-
def user_authenticate(user_id: str, api_key: str = Depends(verify_api_key)):
|
201
|
-
result = authenticate_user(
|
202
|
-
user_id=user_id
|
203
|
-
)
|
204
|
-
return result
|
205
|
-
|
206
|
-
@app.get("/load_user_session")
|
207
|
-
def user_session(api_key: str = Depends(verify_api_key)):
|
208
|
-
result = load_user_session()
|
209
|
-
return result
|
210
|
-
|
211
|
-
@app.get("/user_logout")
|
212
|
-
def logout_user():
|
213
|
-
result = user_logout()
|
214
|
-
return result
|
215
|
-
|
216
|
-
@app.get("/is_connected")
|
217
|
-
def pool_connected():
|
218
|
-
result = is_connected()
|
219
|
-
return result
|
220
|
-
|
221
|
-
@app.get("/is_agent_running")
|
222
|
-
def agent_running():
|
223
|
-
result = is_agent_running()
|
224
|
-
return result
|
225
|
-
|
226
|
-
@app.get("/is_server")
|
227
|
-
def server():
|
228
|
-
result = is_server()
|
229
|
-
return result
|
230
|
-
|
231
|
-
@app.post("/pause_agent")
|
232
|
-
def agent_pause():
|
233
|
-
result = pause_agent()
|
234
|
-
return result
|
235
|
-
|
236
|
-
@app.post("/resume_agent")
|
237
|
-
def agent_resume():
|
238
|
-
result = resume_agent()
|
239
|
-
return result
|
240
|
-
|
241
|
-
@app.get("/get_ip_addresses")
|
242
|
-
def ip_addresses(subnet: str=None, api_key: str = Depends(verify_api_key)):
|
243
|
-
result = get_ip_addresses(subnet=subnet)
|
244
|
-
return result
|
245
|
-
|
246
|
-
@app.get("/list_available_pools")
|
247
|
-
def pool_connected(user_only: bool=False, api_key: str = Depends(verify_api_key)):
|
248
|
-
result = list_available_pools(user_only=user_only)
|
249
|
-
return result
|
250
|
-
|
251
|
-
@app.post("/add_node_labels")
|
252
|
-
def node_labels(request: NodeLabelsRequest, api_key: str = Depends(verify_api_key)):
|
253
|
-
result = add_node_labels(
|
254
|
-
node_name=request.node_name,
|
255
|
-
labels=request.labels
|
256
|
-
)
|
257
|
-
return result
|
258
|
-
|
259
|
-
@app.post("/get_node_labels")
|
260
|
-
def node_labels_get(request: GetNodeLabelsRequest, api_key: str = Depends(verify_api_key)):
|
261
|
-
result = get_node_labels(
|
262
|
-
node_names=request.node_names
|
263
|
-
)
|
264
|
-
return result
|
265
|
-
|
266
|
-
def run_api(host="0.0.0.0", port=8001, log_level="critical"):
|
267
|
-
uvicorn.run(
|
268
|
-
app,
|
269
|
-
host=host,
|
270
|
-
port=port,
|
271
|
-
log_level=log_level
|
272
|
-
)
|
273
|
-
|
274
|
-
if __name__ == "__main__":
|
275
|
-
run_api()
|
276
|
-
|
@@ -1,53 +0,0 @@
|
|
1
|
-
from pydantic import BaseModel
|
2
|
-
from typing import List, Dict, Optional
|
3
|
-
|
4
|
-
from kalavai_client.core import Job, TokenType
|
5
|
-
|
6
|
-
|
7
|
-
class InvitesRequest(BaseModel):
|
8
|
-
invitees: list[str]
|
9
|
-
|
10
|
-
class CreatePoolRequest(BaseModel):
|
11
|
-
cluster_name: str
|
12
|
-
ip_address: str
|
13
|
-
app_values: dict = None
|
14
|
-
num_gpus: int = None
|
15
|
-
node_name: str = None
|
16
|
-
only_registered_users: bool = False
|
17
|
-
location: str = None
|
18
|
-
token_mode: TokenType = TokenType.USER
|
19
|
-
description: str = ""
|
20
|
-
frontend: bool = False
|
21
|
-
|
22
|
-
class NodesActionRequest(BaseModel):
|
23
|
-
nodes: list[str]
|
24
|
-
|
25
|
-
class JoinPoolRequest(BaseModel):
|
26
|
-
token: str
|
27
|
-
ip_address: str = None
|
28
|
-
node_name: str = None
|
29
|
-
num_gpus: int = None
|
30
|
-
frontend: bool = False
|
31
|
-
class JobDetailsRequest(BaseModel):
|
32
|
-
jobs: list[Job]
|
33
|
-
|
34
|
-
|
35
|
-
class StopPoolRequest(BaseModel):
|
36
|
-
skip_node_deletion: bool = False
|
37
|
-
|
38
|
-
class DeployJobRequest(BaseModel):
|
39
|
-
template_name: str
|
40
|
-
values: dict
|
41
|
-
force_namespace: str = None
|
42
|
-
target_labels: dict[str, str] = None
|
43
|
-
|
44
|
-
class DeleteJobRequest(BaseModel):
|
45
|
-
name: str
|
46
|
-
force_namespace: str = None
|
47
|
-
|
48
|
-
class NodeLabelsRequest(BaseModel):
|
49
|
-
node_name: str
|
50
|
-
labels: Dict[str, str]
|
51
|
-
|
52
|
-
class GetNodeLabelsRequest(BaseModel):
|
53
|
-
node_names: List[str]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/docker-compose-gui.yaml
RENAMED
File without changes
|
File without changes
|
{kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/pool_config_template.yaml
RENAMED
File without changes
|
{kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/pool_config_values.yaml
RENAMED
File without changes
|
File without changes
|
{kalavai_client-0.6.12 → kalavai_client-0.6.14}/kalavai_client/assets/user_workspace_values.yaml
RENAMED
File without changes
|
File without changes
|
File without changes
|