kalavai-client 0.6.17__py3-none-any.whl → 0.6.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kalavai_client/__init__.py +1 -1
- kalavai_client/assets/apps.yaml +3 -3
- kalavai_client/assets/apps_values.yaml +5 -1
- kalavai_client/bridge_api.py +202 -88
- kalavai_client/bridge_models.py +2 -5
- kalavai_client/core.py +6 -2
- kalavai_client/utils.py +25 -0
- {kalavai_client-0.6.17.dist-info → kalavai_client-0.6.18.dist-info}/METADATA +3 -8
- {kalavai_client-0.6.17.dist-info → kalavai_client-0.6.18.dist-info}/RECORD +12 -12
- {kalavai_client-0.6.17.dist-info → kalavai_client-0.6.18.dist-info}/LICENSE +0 -0
- {kalavai_client-0.6.17.dist-info → kalavai_client-0.6.18.dist-info}/WHEEL +0 -0
- {kalavai_client-0.6.17.dist-info → kalavai_client-0.6.18.dist-info}/entry_points.txt +0 -0
kalavai_client/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
|
2
|
-
__version__ = "0.6.17"
|
2
|
+
__version__ = "0.6.18"
|
kalavai_client/assets/apps.yaml
CHANGED
@@ -113,7 +113,7 @@ releases:
|
|
113
113
|
- name: kuberay
|
114
114
|
namespace: kuberay
|
115
115
|
chart: kuberay/kuberay-operator
|
116
|
-
installed:
|
116
|
+
installed: {{deploy_kuberay}}
|
117
117
|
version: "1.2.2"
|
118
118
|
- name: kuberay-apiserver
|
119
119
|
namespace: kuberay
|
@@ -154,7 +154,7 @@ releases:
|
|
154
154
|
- name: replicas
|
155
155
|
value: 1
|
156
156
|
- name: image_tag
|
157
|
-
value: "v2025.
|
157
|
+
value: "v2025.07.31"
|
158
158
|
- name: deployment.in_cluster
|
159
159
|
value: "True"
|
160
160
|
- name: deployment.kalavai_username_key
|
@@ -184,7 +184,7 @@ releases:
|
|
184
184
|
- name: nvidia-gpu-operator
|
185
185
|
namespace: kalavai
|
186
186
|
chart: kalavai/gpu
|
187
|
-
installed:
|
187
|
+
installed: false
|
188
188
|
- name: hami-vgpu
|
189
189
|
namespace: kalavai
|
190
190
|
chart: kalavai/hami
|
@@ -8,13 +8,17 @@
|
|
8
8
|
description: "Deploy Opencost cost monitoring system"
|
9
9
|
|
10
10
|
- name: deploy_prometheus
|
11
|
-
default: "
|
11
|
+
default: "False"
|
12
12
|
description: "Deploy Prometheus system monitoring system"
|
13
13
|
|
14
14
|
- name: deploy_langfuse
|
15
15
|
default: "False"
|
16
16
|
description: "Deploy Langfuse LLM tracing system"
|
17
17
|
|
18
|
+
- name: deploy_kuberay
|
19
|
+
default: "False"
|
20
|
+
description: "Deploy KubeRay operator"
|
21
|
+
|
18
22
|
######
|
19
23
|
|
20
24
|
### VARIABLES ###
|
kalavai_client/bridge_api.py
CHANGED
@@ -2,10 +2,13 @@
|
|
2
2
|
Core kalavai service.
|
3
3
|
Used as a bridge between the kalavai-client app and the reflex frontend
|
4
4
|
"""
|
5
|
-
from fastapi import FastAPI, HTTPException, Depends
|
5
|
+
from fastapi import FastAPI, HTTPException, Depends, Query, Body
|
6
|
+
from typing import Optional, List
|
7
|
+
from fastapi_mcp import FastApiMCP
|
6
8
|
from starlette.requests import Request
|
7
9
|
import uvicorn
|
8
10
|
|
11
|
+
from kalavai_client.core import Job
|
9
12
|
from kalavai_client.bridge_models import (
|
10
13
|
CreatePoolRequest,
|
11
14
|
InvitesRequest,
|
@@ -15,8 +18,7 @@ from kalavai_client.bridge_models import (
|
|
15
18
|
DeleteJobRequest,
|
16
19
|
JobDetailsRequest,
|
17
20
|
NodesActionRequest,
|
18
|
-
NodeLabelsRequest
|
19
|
-
GetNodeLabelsRequest
|
21
|
+
NodeLabelsRequest
|
20
22
|
)
|
21
23
|
from kalavai_client.core import (
|
22
24
|
create_pool,
|
@@ -52,7 +54,10 @@ from kalavai_client.core import (
|
|
52
54
|
get_node_labels,
|
53
55
|
TokenType
|
54
56
|
)
|
55
|
-
from kalavai_client.utils import
|
57
|
+
from kalavai_client.utils import (
|
58
|
+
load_user_id,
|
59
|
+
extract_auth_token
|
60
|
+
)
|
56
61
|
|
57
62
|
app = FastAPI(
|
58
63
|
title="Kalavai Bridge API",
|
@@ -73,14 +78,16 @@ async def verify_api_key(request: Request):
|
|
73
78
|
user_id = load_user_id()
|
74
79
|
if user_id is None:
|
75
80
|
return None
|
76
|
-
api_key = request.headers
|
81
|
+
api_key = extract_auth_token(headers=request.headers)
|
77
82
|
if api_key != user_id:
|
78
83
|
raise HTTPException(status_code=401, detail="Request requires API Key")
|
79
84
|
return api_key
|
80
85
|
|
81
86
|
@app.post("/create_pool",
|
82
|
-
|
83
|
-
|
87
|
+
operation_id="create_pool",
|
88
|
+
summary="Create a new Kalavai compute pool",
|
89
|
+
tags=["pool_management"],
|
90
|
+
description="Creates a new distributed compute pool that allows multiple nodes to join and share GPU resources. The pool acts as a Kubernetes cluster where users can deploy and manage machine learning jobs across multiple devices.",
|
84
91
|
response_description="Result of pool creation")
|
85
92
|
def pool_create(request: CreatePoolRequest, api_key: str = Depends(verify_api_key)):
|
86
93
|
"""
|
@@ -109,8 +116,10 @@ def pool_create(request: CreatePoolRequest, api_key: str = Depends(verify_api_ke
|
|
109
116
|
return result
|
110
117
|
|
111
118
|
@app.post("/join_pool",
|
112
|
-
|
113
|
-
|
119
|
+
operation_id="join_pool",
|
120
|
+
summary="Join an existing Kalavai pool as a compute node",
|
121
|
+
description="Joins a running Kalavai pool by providing a valid join token. This endpoint registers the current machine as a compute node in the pool, making its GPU resources available for job scheduling. The node will receive workloads based on the pool's scheduling policy.",
|
122
|
+
tags=["pool_management"],
|
114
123
|
response_description="Result of joining the pool")
|
115
124
|
def pool_join(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
|
116
125
|
"""
|
@@ -130,8 +139,10 @@ def pool_join(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
|
|
130
139
|
return result
|
131
140
|
|
132
141
|
@app.post("/attach_to_pool",
|
133
|
-
|
134
|
-
|
142
|
+
operation_id="attach_to_pool",
|
143
|
+
summary="Attach to a pool for management purposes",
|
144
|
+
description="Attaches to an existing Kalavai pool for administrative and monitoring purposes without contributing compute resources. This is typically used by frontend applications or management tools that need to interact with the pool but don't provide GPU resources.",
|
145
|
+
tags=["pool_management"],
|
135
146
|
response_description="Result of attaching to the pool")
|
136
147
|
def pool_attach(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
|
137
148
|
"""
|
@@ -149,8 +160,10 @@ def pool_attach(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)
|
|
149
160
|
return result
|
150
161
|
|
151
162
|
@app.post("/stop_pool",
|
152
|
-
|
153
|
-
|
163
|
+
operation_id="stop_pool",
|
164
|
+
summary="Stop and clean up the current Kalavai pool",
|
165
|
+
description="Gracefully shuts down the current Kalavai pool, terminating all running jobs and optionally removing all compute nodes from the cluster. This operation is irreversible and will disconnect all nodes from the pool.",
|
166
|
+
tags=["pool_management"],
|
154
167
|
response_description="Result of stopping the pool")
|
155
168
|
def pool_stop(request: StopPoolRequest, api_key: str = Depends(verify_api_key)):
|
156
169
|
"""
|
@@ -164,8 +177,10 @@ def pool_stop(request: StopPoolRequest, api_key: str = Depends(verify_api_key)):
|
|
164
177
|
return result
|
165
178
|
|
166
179
|
@app.post("/delete_nodes",
|
167
|
-
|
168
|
-
|
180
|
+
operation_id="delete_nodes",
|
181
|
+
summary="Remove specific nodes from the pool",
|
182
|
+
description="Removes specified compute nodes from the Kalavai pool. This operation will terminate any jobs running on the target nodes and clean up their resources. Use with caution as it may interrupt running workloads.",
|
183
|
+
tags=["pool_management"],
|
169
184
|
response_description="Result of node deletion")
|
170
185
|
def device_delete(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
|
171
186
|
"""
|
@@ -179,8 +194,10 @@ def device_delete(request: NodesActionRequest, api_key: str = Depends(verify_api
|
|
179
194
|
return result
|
180
195
|
|
181
196
|
@app.post("/cordon_nodes",
|
182
|
-
|
183
|
-
|
197
|
+
operation_id="cordon_nodes",
|
198
|
+
summary="Mark nodes as unschedulable",
|
199
|
+
description="Marks specified nodes as unschedulable, preventing new jobs from being assigned to them while allowing existing jobs to complete. This is useful for maintenance operations or when you want to gradually remove nodes from the pool.",
|
200
|
+
tags=["pool_management"],
|
184
201
|
response_description="Result of cordoning nodes")
|
185
202
|
def device_cordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
|
186
203
|
"""
|
@@ -194,8 +211,10 @@ def device_cordon(request: NodesActionRequest, api_key: str = Depends(verify_api
|
|
194
211
|
return result
|
195
212
|
|
196
213
|
@app.post("/uncordon_nodes",
|
197
|
-
|
198
|
-
|
214
|
+
operation_id="uncordon_nodes",
|
215
|
+
summary="Mark nodes as schedulable again",
|
216
|
+
description="Re-enables job scheduling on previously cordoned nodes, allowing them to receive new workloads. This reverses the effect of the cordon operation.",
|
217
|
+
tags=["pool_management"],
|
199
218
|
response_description="Result of uncordoning nodes")
|
200
219
|
def device_uncordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
|
201
220
|
"""
|
@@ -209,8 +228,10 @@ def device_uncordon(request: NodesActionRequest, api_key: str = Depends(verify_a
|
|
209
228
|
return result
|
210
229
|
|
211
230
|
@app.get("/get_pool_token",
|
212
|
-
|
213
|
-
|
231
|
+
operation_id="get_pool_token",
|
232
|
+
summary="Generate a token for pool access",
|
233
|
+
description="Generates a secure token that can be used to join or attach to the current Kalavai pool. Different token types provide different levels of access - join tokens allow nodes to contribute resources, while attach tokens allow management access.",
|
234
|
+
tags=["auth"],
|
214
235
|
response_description="Pool token")
|
215
236
|
def get_token(mode: int, api_key: str = Depends(verify_api_key)):
|
216
237
|
"""
|
@@ -221,16 +242,20 @@ def get_token(mode: int, api_key: str = Depends(verify_api_key)):
|
|
221
242
|
return get_pool_token(mode=TokenType(mode))
|
222
243
|
|
223
244
|
@app.get("/fetch_devices",
|
224
|
-
|
225
|
-
|
245
|
+
operation_id="fetch_devices",
|
246
|
+
summary="Get list of all compute devices in the pool",
|
247
|
+
description="Retrieves information about all compute devices (nodes) currently connected to the Kalavai pool, including their status, available resources, and current workload distribution.",
|
248
|
+
tags=["info"],
|
226
249
|
response_description="List of devices")
|
227
250
|
def get_devices(api_key: str = Depends(verify_api_key)):
|
228
251
|
"""Get list of available devices"""
|
229
252
|
return fetch_devices()
|
230
253
|
|
231
254
|
@app.post("/send_pool_invites",
|
232
|
-
|
233
|
-
|
255
|
+
operation_id="send_pool_invites",
|
256
|
+
summary="Send invitations to join the pool",
|
257
|
+
description="Sends invitations to potential users or nodes to join the current Kalavai pool. Invitees will receive tokens that allow them to connect to the pool and contribute their resources.",
|
258
|
+
tags=["avoid"],
|
234
259
|
response_description="Result of sending invites")
|
235
260
|
def send_pool_invites(request: InvitesRequest, api_key: str = Depends(verify_api_key)):
|
236
261
|
"""
|
@@ -241,24 +266,30 @@ def send_pool_invites(request: InvitesRequest, api_key: str = Depends(verify_api
|
|
241
266
|
return send_invites(invitees=request.invitees)
|
242
267
|
|
243
268
|
@app.get("/fetch_resources",
|
244
|
-
|
245
|
-
|
269
|
+
operation_id="fetch_resources",
|
270
|
+
summary="Get resource utilization for specific nodes",
|
271
|
+
description="Retrieves detailed resource information (CPU, memory, GPU usage) for the pool; optionally for a list of specified nodes in the pool (as {'nodes': node_list}). This helps monitor resource utilization and plan workload distribution.",
|
272
|
+
tags=["info"],
|
246
273
|
response_description="Resource information")
|
247
|
-
def resources(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
|
274
|
+
def resources(request: Optional[NodesActionRequest]=NodesActionRequest(), api_key: str = Depends(verify_api_key)):
|
248
275
|
"""Get available resources"""
|
249
276
|
return fetch_resources(node_names=request.nodes)
|
250
277
|
|
251
278
|
@app.get("/fetch_job_names",
|
252
|
-
|
253
|
-
|
279
|
+
operation_id="fetch_job_names",
|
280
|
+
summary="Get list of all jobs (model deployments) in the pool",
|
281
|
+
description="Retrieves the names of all jobs and models currently deployed or scheduled in the Kalavai pool. This provides an overview of all workloads in the system.",
|
282
|
+
tags=["info"],
|
254
283
|
response_description="List of job names")
|
255
284
|
def job_names(api_key: str = Depends(verify_api_key)):
|
256
285
|
"""Get list of job names"""
|
257
286
|
return fetch_job_names()
|
258
287
|
|
259
288
|
@app.get("/fetch_gpus",
|
260
|
-
|
261
|
-
|
289
|
+
operation_id="fetch_gpus",
|
290
|
+
summary="Get GPU information across the pool",
|
291
|
+
description="Retrieves detailed information about all GPUs in the Kalavai pool, including their availability status, current utilization, and which jobs are using them. Can filter to show only available GPUs.",
|
292
|
+
tags=["info"],
|
262
293
|
response_description="List of GPUs")
|
263
294
|
def gpus(available: bool = False, api_key: str = Depends(verify_api_key)):
|
264
295
|
"""
|
@@ -269,26 +300,26 @@ def gpus(available: bool = False, api_key: str = Depends(verify_api_key)):
|
|
269
300
|
return fetch_gpus(available=available)
|
270
301
|
|
271
302
|
@app.post("/fetch_job_details",
|
272
|
-
|
273
|
-
|
303
|
+
operation_id="fetch_job_details",
|
304
|
+
summary="Get detailed information about specific job and model deployments",
|
305
|
+
description="Given a list of jobs (as {'jobs': [{'name': job_name}]}'), retrieves comprehensive information about specified jobs or models including their status, resource usage, runtime, and configuration. Useful for monitoring and debugging job execution.",
|
306
|
+
tags=["info"],
|
274
307
|
response_description="Job details")
|
275
308
|
def job_details(request: JobDetailsRequest, api_key: str = Depends(verify_api_key)):
|
276
|
-
"""
|
277
|
-
Get job details with the following parameters:
|
278
|
-
|
279
|
-
- **jobs**: List of jobs to get details for
|
280
|
-
"""
|
309
|
+
"""Get job details"""
|
281
310
|
return fetch_job_details(jobs=request.jobs)
|
282
311
|
|
283
312
|
@app.get("/fetch_job_logs",
|
284
|
-
|
285
|
-
|
313
|
+
operation_id="fetch_job_logs",
|
314
|
+
summary="Get execution logs for a specific job",
|
315
|
+
description="Retrieves the execution logs for a specified job, providing real-time or historical output from the job's containers. Useful for debugging, monitoring progress, and understanding job behavior.",
|
316
|
+
tags=["info", "avoid"],
|
286
317
|
response_description="Job logs")
|
287
318
|
def job_logs(
|
288
319
|
job_name: str,
|
289
|
-
force_namespace: str = None,
|
290
|
-
pod_name: str = None,
|
291
|
-
tail: int = 100,
|
320
|
+
force_namespace: str = Query(None),
|
321
|
+
pod_name: str = Query(None),
|
322
|
+
tail: int = Query(100),
|
292
323
|
api_key: str = Depends(verify_api_key)
|
293
324
|
):
|
294
325
|
"""
|
@@ -307,28 +338,68 @@ def job_logs(
|
|
307
338
|
)
|
308
339
|
|
309
340
|
@app.get("/fetch_job_templates",
|
310
|
-
|
311
|
-
|
341
|
+
operation_id="fetch_job_templates",
|
342
|
+
summary="Get available job templates",
|
343
|
+
description="Retrieves a list of all available job templates that can be used to deploy workloads. Templates provide predefined configurations for frameworks.",
|
344
|
+
tags=["info"],
|
312
345
|
response_description="List of job templates")
|
313
346
|
def job_templates(api_key: str = Depends(verify_api_key)):
|
314
|
-
"""Get available job templates"""
|
315
347
|
return fetch_job_templates()
|
316
348
|
|
349
|
+
@app.get(
    "/fetch_model_templates",
    operation_id="fetch_model_templates",
    summary="Get available model engines templates",
    description="Retrieves a list of all available model engine templates that can be used to deploy models. Templates provide predefined configurations for model engine frameworks.",
    tags=["info"],
    response_description="List of model engine templates",
)
def model_templates(api_key: str = Depends(verify_api_key)):
    # Reuses the job-template lookup, restricted to templates of type "model".
    # api_key is only declared so the verify_api_key dependency runs; it is
    # not read in the body.
    model_engine_templates = fetch_job_templates(type="model")
    return model_engine_templates
|
357
|
+
|
317
358
|
@app.get("/fetch_job_defaults",
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
""
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
359
|
+
operation_id="fetch_job_defaults",
|
360
|
+
summary="Get default values for a job or model engine template deployment",
|
361
|
+
description="Retrieves the default values for a specific job or model engine template deployment. This helps users understand what parameters are required and what their default values are before deploying a job.",
|
362
|
+
tags=["info"],
|
363
|
+
response_description="Job and model engine default values")
|
364
|
+
def job_defaults(name: str, api_key: str = Depends(verify_api_key)):
|
365
|
+
result = fetch_job_defaults(name=name)
|
366
|
+
return result["defaults"]
|
367
|
+
|
368
|
+
@app.get(
    "/fetch_job_metadata",
    operation_id="fetch_job_metadata",
    summary="Get metadata with information about a given job or model engine template deployment",
    description="Retrieves the metadata associated with a specific job or model engine template deployment. This helps users understand what the template can be used for.",
    tags=["info"],
    response_description="Job and model engine metadata values",
)
def job_metadata(name: str, api_key: str = Depends(verify_api_key)):
    # Look up the template named `name` and hand back only the "metadata"
    # entry of what fetch_job_defaults returns.
    return fetch_job_defaults(name=name)["metadata"]
|
377
|
+
|
378
|
+
@app.get(
    "/fetch_job_rules",
    operation_id="fetch_job_rules",
    summary="Get the rules associated with the use of a given job or model engine template",
    description="Retrieves the rules associated with a specific job or model engine template deployment. This helps users and AI agents determine if a given model engine template is adequate for the task.",
    tags=["info"],
    response_description="Job and model engine rules",
)
def job_rules(name: str, api_key: str = Depends(verify_api_key)):
    # job_metadata is invoked as a plain function here (not through the
    # router); only `name` is forwarded, and the "template_rules" entry of
    # its result is returned.
    template_metadata = job_metadata(name=name)
    return template_metadata["template_rules"]
|
387
|
+
|
388
|
+
@app.get(
    "/fetch_job_values_rules",
    operation_id="fetch_job_values_rules",
    summary="Get information on how to provide values to the parameters of a specific job or model engine template",
    description="Retrieves information necessary to fill up the values required to deploy a specific job or model engine template. This helps users and AI agents generate the values dictionary for a job or model engine template deployment.",
    tags=["info"],
    response_description="Job and model engine info for values",
)
def job_values_rules(name: str, api_key: str = Depends(verify_api_key)):
    # job_metadata is invoked as a plain function here (not through the
    # router); only `name` is forwarded, and the "values_rules" entry of
    # its result is returned.
    template_metadata = job_metadata(name=name)
    return template_metadata["values_rules"]
|
328
397
|
|
329
398
|
@app.post("/deploy_job",
|
330
|
-
|
331
|
-
|
399
|
+
operation_id="deploy_job",
|
400
|
+
summary="Deploy a new job to the pool",
|
401
|
+
description="Deploys a new job to the Kalavai pool using a specified template and configuration. The job will be scheduled on appropriate nodes based on resource availability and any specified target labels.",
|
402
|
+
tags=["job_management"],
|
332
403
|
response_description="Result of job deployment")
|
333
404
|
def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)):
|
334
405
|
"""
|
@@ -348,8 +419,10 @@ def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)
|
|
348
419
|
return result
|
349
420
|
|
350
421
|
@app.post("/delete_job",
|
351
|
-
|
352
|
-
|
422
|
+
operation_id="delete_job",
|
423
|
+
summary="Terminate and remove a job from the pool",
|
424
|
+
description="Terminates a running job and removes it from the Kalavai pool. This will stop all containers associated with the job and free up the resources they were using.",
|
425
|
+
tags=["job_management"],
|
353
426
|
response_description="Result of job deletion")
|
354
427
|
def job_delete(request: DeleteJobRequest, api_key: str = Depends(verify_api_key)):
|
355
428
|
"""
|
@@ -365,8 +438,10 @@ def job_delete(request: DeleteJobRequest, api_key: str = Depends(verify_api_key)
|
|
365
438
|
return result
|
366
439
|
|
367
440
|
@app.get("/authenticate_user",
|
368
|
-
|
369
|
-
|
441
|
+
operation_id="authenticate_user",
|
442
|
+
summary="Authenticate a user with the Kalavai system",
|
443
|
+
description="Authenticates a user against the Kalavai system, establishing their identity and permissions. This is required for accessing pool management features and deploying jobs.",
|
444
|
+
tags=["info", "auth"],
|
370
445
|
response_description="Authentication result")
|
371
446
|
def user_authenticate(user_id: str, api_key: str = Depends(verify_api_key)):
|
372
447
|
"""
|
@@ -380,8 +455,10 @@ def user_authenticate(user_id: str, api_key: str = Depends(verify_api_key)):
|
|
380
455
|
return result
|
381
456
|
|
382
457
|
@app.get("/load_user_session",
|
383
|
-
|
384
|
-
|
458
|
+
operation_id="load_user_session",
|
459
|
+
summary="Load current user session information",
|
460
|
+
description="Retrieves information about the currently authenticated user's session, including their identity, permissions, and any active connections to pools.",
|
461
|
+
tags=["info", "auth"],
|
385
462
|
response_description="User session information")
|
386
463
|
def user_session(api_key: str = Depends(verify_api_key)):
|
387
464
|
"""Load the current user session"""
|
@@ -389,8 +466,10 @@ def user_session(api_key: str = Depends(verify_api_key)):
|
|
389
466
|
return result
|
390
467
|
|
391
468
|
@app.get("/user_logout",
|
392
|
-
|
393
|
-
|
469
|
+
operation_id="user_logout",
|
470
|
+
summary="Log out the current user",
|
471
|
+
description="Terminates the current user's session and clears authentication credentials. This should be called when the user is done using the system to ensure proper cleanup.",
|
472
|
+
tags=["auth"],
|
394
473
|
response_description="Logout result")
|
395
474
|
def logout_user():
|
396
475
|
"""Log out the current user"""
|
@@ -398,8 +477,10 @@ def logout_user():
|
|
398
477
|
return result
|
399
478
|
|
400
479
|
@app.get("/is_connected",
|
401
|
-
|
402
|
-
|
480
|
+
operation_id="is_connected",
|
481
|
+
summary="Check if connected to a Kalavai pool",
|
482
|
+
description="Verifies whether the current instance is connected to a Kalavai pool. Returns connection status and pool information if connected.",
|
483
|
+
tags=["agent_management"],
|
403
484
|
response_description="Connection status")
|
404
485
|
def pool_connected():
|
405
486
|
"""Check if connected to a pool"""
|
@@ -407,8 +488,10 @@ def pool_connected():
|
|
407
488
|
return result
|
408
489
|
|
409
490
|
@app.get("/is_agent_running",
|
410
|
-
|
411
|
-
|
491
|
+
operation_id="is_agent_running",
|
492
|
+
summary="Check if the Kalavai agent is running",
|
493
|
+
description="Verifies whether the Kalavai agent service is currently running on this machine. The agent is responsible for managing pool connections and job execution.",
|
494
|
+
tags=["agent_management"],
|
412
495
|
response_description="Agent status")
|
413
496
|
def agent_running():
|
414
497
|
"""Check if the agent is running"""
|
@@ -416,8 +499,10 @@ def agent_running():
|
|
416
499
|
return result
|
417
500
|
|
418
501
|
@app.get("/is_server",
|
419
|
-
|
420
|
-
|
502
|
+
operation_id="is_server",
|
503
|
+
summary="Check if running as a pool server",
|
504
|
+
description="Determines whether this instance is running as a Kalavai pool server (coordinator) or as a client node. Server instances manage the pool while client instances contribute resources.",
|
505
|
+
tags=["agent_management"],
|
421
506
|
response_description="Server status")
|
422
507
|
def server():
|
423
508
|
"""Check if running as server"""
|
@@ -425,8 +510,10 @@ def server():
|
|
425
510
|
return result
|
426
511
|
|
427
512
|
@app.post("/pause_agent",
|
428
|
-
|
429
|
-
|
513
|
+
operation_id="pause_agent",
|
514
|
+
summary="Pause the Kalavai agent service",
|
515
|
+
description="Temporarily pauses the Kalavai agent, stopping it from accepting new jobs or participating in pool operations. Existing jobs will continue running until completion.",
|
516
|
+
tags=["agent_management"],
|
430
517
|
response_description="Result of pausing agent")
|
431
518
|
def agent_pause():
|
432
519
|
"""Pause the agent"""
|
@@ -434,8 +521,10 @@ def agent_pause():
|
|
434
521
|
return result
|
435
522
|
|
436
523
|
@app.post("/resume_agent",
|
437
|
-
|
438
|
-
|
524
|
+
operation_id="resume_agent",
|
525
|
+
summary="Resume the Kalavai agent service",
|
526
|
+
description="Resumes the previously paused Kalavai agent, allowing it to accept new jobs and participate in pool operations again.",
|
527
|
+
tags=["agent_management"],
|
439
528
|
response_description="Result of resuming agent")
|
440
529
|
def agent_resume():
|
441
530
|
"""Resume the agent"""
|
@@ -443,8 +532,10 @@ def agent_resume():
|
|
443
532
|
return result
|
444
533
|
|
445
534
|
@app.get("/get_ip_addresses",
|
446
|
-
|
447
|
-
|
535
|
+
operation_id="get_ip_addresses",
|
536
|
+
summary="Get available IP addresses for pool configuration",
|
537
|
+
description="Retrieves a list of available IP addresses that can be used for pool configuration. Optionally filters by subnet to help with network planning and pool setup.",
|
538
|
+
tags=["agent_management"],
|
448
539
|
response_description="List of IP addresses")
|
449
540
|
def ip_addresses(subnet: str = None, api_key: str = Depends(verify_api_key)):
|
450
541
|
"""
|
@@ -456,8 +547,10 @@ def ip_addresses(subnet: str = None, api_key: str = Depends(verify_api_key)):
|
|
456
547
|
return result
|
457
548
|
|
458
549
|
@app.get("/list_available_pools",
|
459
|
-
|
460
|
-
|
550
|
+
operation_id="list_available_pools",
|
551
|
+
summary="List all available Kalavai pools",
|
552
|
+
description="Retrieves a list of all Kalavai pools that are currently available for connection. Can filter to show only pools owned by the current user or all public pools.",
|
553
|
+
tags=["agent_management"],
|
461
554
|
response_description="List of available pools")
|
462
555
|
def pool_connected(user_only: bool = False, api_key: str = Depends(verify_api_key)):
|
463
556
|
"""
|
@@ -469,8 +562,10 @@ def pool_connected(user_only: bool = False, api_key: str = Depends(verify_api_ke
|
|
469
562
|
return result
|
470
563
|
|
471
564
|
@app.post("/add_node_labels",
|
472
|
-
|
473
|
-
|
565
|
+
operation_id="add_node_labels",
|
566
|
+
summary="Add custom labels to a compute node",
|
567
|
+
description="Adds custom labels to a specific compute node in the pool. Labels can be used for job scheduling, resource allocation, and organizational purposes. Labels are key-value pairs that help categorize and identify nodes.",
|
568
|
+
tags=["pool_management"],
|
474
569
|
response_description="Result of adding labels")
|
475
570
|
def node_labels(request: NodeLabelsRequest, api_key: str = Depends(verify_api_key)):
|
476
571
|
"""
|
@@ -485,21 +580,40 @@ def node_labels(request: NodeLabelsRequest, api_key: str = Depends(verify_api_ke
|
|
485
580
|
)
|
486
581
|
return result
|
487
582
|
|
488
|
-
@app.
|
489
|
-
|
490
|
-
|
583
|
+
@app.get("/get_node_labels",
|
584
|
+
operation_id="get_node_labels",
|
585
|
+
summary="Get labels for specified compute nodes",
|
586
|
+
description="Retrieves all labels associated with specified compute nodes in the pool. Labels provide metadata about nodes and can be used for filtering and scheduling decisions.",
|
587
|
+
tags=["info"],
|
491
588
|
response_description="Node labels")
|
492
|
-
def node_labels_get(request:
|
589
|
+
def node_labels_get(request: Optional[NodesActionRequest]=NodesActionRequest(), api_key: str = Depends(verify_api_key)):
|
493
590
|
"""
|
494
591
|
Get node labels with the following parameters:
|
495
592
|
|
496
593
|
- **node_names**: List of node names to get labels for
|
497
594
|
"""
|
498
595
|
result = get_node_labels(
|
499
|
-
node_names=request.
|
596
|
+
node_names=request.nodes
|
500
597
|
)
|
501
598
|
return result
|
502
599
|
|
600
|
+
### BUILD MCP WRAPPER ###
|
601
|
+
mcp = FastApiMCP(
|
602
|
+
app,
|
603
|
+
name="Protected MCP",
|
604
|
+
#exclude_operations=[],
|
605
|
+
exclude_tags=[
|
606
|
+
"auth",
|
607
|
+
"agent_management",
|
608
|
+
"job_management",
|
609
|
+
"pool_management",
|
610
|
+
"avoid"
|
611
|
+
]
|
612
|
+
)
|
613
|
+
mcp.mount()
|
614
|
+
##########################
|
615
|
+
|
616
|
+
|
503
617
|
def run_api(host="0.0.0.0", port=8001, log_level="critical"):
|
504
618
|
uvicorn.run(
|
505
619
|
app,
|
kalavai_client/bridge_models.py
CHANGED
@@ -18,7 +18,7 @@ class CreatePoolRequest(BaseModel):
|
|
18
18
|
description: str = Field("", description="Description of the pool")
|
19
19
|
|
20
20
|
class NodesActionRequest(BaseModel):
|
21
|
-
nodes: list[str] = Field(None, description="List of node names to perform the action on")
|
21
|
+
nodes: list[str] = Field(None, description="List of node names to perform the action on, defaults to None")
|
22
22
|
|
23
23
|
class JoinPoolRequest(BaseModel):
|
24
24
|
token: str = Field(description="Token to join the pool")
|
@@ -26,10 +26,10 @@ class JoinPoolRequest(BaseModel):
|
|
26
26
|
node_name: str = Field(None, description="Name of the node")
|
27
27
|
num_gpus: int = Field(None, description="Number of GPUs to allocate")
|
28
28
|
frontend: bool = Field(False, description="Whether this is a frontend request")
|
29
|
+
|
29
30
|
class JobDetailsRequest(BaseModel):
|
30
31
|
jobs: list[Job] = Field(description="List of jobs to get details for")
|
31
32
|
|
32
|
-
|
33
33
|
class StopPoolRequest(BaseModel):
|
34
34
|
skip_node_deletion: bool = Field(False, description="Whether to skip node deletion when stopping the pool")
|
35
35
|
|
@@ -46,6 +46,3 @@ class DeleteJobRequest(BaseModel):
|
|
46
46
|
class NodeLabelsRequest(BaseModel):
|
47
47
|
node_name: str = Field(description="Name of the node to add labels to")
|
48
48
|
labels: Dict[str, str] = Field(description="Dictionary of labels to add to the node")
|
49
|
-
|
50
|
-
class GetNodeLabelsRequest(BaseModel):
|
51
|
-
node_names: List[str] = Field(description="List of node names to get labels for")
|
kalavai_client/core.py
CHANGED
@@ -74,7 +74,7 @@ from kalavai_client.env import (
|
|
74
74
|
)
|
75
75
|
|
76
76
|
class Job(BaseModel):
|
77
|
-
owner: Optional[str] =
|
77
|
+
owner: Optional[str] = "default"
|
78
78
|
name: Optional[str] = None
|
79
79
|
workers: Optional[str] = None
|
80
80
|
endpoint: Optional[str] = None
|
@@ -238,13 +238,17 @@ def fetch_job_defaults(name):
|
|
238
238
|
except Exception as e:
|
239
239
|
return {"error": str(e)}
|
240
240
|
|
241
|
-
def fetch_job_templates():
|
241
|
+
def fetch_job_templates(type: str=None):
|
242
|
+
data = None
|
243
|
+
if type is not None:
|
244
|
+
data = {"type": type}
|
242
245
|
try:
|
243
246
|
templates = request_to_server(
|
244
247
|
method="get",
|
245
248
|
endpoint="/v1/get_job_templates",
|
246
249
|
server_creds=USER_LOCAL_SERVER_FILE,
|
247
250
|
data=None,
|
251
|
+
params=data,
|
248
252
|
user_cookie=USER_COOKIE
|
249
253
|
)
|
250
254
|
return templates
|
kalavai_client/utils.py
CHANGED
@@ -105,6 +105,29 @@ def is_storage_compatible():
|
|
105
105
|
return False
|
106
106
|
################
|
107
107
|
|
108
|
+
def extract_auth_token(headers):
    """
    Extract the auth token from a mapping of request headers.

    Header names are matched case-insensitively, so any spelling of the
    following is accepted regardless of the mapping type passed in:
        X-API-KEY: token
        X-API-Key: token
        x-api-key: token
        Authorization: Bearer token
        authorization: Bearer token

    ``Authorization`` takes precedence over ``X-API-Key``. When the value
    carries a scheme prefix (e.g. "Bearer token"), only the last
    whitespace-separated part is returned.

    Args:
        headers: mapping-like object exposing ``get`` (and ideally ``items``).

    Returns:
        The token string; None when no auth header is present; an empty
        string when the header value is blank. For malformed input (e.g.
        ``headers`` is None or the value is not a string) a dict of the form
        {"error": message} is returned, preserving the previous contract.
    """
    try:
        token = None
        # Exact spellings first, in the original priority order. Mappings that
        # are already case-insensitive resolve here without a scan.
        for header in ("Authorization", "authorization", "X-API-KEY", "X-API-Key"):
            token = headers.get(header, None)
            if token is not None:
                break

        if token is None:
            # Plain dicts are case-sensitive, so fall back to a lowercased scan
            # to catch spellings such as "x-api-key" or "AUTHORIZATION".
            items = getattr(headers, "items", None)
            if callable(items):
                lowered = {}
                for name, value in items():
                    if value is not None:
                        # setdefault keeps the first occurrence of a name
                        lowered.setdefault(str(name).lower(), value)
                token = lowered.get("authorization", lowered.get("x-api-key"))

        if token is None:
            return None

        # Drop any scheme prefix ("Bearer <token>"). A blank value yields ""
        # instead of raising IndexError on an empty split result.
        parts = token.split()
        return parts[-1] if parts else token.strip()
    except Exception as e:
        return {"error": str(e)}
|
129
|
+
|
130
|
+
|
108
131
|
def generate_compose_config(role, node_name, target_platform="amd64", write_to_file=True, node_ip_address="0.0.0.0", num_gpus=0, node_labels=None, pool_ip=None, vpn_token=None, pool_token=None):
|
109
132
|
|
110
133
|
if node_labels is not None:
|
@@ -268,6 +291,7 @@ def request_to_server(
|
|
268
291
|
endpoint,
|
269
292
|
data,
|
270
293
|
server_creds,
|
294
|
+
params=None,
|
271
295
|
force_url=None,
|
272
296
|
force_key=None,
|
273
297
|
user_cookie=None,
|
@@ -296,6 +320,7 @@ def request_to_server(
|
|
296
320
|
method=method,
|
297
321
|
url=f"http://{service_url}{endpoint}",
|
298
322
|
json=data,
|
323
|
+
params=params,
|
299
324
|
headers=headers,
|
300
325
|
timeout=timeout
|
301
326
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: kalavai-client
|
3
|
-
Version: 0.6.17
|
3
|
+
Version: 0.6.18
|
4
4
|
Summary: Client app for kalavai platform
|
5
5
|
License: Apache-2.0
|
6
6
|
Keywords: LLM,platform
|
@@ -8,15 +8,9 @@ Author: Carlos Fernandez Musoles
|
|
8
8
|
Author-email: carlos@kalavai.net
|
9
9
|
Maintainer: Carlos Fernandez Musoles
|
10
10
|
Maintainer-email: carlos@kalavai.net
|
11
|
-
Requires-Python: >=3.
|
11
|
+
Requires-Python: >=3.10
|
12
12
|
Classifier: License :: OSI Approved :: Apache Software License
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
14
|
-
Classifier: Programming Language :: Python :: 3.4
|
15
|
-
Classifier: Programming Language :: Python :: 3.5
|
16
|
-
Classifier: Programming Language :: Python :: 3.6
|
17
|
-
Classifier: Programming Language :: Python :: 3.7
|
18
|
-
Classifier: Programming Language :: Python :: 3.8
|
19
|
-
Classifier: Programming Language :: Python :: 3.9
|
20
14
|
Classifier: Programming Language :: Python :: 3.10
|
21
15
|
Classifier: Programming Language :: Python :: 3.11
|
22
16
|
Classifier: Programming Language :: Python :: 3.12
|
@@ -26,6 +20,7 @@ Requires-Dist: Pillow (==10.3.0)
|
|
26
20
|
Requires-Dist: arguably (>=1.2.5)
|
27
21
|
Requires-Dist: build ; extra == "dev"
|
28
22
|
Requires-Dist: fastapi (==0.115.8)
|
23
|
+
Requires-Dist: fastapi-mcp (==0.3.0)
|
29
24
|
Requires-Dist: importlib_resources (==6.5.2)
|
30
25
|
Requires-Dist: jinja2 (==3.1.4)
|
31
26
|
Requires-Dist: netifaces (==0.11.0)
|
@@ -1,8 +1,8 @@
|
|
1
|
-
kalavai_client/__init__.py,sha256=
|
1
|
+
kalavai_client/__init__.py,sha256=ErdtY8HWYl_n6MmTR8hlH878NUP9glayQd1egl02vKY,23
|
2
2
|
kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
|
3
3
|
kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
kalavai_client/assets/apps.yaml,sha256=
|
5
|
-
kalavai_client/assets/apps_values.yaml,sha256=
|
4
|
+
kalavai_client/assets/apps.yaml,sha256=17JuXSv-Qj5Az6ZTRyiEaQXVbI325uTrZzKk2irts2g,6410
|
5
|
+
kalavai_client/assets/apps_values.yaml,sha256=LeSNd3PwkIx0wkTIlEk2KNz3Yy4sXSaHALQEkopdhKE,2165
|
6
6
|
kalavai_client/assets/docker-compose-gui.yaml,sha256=shqN78YLw0QP7bqTKveI4ppz5E-5b1JowmsSB4OG3nA,778
|
7
7
|
kalavai_client/assets/docker-compose-template.yaml,sha256=KHIwJ2WWX7Y7wQKiXRr82Jqd3IKRyls5zhTyl8mSmrc,1805
|
8
8
|
kalavai_client/assets/nginx.conf,sha256=drVVCg8GHucz7hmt_BI6giAhK92OV71257NTs3LthwM,225
|
@@ -11,15 +11,15 @@ kalavai_client/assets/pool_config_values.yaml,sha256=_iAnugramLiwJaaDcPSetThvOdR
|
|
11
11
|
kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
|
12
12
|
kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
|
13
13
|
kalavai_client/auth.py,sha256=EB3PMvKUn5_KAQkezkEHEt-OMZXyfkZguIQlUFkEHcA,3243
|
14
|
-
kalavai_client/bridge_api.py,sha256=
|
15
|
-
kalavai_client/bridge_models.py,sha256=
|
14
|
+
kalavai_client/bridge_api.py,sha256=Hd7whTX2TAiNYX1G237hv2rqtKUBGRJkzUoWOMZm44A,25562
|
15
|
+
kalavai_client/bridge_models.py,sha256=3mHCqIHVysLLkQvGT-DKqKOrtAlQSfEOdrwSq2yTRRU,2439
|
16
16
|
kalavai_client/cli.py,sha256=SzKG7_ZG0ehMQsECQRWSvqj2Fju2Gd5O7uBa60bFBAY,47830
|
17
17
|
kalavai_client/cluster.py,sha256=Z2PIXbZuSAv9xmw-MyZP1M41BpVMpirLzG51bqGA-zc,13548
|
18
|
-
kalavai_client/core.py,sha256=
|
18
|
+
kalavai_client/core.py,sha256=weg54lc03gp2qGwEXl90XEnXGdwFFlaTqZjxyKsngj4,34765
|
19
19
|
kalavai_client/env.py,sha256=YsfZj7LWf6ABquDsoIFFkXCFYwenpDk8zVnGsf7qv98,2823
|
20
|
-
kalavai_client/utils.py,sha256=
|
21
|
-
kalavai_client-0.6.
|
22
|
-
kalavai_client-0.6.
|
23
|
-
kalavai_client-0.6.
|
24
|
-
kalavai_client-0.6.
|
25
|
-
kalavai_client-0.6.
|
20
|
+
kalavai_client/utils.py,sha256=GeX1rKUdlQoOW_K2relER8jRQEN1M0UdhsLKOkv5D_g,13428
|
21
|
+
kalavai_client-0.6.18.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
22
|
+
kalavai_client-0.6.18.dist-info/METADATA,sha256=nCvnC5f8QM1sHV4wk3HI9YjH0c_vkpOIkcVpMFIKEx0,12393
|
23
|
+
kalavai_client-0.6.18.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
24
|
+
kalavai_client-0.6.18.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
|
25
|
+
kalavai_client-0.6.18.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|