kalavai-client 0.6.17__py3-none-any.whl → 0.6.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
1
 
2
- __version__ = "0.6.17"
2
+ __version__ = "0.6.18"
@@ -113,7 +113,7 @@ releases:
113
113
  - name: kuberay
114
114
  namespace: kuberay
115
115
  chart: kuberay/kuberay-operator
116
- installed: true
116
+ installed: {{deploy_kuberay}}
117
117
  version: "1.2.2"
118
118
  - name: kuberay-apiserver
119
119
  namespace: kuberay
@@ -154,7 +154,7 @@ releases:
154
154
  - name: replicas
155
155
  value: 1
156
156
  - name: image_tag
157
- value: "v2025.06.15"
157
+ value: "v2025.07.31"
158
158
  - name: deployment.in_cluster
159
159
  value: "True"
160
160
  - name: deployment.kalavai_username_key
@@ -184,7 +184,7 @@ releases:
184
184
  - name: nvidia-gpu-operator
185
185
  namespace: kalavai
186
186
  chart: kalavai/gpu
187
- installed: true
187
+ installed: false
188
188
  - name: hami-vgpu
189
189
  namespace: kalavai
190
190
  chart: kalavai/hami
@@ -8,13 +8,17 @@
8
8
  description: "Deploy Opencost cost monitoring system"
9
9
 
10
10
  - name: deploy_prometheus
11
- default: "True"
11
+ default: "False"
12
12
  description: "Deploy Prometheus system monitoring system"
13
13
 
14
14
  - name: deploy_langfuse
15
15
  default: "False"
16
16
  description: "Deploy Langfuse LLM tracing system"
17
17
 
18
+ - name: deploy_kuberay
19
+ default: "False"
20
+ description: "Deploy KubeRay operator"
21
+
18
22
  ######
19
23
 
20
24
  ### VARIABLES ###
@@ -2,10 +2,13 @@
2
2
  Core kalavai service.
3
3
  Used as a bridge between the kalavai-client app and the reflex frontend
4
4
  """
5
- from fastapi import FastAPI, HTTPException, Depends
5
+ from fastapi import FastAPI, HTTPException, Depends, Query, Body
6
+ from typing import Optional, List
7
+ from fastapi_mcp import FastApiMCP
6
8
  from starlette.requests import Request
7
9
  import uvicorn
8
10
 
11
+ from kalavai_client.core import Job
9
12
  from kalavai_client.bridge_models import (
10
13
  CreatePoolRequest,
11
14
  InvitesRequest,
@@ -15,8 +18,7 @@ from kalavai_client.bridge_models import (
15
18
  DeleteJobRequest,
16
19
  JobDetailsRequest,
17
20
  NodesActionRequest,
18
- NodeLabelsRequest,
19
- GetNodeLabelsRequest
21
+ NodeLabelsRequest
20
22
  )
21
23
  from kalavai_client.core import (
22
24
  create_pool,
@@ -52,7 +54,10 @@ from kalavai_client.core import (
52
54
  get_node_labels,
53
55
  TokenType
54
56
  )
55
- from kalavai_client.utils import load_user_id
57
+ from kalavai_client.utils import (
58
+ load_user_id,
59
+ extract_auth_token
60
+ )
56
61
 
57
62
  app = FastAPI(
58
63
  title="Kalavai Bridge API",
@@ -73,14 +78,16 @@ async def verify_api_key(request: Request):
73
78
  user_id = load_user_id()
74
79
  if user_id is None:
75
80
  return None
76
- api_key = request.headers.get("X-API-KEY")
81
+ api_key = extract_auth_token(headers=request.headers)
77
82
  if api_key != user_id:
78
83
  raise HTTPException(status_code=401, detail="Request requires API Key")
79
84
  return api_key
80
85
 
81
86
  @app.post("/create_pool",
82
- summary="Create a new pool",
83
- description="Creates a new pool with the specified configuration",
87
+ operation_id="create_pool",
88
+ summary="Create a new Kalavai compute pool",
89
+ tags=["pool_management"],
90
+ description="Creates a new distributed compute pool that allows multiple nodes to join and share GPU resources. The pool acts as a Kubernetes cluster where users can deploy and manage machine learning jobs across multiple devices.",
84
91
  response_description="Result of pool creation")
85
92
  def pool_create(request: CreatePoolRequest, api_key: str = Depends(verify_api_key)):
86
93
  """
@@ -109,8 +116,10 @@ def pool_create(request: CreatePoolRequest, api_key: str = Depends(verify_api_ke
109
116
  return result
110
117
 
111
118
  @app.post("/join_pool",
112
- summary="Join an existing pool",
113
- description="Join a pool using a token",
119
+ operation_id="join_pool",
120
+ summary="Join an existing Kalavai pool as a compute node",
121
+ description="Joins a running Kalavai pool by providing a valid join token. This endpoint registers the current machine as a compute node in the pool, making its GPU resources available for job scheduling. The node will receive workloads based on the pool's scheduling policy.",
122
+ tags=["pool_management"],
114
123
  response_description="Result of joining the pool")
115
124
  def pool_join(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
116
125
  """
@@ -130,8 +139,10 @@ def pool_join(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
130
139
  return result
131
140
 
132
141
  @app.post("/attach_to_pool",
133
- summary="Attach to an existing pool",
134
- description="Attach to a pool using a token",
142
+ operation_id="attach_to_pool",
143
+ summary="Attach to a pool for management purposes",
144
+ description="Attaches to an existing Kalavai pool for administrative and monitoring purposes without contributing compute resources. This is typically used by frontend applications or management tools that need to interact with the pool but don't provide GPU resources.",
145
+ tags=["pool_management"],
135
146
  response_description="Result of attaching to the pool")
136
147
  def pool_attach(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
137
148
  """
@@ -149,8 +160,10 @@ def pool_attach(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)
149
160
  return result
150
161
 
151
162
  @app.post("/stop_pool",
152
- summary="Stop a pool",
153
- description="Stop the current pool",
163
+ operation_id="stop_pool",
164
+ summary="Stop and clean up the current Kalavai pool",
165
+ description="Gracefully shuts down the current Kalavai pool, terminating all running jobs and optionally removing all compute nodes from the cluster. This operation is irreversible and will disconnect all nodes from the pool.",
166
+ tags=["pool_management"],
154
167
  response_description="Result of stopping the pool")
155
168
  def pool_stop(request: StopPoolRequest, api_key: str = Depends(verify_api_key)):
156
169
  """
@@ -164,8 +177,10 @@ def pool_stop(request: StopPoolRequest, api_key: str = Depends(verify_api_key)):
164
177
  return result
165
178
 
166
179
  @app.post("/delete_nodes",
167
- summary="Delete nodes",
168
- description="Delete specified nodes from the pool",
180
+ operation_id="delete_nodes",
181
+ summary="Remove specific nodes from the pool",
182
+ description="Removes specified compute nodes from the Kalavai pool. This operation will terminate any jobs running on the target nodes and clean up their resources. Use with caution as it may interrupt running workloads.",
183
+ tags=["pool_management"],
169
184
  response_description="Result of node deletion")
170
185
  def device_delete(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
171
186
  """
@@ -179,8 +194,10 @@ def device_delete(request: NodesActionRequest, api_key: str = Depends(verify_api
179
194
  return result
180
195
 
181
196
  @app.post("/cordon_nodes",
182
- summary="Cordon nodes",
183
- description="Mark nodes as unschedulable",
197
+ operation_id="cordon_nodes",
198
+ summary="Mark nodes as unschedulable",
199
+ description="Marks specified nodes as unschedulable, preventing new jobs from being assigned to them while allowing existing jobs to complete. This is useful for maintenance operations or when you want to gradually remove nodes from the pool.",
200
+ tags=["pool_management"],
184
201
  response_description="Result of cordoning nodes")
185
202
  def device_cordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
186
203
  """
@@ -194,8 +211,10 @@ def device_cordon(request: NodesActionRequest, api_key: str = Depends(verify_api
194
211
  return result
195
212
 
196
213
  @app.post("/uncordon_nodes",
197
- summary="Uncordon nodes",
198
- description="Mark nodes as schedulable",
214
+ operation_id="uncordon_nodes",
215
+ summary="Mark nodes as schedulable again",
216
+ description="Re-enables job scheduling on previously cordoned nodes, allowing them to receive new workloads. This reverses the effect of the cordon operation.",
217
+ tags=["pool_management"],
199
218
  response_description="Result of uncordoning nodes")
200
219
  def device_uncordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
201
220
  """
@@ -209,8 +228,10 @@ def device_uncordon(request: NodesActionRequest, api_key: str = Depends(verify_a
209
228
  return result
210
229
 
211
230
  @app.get("/get_pool_token",
212
- summary="Get pool token",
213
- description="Get a token for the pool",
231
+ operation_id="get_pool_token",
232
+ summary="Generate a token for pool access",
233
+ description="Generates a secure token that can be used to join or attach to the current Kalavai pool. Different token types provide different levels of access - join tokens allow nodes to contribute resources, while attach tokens allow management access.",
234
+ tags=["auth"],
214
235
  response_description="Pool token")
215
236
  def get_token(mode: int, api_key: str = Depends(verify_api_key)):
216
237
  """
@@ -221,16 +242,20 @@ def get_token(mode: int, api_key: str = Depends(verify_api_key)):
221
242
  return get_pool_token(mode=TokenType(mode))
222
243
 
223
244
  @app.get("/fetch_devices",
224
- summary="Fetch devices",
225
- description="Get list of available devices",
245
+ operation_id="fetch_devices",
246
+ summary="Get list of all compute devices in the pool",
247
+ description="Retrieves information about all compute devices (nodes) currently connected to the Kalavai pool, including their status, available resources, and current workload distribution.",
248
+ tags=["info"],
226
249
  response_description="List of devices")
227
250
  def get_devices(api_key: str = Depends(verify_api_key)):
228
251
  """Get list of available devices"""
229
252
  return fetch_devices()
230
253
 
231
254
  @app.post("/send_pool_invites",
232
- summary="Send pool invites",
233
- description="Send invites to join the pool",
255
+ operation_id="send_pool_invites",
256
+ summary="Send invitations to join the pool",
257
+ description="Sends invitations to potential users or nodes to join the current Kalavai pool. Invitees will receive tokens that allow them to connect to the pool and contribute their resources.",
258
+ tags=["avoid"],
234
259
  response_description="Result of sending invites")
235
260
  def send_pool_invites(request: InvitesRequest, api_key: str = Depends(verify_api_key)):
236
261
  """
@@ -241,24 +266,30 @@ def send_pool_invites(request: InvitesRequest, api_key: str = Depends(verify_api
241
266
  return send_invites(invitees=request.invitees)
242
267
 
243
268
  @app.get("/fetch_resources",
244
- summary="Fetch resources",
245
- description="Get available resources",
269
+ operation_id="fetch_resources",
270
+ summary="Get resource utilization for specific nodes",
271
+ description="Retrieves detailed resource information (CPU, memory, GPU usage) for the pool; optionally for a list of specified nodes in the pool (as {'nodes': node_list}). This helps monitor resource utilization and plan workload distribution.",
272
+ tags=["info"],
246
273
  response_description="Resource information")
247
- def resources(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
274
+ def resources(request: Optional[NodesActionRequest]=NodesActionRequest(), api_key: str = Depends(verify_api_key)):
248
275
  """Get available resources"""
249
276
  return fetch_resources(node_names=request.nodes)
250
277
 
251
278
  @app.get("/fetch_job_names",
252
- summary="Fetch job names",
253
- description="Get list of job names",
279
+ operation_id="fetch_job_names",
280
+ summary="Get list of all jobs (model deployments) in the pool",
281
+ description="Retrieves the names of all jobs and models currently deployed or scheduled in the Kalavai pool. This provides an overview of all workloads in the system.",
282
+ tags=["info"],
254
283
  response_description="List of job names")
255
284
  def job_names(api_key: str = Depends(verify_api_key)):
256
285
  """Get list of job names"""
257
286
  return fetch_job_names()
258
287
 
259
288
  @app.get("/fetch_gpus",
260
- summary="Fetch GPUs",
261
- description="Get list of available GPUs",
289
+ operation_id="fetch_gpus",
290
+ summary="Get GPU information across the pool",
291
+ description="Retrieves detailed information about all GPUs in the Kalavai pool, including their availability status, current utilization, and which jobs are using them. Can filter to show only available GPUs.",
292
+ tags=["info"],
262
293
  response_description="List of GPUs")
263
294
  def gpus(available: bool = False, api_key: str = Depends(verify_api_key)):
264
295
  """
@@ -269,26 +300,26 @@ def gpus(available: bool = False, api_key: str = Depends(verify_api_key)):
269
300
  return fetch_gpus(available=available)
270
301
 
271
302
  @app.post("/fetch_job_details",
272
- summary="Fetch job details",
273
- description="Get details for specified jobs",
303
+ operation_id="fetch_job_details",
304
+ summary="Get detailed information about specific job and model deployments",
305
+ description="Given a list of jobs (as {'jobs': [{'name': job_name}]}), retrieves comprehensive information about specified jobs or models including their status, resource usage, runtime, and configuration. Useful for monitoring and debugging job execution.",
306
+ tags=["info"],
274
307
  response_description="Job details")
275
308
  def job_details(request: JobDetailsRequest, api_key: str = Depends(verify_api_key)):
276
- """
277
- Get job details with the following parameters:
278
-
279
- - **jobs**: List of jobs to get details for
280
- """
309
+ """Get job details"""
281
310
  return fetch_job_details(jobs=request.jobs)
282
311
 
283
312
  @app.get("/fetch_job_logs",
284
- summary="Fetch job logs",
285
- description="Get logs for a specific job",
313
+ operation_id="fetch_job_logs",
314
+ summary="Get execution logs for a specific job",
315
+ description="Retrieves the execution logs for a specified job, providing real-time or historical output from the job's containers. Useful for debugging, monitoring progress, and understanding job behavior.",
316
+ tags=["info", "avoid"],
286
317
  response_description="Job logs")
287
318
  def job_logs(
288
319
  job_name: str,
289
- force_namespace: str = None,
290
- pod_name: str = None,
291
- tail: int = 100,
320
+ force_namespace: str = Query(None),
321
+ pod_name: str = Query(None),
322
+ tail: int = Query(100),
292
323
  api_key: str = Depends(verify_api_key)
293
324
  ):
294
325
  """
@@ -307,28 +338,68 @@ def job_logs(
307
338
  )
308
339
 
309
340
  @app.get("/fetch_job_templates",
310
- summary="Fetch job templates",
311
- description="Get available job templates",
341
+ operation_id="fetch_job_templates",
342
+ summary="Get available job templates",
343
+ description="Retrieves a list of all available job templates that can be used to deploy workloads. Templates provide predefined configurations for frameworks.",
344
+ tags=["info"],
312
345
  response_description="List of job templates")
313
346
  def job_templates(api_key: str = Depends(verify_api_key)):
314
- """Get available job templates"""
315
347
  return fetch_job_templates()
316
348
 
349
+ @app.get("/fetch_model_templates",
350
+ operation_id="fetch_model_templates",
351
+ summary="Get available model engine templates",
352
+ description="Retrieves a list of all available model engine templates that can be used to deploy models. Templates provide predefined configurations for model engine frameworks.",
353
+ tags=["info"],
354
+ response_description="List of model engine templates")
355
+ def model_templates(api_key: str = Depends(verify_api_key)):
356
+ return fetch_job_templates(type="model")
357
+
317
358
  @app.get("/fetch_job_defaults",
318
- summary="Fetch job defaults",
319
- description="Get default values for a job template",
320
- response_description="Job metadata values")
321
- def job_templates(name: str, api_key: str = Depends(verify_api_key)):
322
- """
323
- Get job defaults with the following parameters:
324
-
325
- - **name**: Name of the job template
326
- """
327
- return fetch_job_defaults(name=name)
359
+ operation_id="fetch_job_defaults",
360
+ summary="Get default values for a job or model engine template deployment",
361
+ description="Retrieves the default values for a specific job or model engine template deployment. This helps users understand what parameters are required and what their default values are before deploying a job.",
362
+ tags=["info"],
363
+ response_description="Job and model engine default values")
364
+ def job_defaults(name: str, api_key: str = Depends(verify_api_key)):
365
+ result = fetch_job_defaults(name=name)
366
+ return result["defaults"]
367
+
368
+ @app.get("/fetch_job_metadata",
369
+ operation_id="fetch_job_metadata",
370
+ summary="Get metadata with information about a given job or model engine template deployment",
371
+ description="Retrieves the metadata associated with a specific job or model engine template deployment. This helps users understand what the template can be used for.",
372
+ tags=["info"],
373
+ response_description="Job and model engine metadata values")
374
+ def job_metadata(name: str, api_key: str = Depends(verify_api_key)):
375
+ result = fetch_job_defaults(name=name)
376
+ return result["metadata"]
377
+
378
+ @app.get("/fetch_job_rules",
379
+ operation_id="fetch_job_rules",
380
+ summary="Get the rules associated with the use of a given job or model engine template",
381
+ description="Retrieves the rules associated with a specific job or model engine template deployment. This helps users and AI agents determine if a given model engine template is adequate for the task.",
382
+ tags=["info"],
383
+ response_description="Job and model engine rules")
384
+ def job_rules(name: str, api_key: str = Depends(verify_api_key)):
385
+ result = job_metadata(name=name)
386
+ return result["template_rules"]
387
+
388
+ @app.get("/fetch_job_values_rules",
389
+ operation_id="fetch_job_values_rules",
390
+ summary="Get information on how to provide values to the parameters of a specific job or model engine template",
391
+ description="Retrieves information necessary to fill up the values required to deploy a specific job or model engine template. This helps users and AI agents generate the values dictionary for a job or model engine template deployment.",
392
+ tags=["info"],
393
+ response_description="Job and model engine info for values")
394
+ def job_values_rules(name: str, api_key: str = Depends(verify_api_key)):
395
+ result = job_metadata(name=name)
396
+ return result["values_rules"]
328
397
 
329
398
  @app.post("/deploy_job",
330
- summary="Deploy job",
331
- description="Deploy a new job",
399
+ operation_id="deploy_job",
400
+ summary="Deploy a new job to the pool",
401
+ description="Deploys a new job to the Kalavai pool using a specified template and configuration. The job will be scheduled on appropriate nodes based on resource availability and any specified target labels.",
402
+ tags=["job_management"],
332
403
  response_description="Result of job deployment")
333
404
  def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)):
334
405
  """
@@ -348,8 +419,10 @@ def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)
348
419
  return result
349
420
 
350
421
  @app.post("/delete_job",
351
- summary="Delete job",
352
- description="Delete a job",
422
+ operation_id="delete_job",
423
+ summary="Terminate and remove a job from the pool",
424
+ description="Terminates a running job and removes it from the Kalavai pool. This will stop all containers associated with the job and free up the resources they were using.",
425
+ tags=["job_management"],
353
426
  response_description="Result of job deletion")
354
427
  def job_delete(request: DeleteJobRequest, api_key: str = Depends(verify_api_key)):
355
428
  """
@@ -365,8 +438,10 @@ def job_delete(request: DeleteJobRequest, api_key: str = Depends(verify_api_key)
365
438
  return result
366
439
 
367
440
  @app.get("/authenticate_user",
368
- summary="Authenticate user",
369
- description="Authenticate a user",
441
+ operation_id="authenticate_user",
442
+ summary="Authenticate a user with the Kalavai system",
443
+ description="Authenticates a user against the Kalavai system, establishing their identity and permissions. This is required for accessing pool management features and deploying jobs.",
444
+ tags=["info", "auth"],
370
445
  response_description="Authentication result")
371
446
  def user_authenticate(user_id: str, api_key: str = Depends(verify_api_key)):
372
447
  """
@@ -380,8 +455,10 @@ def user_authenticate(user_id: str, api_key: str = Depends(verify_api_key)):
380
455
  return result
381
456
 
382
457
  @app.get("/load_user_session",
383
- summary="Load user session",
384
- description="Load the current user session",
458
+ operation_id="load_user_session",
459
+ summary="Load current user session information",
460
+ description="Retrieves information about the currently authenticated user's session, including their identity, permissions, and any active connections to pools.",
461
+ tags=["info", "auth"],
385
462
  response_description="User session information")
386
463
  def user_session(api_key: str = Depends(verify_api_key)):
387
464
  """Load the current user session"""
@@ -389,8 +466,10 @@ def user_session(api_key: str = Depends(verify_api_key)):
389
466
  return result
390
467
 
391
468
  @app.get("/user_logout",
392
- summary="User logout",
393
- description="Log out the current user",
469
+ operation_id="user_logout",
470
+ summary="Log out the current user",
471
+ description="Terminates the current user's session and clears authentication credentials. This should be called when the user is done using the system to ensure proper cleanup.",
472
+ tags=["auth"],
394
473
  response_description="Logout result")
395
474
  def logout_user():
396
475
  """Log out the current user"""
@@ -398,8 +477,10 @@ def logout_user():
398
477
  return result
399
478
 
400
479
  @app.get("/is_connected",
401
- summary="Check connection",
402
- description="Check if connected to a pool",
480
+ operation_id="is_connected",
481
+ summary="Check if connected to a Kalavai pool",
482
+ description="Verifies whether the current instance is connected to a Kalavai pool. Returns connection status and pool information if connected.",
483
+ tags=["agent_management"],
403
484
  response_description="Connection status")
404
485
  def pool_connected():
405
486
  """Check if connected to a pool"""
@@ -407,8 +488,10 @@ def pool_connected():
407
488
  return result
408
489
 
409
490
  @app.get("/is_agent_running",
410
- summary="Check agent status",
411
- description="Check if the agent is running",
491
+ operation_id="is_agent_running",
492
+ summary="Check if the Kalavai agent is running",
493
+ description="Verifies whether the Kalavai agent service is currently running on this machine. The agent is responsible for managing pool connections and job execution.",
494
+ tags=["agent_management"],
412
495
  response_description="Agent status")
413
496
  def agent_running():
414
497
  """Check if the agent is running"""
@@ -416,8 +499,10 @@ def agent_running():
416
499
  return result
417
500
 
418
501
  @app.get("/is_server",
419
- summary="Check server status",
420
- description="Check if running as server",
502
+ operation_id="is_server",
503
+ summary="Check if running as a pool server",
504
+ description="Determines whether this instance is running as a Kalavai pool server (coordinator) or as a client node. Server instances manage the pool while client instances contribute resources.",
505
+ tags=["agent_management"],
421
506
  response_description="Server status")
422
507
  def server():
423
508
  """Check if running as server"""
@@ -425,8 +510,10 @@ def server():
425
510
  return result
426
511
 
427
512
  @app.post("/pause_agent",
428
- summary="Pause agent",
429
- description="Pause the agent",
513
+ operation_id="pause_agent",
514
+ summary="Pause the Kalavai agent service",
515
+ description="Temporarily pauses the Kalavai agent, stopping it from accepting new jobs or participating in pool operations. Existing jobs will continue running until completion.",
516
+ tags=["agent_management"],
430
517
  response_description="Result of pausing agent")
431
518
  def agent_pause():
432
519
  """Pause the agent"""
@@ -434,8 +521,10 @@ def agent_pause():
434
521
  return result
435
522
 
436
523
  @app.post("/resume_agent",
437
- summary="Resume agent",
438
- description="Resume the agent",
524
+ operation_id="resume_agent",
525
+ summary="Resume the Kalavai agent service",
526
+ description="Resumes the previously paused Kalavai agent, allowing it to accept new jobs and participate in pool operations again.",
527
+ tags=["agent_management"],
439
528
  response_description="Result of resuming agent")
440
529
  def agent_resume():
441
530
  """Resume the agent"""
@@ -443,8 +532,10 @@ def agent_resume():
443
532
  return result
444
533
 
445
534
  @app.get("/get_ip_addresses",
446
- summary="Get IP addresses",
447
- description="Get available IP addresses",
535
+ operation_id="get_ip_addresses",
536
+ summary="Get available IP addresses for pool configuration",
537
+ description="Retrieves a list of available IP addresses that can be used for pool configuration. Optionally filters by subnet to help with network planning and pool setup.",
538
+ tags=["agent_management"],
448
539
  response_description="List of IP addresses")
449
540
  def ip_addresses(subnet: str = None, api_key: str = Depends(verify_api_key)):
450
541
  """
@@ -456,8 +547,10 @@ def ip_addresses(subnet: str = None, api_key: str = Depends(verify_api_key)):
456
547
  return result
457
548
 
458
549
  @app.get("/list_available_pools",
459
- summary="List available pools",
460
- description="Get list of available pools",
550
+ operation_id="list_available_pools",
551
+ summary="List all available Kalavai pools",
552
+ description="Retrieves a list of all Kalavai pools that are currently available for connection. Can filter to show only pools owned by the current user or all public pools.",
553
+ tags=["agent_management"],
461
554
  response_description="List of available pools")
462
555
  def pool_connected(user_only: bool = False, api_key: str = Depends(verify_api_key)):
463
556
  """
@@ -469,8 +562,10 @@ def pool_connected(user_only: bool = False, api_key: str = Depends(verify_api_ke
469
562
  return result
470
563
 
471
564
  @app.post("/add_node_labels",
472
- summary="Add node labels",
473
- description="Add labels to a node",
565
+ operation_id="add_node_labels",
566
+ summary="Add custom labels to a compute node",
567
+ description="Adds custom labels to a specific compute node in the pool. Labels can be used for job scheduling, resource allocation, and organizational purposes. Labels are key-value pairs that help categorize and identify nodes.",
568
+ tags=["pool_management"],
474
569
  response_description="Result of adding labels")
475
570
  def node_labels(request: NodeLabelsRequest, api_key: str = Depends(verify_api_key)):
476
571
  """
@@ -485,21 +580,40 @@ def node_labels(request: NodeLabelsRequest, api_key: str = Depends(verify_api_ke
485
580
  )
486
581
  return result
487
582
 
488
- @app.post("/get_node_labels",
489
- summary="Get node labels",
490
- description="Get labels for specified nodes",
583
+ @app.get("/get_node_labels",
584
+ operation_id="get_node_labels",
585
+ summary="Get labels for specified compute nodes",
586
+ description="Retrieves all labels associated with specified compute nodes in the pool. Labels provide metadata about nodes and can be used for filtering and scheduling decisions.",
587
+ tags=["info"],
491
588
  response_description="Node labels")
492
- def node_labels_get(request: GetNodeLabelsRequest, api_key: str = Depends(verify_api_key)):
589
+ def node_labels_get(request: Optional[NodesActionRequest]=NodesActionRequest(), api_key: str = Depends(verify_api_key)):
493
590
  """
494
591
  Get node labels with the following parameters:
495
592
 
496
593
  - **node_names**: List of node names to get labels for
497
594
  """
498
595
  result = get_node_labels(
499
- node_names=request.node_names
596
+ node_names=request.nodes
500
597
  )
501
598
  return result
502
599
 
600
+ ### BUILD MCP WRAPPER ###
601
+ mcp = FastApiMCP(
602
+ app,
603
+ name="Protected MCP",
604
+ #exclude_operations=[],
605
+ exclude_tags=[
606
+ "auth",
607
+ "agent_management",
608
+ "job_management",
609
+ "pool_management",
610
+ "avoid"
611
+ ]
612
+ )
613
+ mcp.mount()
614
+ ##########################
615
+
616
+
503
617
  def run_api(host="0.0.0.0", port=8001, log_level="critical"):
504
618
  uvicorn.run(
505
619
  app,
@@ -18,7 +18,7 @@ class CreatePoolRequest(BaseModel):
18
18
  description: str = Field("", description="Description of the pool")
19
19
 
20
20
  class NodesActionRequest(BaseModel):
21
- nodes: list[str] = Field(None, description="List of node names to perform the action on")
21
+ nodes: list[str] = Field(None, description="List of node names to perform the action on, defaults to None")
22
22
 
23
23
  class JoinPoolRequest(BaseModel):
24
24
  token: str = Field(description="Token to join the pool")
@@ -26,10 +26,10 @@ class JoinPoolRequest(BaseModel):
26
26
  node_name: str = Field(None, description="Name of the node")
27
27
  num_gpus: int = Field(None, description="Number of GPUs to allocate")
28
28
  frontend: bool = Field(False, description="Whether this is a frontend request")
29
+
29
30
  class JobDetailsRequest(BaseModel):
30
31
  jobs: list[Job] = Field(description="List of jobs to get details for")
31
32
 
32
-
33
33
  class StopPoolRequest(BaseModel):
34
34
  skip_node_deletion: bool = Field(False, description="Whether to skip node deletion when stopping the pool")
35
35
 
@@ -46,6 +46,3 @@ class DeleteJobRequest(BaseModel):
46
46
  class NodeLabelsRequest(BaseModel):
47
47
  node_name: str = Field(description="Name of the node to add labels to")
48
48
  labels: Dict[str, str] = Field(description="Dictionary of labels to add to the node")
49
-
50
- class GetNodeLabelsRequest(BaseModel):
51
- node_names: List[str] = Field(description="List of node names to get labels for")
kalavai_client/core.py CHANGED
@@ -74,7 +74,7 @@ from kalavai_client.env import (
74
74
  )
75
75
 
76
76
  class Job(BaseModel):
77
- owner: Optional[str] = None
77
+ owner: Optional[str] = "default"
78
78
  name: Optional[str] = None
79
79
  workers: Optional[str] = None
80
80
  endpoint: Optional[str] = None
@@ -238,13 +238,17 @@ def fetch_job_defaults(name):
238
238
  except Exception as e:
239
239
  return {"error": str(e)}
240
240
 
241
- def fetch_job_templates():
241
+ def fetch_job_templates(type: str=None):
242
+ data = None
243
+ if type is not None:
244
+ data = {"type": type}
242
245
  try:
243
246
  templates = request_to_server(
244
247
  method="get",
245
248
  endpoint="/v1/get_job_templates",
246
249
  server_creds=USER_LOCAL_SERVER_FILE,
247
250
  data=None,
251
+ params=data,
248
252
  user_cookie=USER_COOKIE
249
253
  )
250
254
  return templates
kalavai_client/utils.py CHANGED
@@ -105,6 +105,29 @@ def is_storage_compatible():
105
105
  return False
106
106
  ################
107
107
 
108
+ def extract_auth_token(headers):
109
+ """
110
+ Extract auth token. Valid headers:
111
+ X-API-KEY: token
112
+ X-API-Key: token
113
+ Authorization: Bearer token
114
+ authorization: Bearer token
115
+ """
116
+ #return headers.get("X-API-Key")
117
+ bearer = None
118
+ try:
119
+ for header in ["Authorization", "authorization", "X-API-KEY", "X-API-Key"]:
120
+ bearer = headers.get(header, None)
121
+ if bearer is not None:
122
+ break
123
+ if bearer is not None and " " in bearer:
124
+ return bearer.split()[-1]
125
+ else:
126
+ return bearer
127
+ except Exception as e:
128
+ return {"error": str(e)}
129
+
130
+
108
131
  def generate_compose_config(role, node_name, target_platform="amd64", write_to_file=True, node_ip_address="0.0.0.0", num_gpus=0, node_labels=None, pool_ip=None, vpn_token=None, pool_token=None):
109
132
 
110
133
  if node_labels is not None:
@@ -268,6 +291,7 @@ def request_to_server(
268
291
  endpoint,
269
292
  data,
270
293
  server_creds,
294
+ params=None,
271
295
  force_url=None,
272
296
  force_key=None,
273
297
  user_cookie=None,
@@ -296,6 +320,7 @@ def request_to_server(
296
320
  method=method,
297
321
  url=f"http://{service_url}{endpoint}",
298
322
  json=data,
323
+ params=params,
299
324
  headers=headers,
300
325
  timeout=timeout
301
326
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: kalavai-client
3
- Version: 0.6.17
3
+ Version: 0.6.18
4
4
  Summary: Client app for kalavai platform
5
5
  License: Apache-2.0
6
6
  Keywords: LLM,platform
@@ -8,15 +8,9 @@ Author: Carlos Fernandez Musoles
8
8
  Author-email: carlos@kalavai.net
9
9
  Maintainer: Carlos Fernandez Musoles
10
10
  Maintainer-email: carlos@kalavai.net
11
- Requires-Python: >=3.4
11
+ Requires-Python: >=3.10
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.4
15
- Classifier: Programming Language :: Python :: 3.5
16
- Classifier: Programming Language :: Python :: 3.6
17
- Classifier: Programming Language :: Python :: 3.7
18
- Classifier: Programming Language :: Python :: 3.8
19
- Classifier: Programming Language :: Python :: 3.9
20
14
  Classifier: Programming Language :: Python :: 3.10
21
15
  Classifier: Programming Language :: Python :: 3.11
22
16
  Classifier: Programming Language :: Python :: 3.12
@@ -26,6 +20,7 @@ Requires-Dist: Pillow (==10.3.0)
26
20
  Requires-Dist: arguably (>=1.2.5)
27
21
  Requires-Dist: build ; extra == "dev"
28
22
  Requires-Dist: fastapi (==0.115.8)
23
+ Requires-Dist: fastapi-mcp (==0.3.0)
29
24
  Requires-Dist: importlib_resources (==6.5.2)
30
25
  Requires-Dist: jinja2 (==3.1.4)
31
26
  Requires-Dist: netifaces (==0.11.0)
@@ -1,8 +1,8 @@
1
- kalavai_client/__init__.py,sha256=6D1LEuHzwFI-n2PmIawhX9C0K4e_BwIIJUAUnpkzCWE,23
1
+ kalavai_client/__init__.py,sha256=ErdtY8HWYl_n6MmTR8hlH878NUP9glayQd1egl02vKY,23
2
2
  kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
3
3
  kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- kalavai_client/assets/apps.yaml,sha256=zVtfPqesNhoBLpNlhIPAVtxgXLqEQU2pK1GTzKGEqiQ,6395
5
- kalavai_client/assets/apps_values.yaml,sha256=dvsAnMC1uk9oDsnITIYJc5CIg9LTwGzXldjPZTwRQyE,2069
4
+ kalavai_client/assets/apps.yaml,sha256=17JuXSv-Qj5Az6ZTRyiEaQXVbI325uTrZzKk2irts2g,6410
5
+ kalavai_client/assets/apps_values.yaml,sha256=LeSNd3PwkIx0wkTIlEk2KNz3Yy4sXSaHALQEkopdhKE,2165
6
6
  kalavai_client/assets/docker-compose-gui.yaml,sha256=shqN78YLw0QP7bqTKveI4ppz5E-5b1JowmsSB4OG3nA,778
7
7
  kalavai_client/assets/docker-compose-template.yaml,sha256=KHIwJ2WWX7Y7wQKiXRr82Jqd3IKRyls5zhTyl8mSmrc,1805
8
8
  kalavai_client/assets/nginx.conf,sha256=drVVCg8GHucz7hmt_BI6giAhK92OV71257NTs3LthwM,225
@@ -11,15 +11,15 @@ kalavai_client/assets/pool_config_values.yaml,sha256=_iAnugramLiwJaaDcPSetThvOdR
11
11
  kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
12
12
  kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
13
13
  kalavai_client/auth.py,sha256=EB3PMvKUn5_KAQkezkEHEt-OMZXyfkZguIQlUFkEHcA,3243
14
- kalavai_client/bridge_api.py,sha256=5tYqI8UdG7K1Qskywk97kC0TpvYruUZxqxvbn-2nve4,15405
15
- kalavai_client/bridge_models.py,sha256=t1fJGaF6YDMQdOnEU3XT8zTBHU8eUWJ1yhM5s7c6vMo,2546
14
+ kalavai_client/bridge_api.py,sha256=Hd7whTX2TAiNYX1G237hv2rqtKUBGRJkzUoWOMZm44A,25562
15
+ kalavai_client/bridge_models.py,sha256=3mHCqIHVysLLkQvGT-DKqKOrtAlQSfEOdrwSq2yTRRU,2439
16
16
  kalavai_client/cli.py,sha256=SzKG7_ZG0ehMQsECQRWSvqj2Fju2Gd5O7uBa60bFBAY,47830
17
17
  kalavai_client/cluster.py,sha256=Z2PIXbZuSAv9xmw-MyZP1M41BpVMpirLzG51bqGA-zc,13548
18
- kalavai_client/core.py,sha256=dJVX5mhFzIshazCfAzb-AqpqWjkp_djgbMyNXzuAF48,34650
18
+ kalavai_client/core.py,sha256=weg54lc03gp2qGwEXl90XEnXGdwFFlaTqZjxyKsngj4,34765
19
19
  kalavai_client/env.py,sha256=YsfZj7LWf6ABquDsoIFFkXCFYwenpDk8zVnGsf7qv98,2823
20
- kalavai_client/utils.py,sha256=bhvQzF12q7L2hGVrbcmXRDXXIsAdlzcsTms6RQRxGU4,12733
21
- kalavai_client-0.6.17.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
22
- kalavai_client-0.6.17.dist-info/METADATA,sha256=J5TEqnwPm3ZopGEi2MRn_ddy_VV101JcB8sl6MGL4iI,12655
23
- kalavai_client-0.6.17.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
24
- kalavai_client-0.6.17.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
25
- kalavai_client-0.6.17.dist-info/RECORD,,
20
+ kalavai_client/utils.py,sha256=GeX1rKUdlQoOW_K2relER8jRQEN1M0UdhsLKOkv5D_g,13428
21
+ kalavai_client-0.6.18.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
22
+ kalavai_client-0.6.18.dist-info/METADATA,sha256=nCvnC5f8QM1sHV4wk3HI9YjH0c_vkpOIkcVpMFIKEx0,12393
23
+ kalavai_client-0.6.18.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
24
+ kalavai_client-0.6.18.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
25
+ kalavai_client-0.6.18.dist-info/RECORD,,