kalavai-client 0.6.12__py3-none-any.whl → 0.6.14__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
1
 
2
- __version__ = "0.6.12"
2
+ __version__ = "0.6.14"
@@ -152,7 +152,7 @@ releases:
152
152
  - name: replicas
153
153
  value: 1
154
154
  - name: image_tag
155
- value: "v2025.05.2"
155
+ value: "v2025.06.6"
156
156
  - name: deployment.in_cluster
157
157
  value: "True"
158
158
  - name: deployment.kalavai_username_key
@@ -18,6 +18,7 @@ services:
18
18
  {%if command %}
19
19
  {{service_name}}:
20
20
  image: docker.io/bundenth/kalavai-runner:gpu-latest
21
+ pull_policy: always
21
22
  container_name: {{service_name}}
22
23
  {% if vpn %}
23
24
  depends_on:
@@ -35,6 +36,9 @@ services:
35
36
  {% endif %}
36
37
  --node_name="{{node_name}}"
37
38
  --node_ip="{{node_ip_address}}"
39
+ {% if random_suffix %}
40
+ --random_suffix="{{random_suffix}}"
41
+ {% endif %}
38
42
  {% if command == "server" %}
39
43
  --port_range="30000-32767"
40
44
  {% else %}
@@ -54,12 +54,22 @@ from kalavai_client.core import (
54
54
  )
55
55
  from kalavai_client.utils import load_user_id
56
56
 
57
- app = FastAPI()
57
+ app = FastAPI(
58
+ title="Kalavai Bridge API",
59
+ description="API for managing Kalavai pools, jobs, and nodes",
60
+ version="1.0.0",
61
+ docs_url="/docs",
62
+ redoc_url="/redoc",
63
+ )
58
64
 
59
65
  ################################
60
66
  ## API Key Validation methods ##
61
67
  ################################
62
68
  async def verify_api_key(request: Request):
69
+ """
70
+ Verify the API key from the request headers.
71
+ The API key must match the user ID.
72
+ """
63
73
  user_id = load_user_id()
64
74
  if user_id is None:
65
75
  return None
@@ -68,8 +78,25 @@ async def verify_api_key(request: Request):
68
78
  raise HTTPException(status_code=401, detail="Request requires API Key")
69
79
  return api_key
70
80
 
71
- @app.post("/create_pool")
81
+ @app.post("/create_pool",
82
+ summary="Create a new pool",
83
+ description="Creates a new pool with the specified configuration",
84
+ response_description="Result of pool creation")
72
85
  def pool_create(request: CreatePoolRequest, api_key: str = Depends(verify_api_key)):
86
+ """
87
+ Create a new pool with the following parameters:
88
+
89
+ - **cluster_name**: Name of the cluster
90
+ - **ip_address**: IP address for the pool
91
+ - **app_values**: Application configuration values
92
+ - **num_gpus**: Number of GPUs to allocate
93
+ - **node_name**: Name of the node
94
+ - **only_registered_users**: Whether to restrict to registered users
95
+ - **location**: Location of the pool
96
+ - **description**: Pool description
97
+ - **token_mode**: Token type for authentication
98
+ - **frontend**: Whether this is a frontend request
99
+ """
73
100
  result = create_pool(
74
101
  cluster_name=request.cluster_name,
75
102
  ip_address=request.ip_address,
@@ -84,8 +111,20 @@ def pool_create(request: CreatePoolRequest, api_key: str = Depends(verify_api_ke
84
111
  )
85
112
  return result
86
113
 
87
- @app.post("/join_pool")
114
+ @app.post("/join_pool",
115
+ summary="Join an existing pool",
116
+ description="Join a pool using a token",
117
+ response_description="Result of joining the pool")
88
118
  def pool_join(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
119
+ """
120
+ Join a pool with the following parameters:
121
+
122
+ - **token**: Pool join token
123
+ - **ip_address**: IP address for the node
124
+ - **node_name**: Name of the node
125
+ - **num_gpus**: Number of GPUs to allocate
126
+ - **frontend**: Whether this is a frontend request
127
+ """
89
128
  result = join_pool(
90
129
  token=request.token,
91
130
  num_gpus=request.num_gpus,
@@ -95,8 +134,18 @@ def pool_join(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
95
134
  )
96
135
  return result
97
136
 
98
- @app.post("/attach_to_pool")
137
+ @app.post("/attach_to_pool",
138
+ summary="Attach to an existing pool",
139
+ description="Attach to a pool using a token",
140
+ response_description="Result of attaching to the pool")
99
141
  def pool_attach(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)):
142
+ """
143
+ Attach to a pool with the following parameters:
144
+
145
+ - **token**: Pool token
146
+ - **node_name**: Name of the node
147
+ - **frontend**: Whether this is a frontend request
148
+ """
100
149
  result = attach_to_pool(
101
150
  token=request.token,
102
151
  node_name=request.node_name,
@@ -104,65 +153,157 @@ def pool_attach(request: JoinPoolRequest, api_key: str = Depends(verify_api_key)
104
153
  )
105
154
  return result
106
155
 
107
- @app.post("/stop_pool")
156
+ @app.post("/stop_pool",
157
+ summary="Stop a pool",
158
+ description="Stop the current pool",
159
+ response_description="Result of stopping the pool")
108
160
  def pool_stop(request: StopPoolRequest, api_key: str = Depends(verify_api_key)):
161
+ """
162
+ Stop the pool with the following parameters:
163
+
164
+ - **skip_node_deletion**: Whether to skip node deletion
165
+ """
109
166
  result = stop_pool(
110
167
  skip_node_deletion=request.skip_node_deletion
111
168
  )
112
169
  return result
113
170
 
114
- @app.post("/delete_nodes")
171
+ @app.post("/delete_nodes",
172
+ summary="Delete nodes",
173
+ description="Delete specified nodes from the pool",
174
+ response_description="Result of node deletion")
115
175
  def device_delete(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
176
+ """
177
+ Delete nodes with the following parameters:
178
+
179
+ - **nodes**: List of node names to delete
180
+ """
116
181
  result = delete_nodes(
117
182
  nodes=request.nodes
118
183
  )
119
184
  return result
120
185
 
121
- @app.post("/cordon_nodes")
186
+ @app.post("/cordon_nodes",
187
+ summary="Cordon nodes",
188
+ description="Mark nodes as unschedulable",
189
+ response_description="Result of cordoning nodes")
122
190
  def device_cordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
191
+ """
192
+ Cordon nodes with the following parameters:
193
+
194
+ - **nodes**: List of node names to cordon
195
+ """
123
196
  result = cordon_nodes(
124
197
  nodes=request.nodes
125
198
  )
126
199
  return result
127
200
 
128
- @app.post("/uncordon_nodes")
201
+ @app.post("/uncordon_nodes",
202
+ summary="Uncordon nodes",
203
+ description="Mark nodes as schedulable",
204
+ response_description="Result of uncordoning nodes")
129
205
  def device_uncordon(request: NodesActionRequest, api_key: str = Depends(verify_api_key)):
206
+ """
207
+ Uncordon nodes with the following parameters:
208
+
209
+ - **nodes**: List of node names to uncordon
210
+ """
130
211
  result = uncordon_nodes(
131
212
  nodes=request.nodes
132
213
  )
133
214
  return result
134
215
 
135
- @app.get("/get_pool_token")
216
+ @app.get("/get_pool_token",
217
+ summary="Get pool token",
218
+ description="Get a token for the pool",
219
+ response_description="Pool token")
136
220
  def get_token(mode: int, api_key: str = Depends(verify_api_key)):
137
-
221
+ """
222
+ Get pool token with the following parameters:
223
+
224
+ - **mode**: Token type mode
225
+ """
138
226
  return get_pool_token(mode=TokenType(mode))
139
227
 
140
- @app.get("/fetch_devices")
228
+ @app.get("/fetch_devices",
229
+ summary="Fetch devices",
230
+ description="Get list of available devices",
231
+ response_description="List of devices")
141
232
  def get_devices(api_key: str = Depends(verify_api_key)):
233
+ """Get list of available devices"""
142
234
  return fetch_devices()
143
235
 
144
- @app.post("/send_pool_invites")
236
+ @app.post("/send_pool_invites",
237
+ summary="Send pool invites",
238
+ description="Send invites to join the pool",
239
+ response_description="Result of sending invites")
145
240
  def send_pool_invites(request: InvitesRequest, api_key: str = Depends(verify_api_key)):
241
+ """
242
+ Send pool invites with the following parameters:
243
+
244
+ - **invitees**: List of invitee identifiers
245
+ """
146
246
  return send_invites(invitees=request.invitees)
147
247
 
148
- @app.get("/fetch_resources")
248
+ @app.get("/fetch_resources",
249
+ summary="Fetch resources",
250
+ description="Get available resources",
251
+ response_description="Resource information")
149
252
  def resources(api_key: str = Depends(verify_api_key)):
253
+ """Get available resources"""
150
254
  return fetch_resources()
151
255
 
152
- @app.get("/fetch_job_names")
256
+ @app.get("/fetch_job_names",
257
+ summary="Fetch job names",
258
+ description="Get list of job names",
259
+ response_description="List of job names")
153
260
  def job_names(api_key: str = Depends(verify_api_key)):
261
+ """Get list of job names"""
154
262
  return fetch_job_names()
155
263
 
156
- @app.get("/fetch_gpus")
264
+ @app.get("/fetch_gpus",
265
+ summary="Fetch GPUs",
266
+ description="Get list of available GPUs",
267
+ response_description="List of GPUs")
157
268
  def gpus(available: bool = False, api_key: str = Depends(verify_api_key)):
269
+ """
270
+ Get list of GPUs with the following parameters:
271
+
272
+ - **available**: Whether to show only available GPUs
273
+ """
158
274
  return fetch_gpus(available=available)
159
275
 
160
- @app.post("/fetch_job_details")
276
+ @app.post("/fetch_job_details",
277
+ summary="Fetch job details",
278
+ description="Get details for specified jobs",
279
+ response_description="Job details")
161
280
  def job_details(request: JobDetailsRequest, api_key: str = Depends(verify_api_key)):
281
+ """
282
+ Get job details with the following parameters:
283
+
284
+ - **jobs**: List of jobs to get details for
285
+ """
162
286
  return fetch_job_details(jobs=request.jobs)
163
287
 
164
- @app.get("/fetch_job_logs")
165
- def job_logs(job_name: str, force_namespace: str=None, pod_name: str=None, tail: int=100, api_key: str = Depends(verify_api_key)):
288
+ @app.get("/fetch_job_logs",
289
+ summary="Fetch job logs",
290
+ description="Get logs for a specific job",
291
+ response_description="Job logs")
292
+ def job_logs(
293
+ job_name: str,
294
+ force_namespace: str = None,
295
+ pod_name: str = None,
296
+ tail: int = 100,
297
+ api_key: str = Depends(verify_api_key)
298
+ ):
299
+ """
300
+ Get job logs with the following parameters:
301
+
302
+ - **job_name**: Name of the job
303
+ - **force_namespace**: Optional namespace override
304
+ - **pod_name**: Optional pod name
305
+ - **tail**: Number of log lines to return
306
+ """
166
307
  return fetch_job_logs(
167
308
  job_name=job_name,
168
309
  force_namespace=force_namespace,
@@ -170,16 +311,39 @@ def job_logs(job_name: str, force_namespace: str=None, pod_name: str=None, tail:
170
311
  tail=tail
171
312
  )
172
313
 
173
- @app.get("/fetch_job_templates")
314
+ @app.get("/fetch_job_templates",
315
+ summary="Fetch job templates",
316
+ description="Get available job templates",
317
+ response_description="List of job templates")
174
318
  def job_templates(api_key: str = Depends(verify_api_key)):
319
+ """Get available job templates"""
175
320
  return fetch_job_templates()
176
321
 
177
- @app.get("/fetch_job_defaults")
322
+ @app.get("/fetch_job_defaults",
323
+ summary="Fetch job defaults",
324
+ description="Get default values for a job template",
325
+ response_description="Job default values")
178
326
  def job_templates(name: str, api_key: str = Depends(verify_api_key)):
327
+ """
328
+ Get job defaults with the following parameters:
329
+
330
+ - **name**: Name of the job template
331
+ """
179
332
  return fetch_job_defaults(name=name)
180
333
 
181
- @app.post("/deploy_job")
334
+ @app.post("/deploy_job",
335
+ summary="Deploy job",
336
+ description="Deploy a new job",
337
+ response_description="Result of job deployment")
182
338
  def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)):
339
+ """
340
+ Deploy a job with the following parameters:
341
+
342
+ - **template_name**: Name of the job template
343
+ - **values**: Job configuration values
344
+ - **force_namespace**: Optional namespace override
345
+ - **target_labels**: Optional target node labels
346
+ """
183
347
  result = deploy_job(
184
348
  template_name=request.template_name,
185
349
  values_dict=request.values,
@@ -188,76 +352,154 @@ def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)
188
352
  )
189
353
  return result
190
354
 
191
- @app.post("/delete_job")
355
+ @app.post("/delete_job",
356
+ summary="Delete job",
357
+ description="Delete a job",
358
+ response_description="Result of job deletion")
192
359
  def job_delete(request: DeleteJobRequest, api_key: str = Depends(verify_api_key)):
360
+ """
361
+ Delete a job with the following parameters:
362
+
363
+ - **name**: Name of the job to delete
364
+ - **force_namespace**: Optional namespace override
365
+ """
193
366
  result = delete_job(
194
367
  name=request.name,
195
368
  force_namespace=request.force_namespace
196
369
  )
197
370
  return result
198
371
 
199
- @app.get("/authenticate_user")
372
+ @app.get("/authenticate_user",
373
+ summary="Authenticate user",
374
+ description="Authenticate a user",
375
+ response_description="Authentication result")
200
376
  def user_authenticate(user_id: str, api_key: str = Depends(verify_api_key)):
377
+ """
378
+ Authenticate user with the following parameters:
379
+
380
+ - **user_id**: User identifier
381
+ """
201
382
  result = authenticate_user(
202
383
  user_id=user_id
203
384
  )
204
385
  return result
205
386
 
206
- @app.get("/load_user_session")
387
+ @app.get("/load_user_session",
388
+ summary="Load user session",
389
+ description="Load the current user session",
390
+ response_description="User session information")
207
391
  def user_session(api_key: str = Depends(verify_api_key)):
392
+ """Load the current user session"""
208
393
  result = load_user_session()
209
394
  return result
210
395
 
211
- @app.get("/user_logout")
396
+ @app.get("/user_logout",
397
+ summary="User logout",
398
+ description="Log out the current user",
399
+ response_description="Logout result")
212
400
  def logout_user():
401
+ """Log out the current user"""
213
402
  result = user_logout()
214
403
  return result
215
404
 
216
- @app.get("/is_connected")
405
+ @app.get("/is_connected",
406
+ summary="Check connection",
407
+ description="Check if connected to a pool",
408
+ response_description="Connection status")
217
409
  def pool_connected():
410
+ """Check if connected to a pool"""
218
411
  result = is_connected()
219
412
  return result
220
413
 
221
- @app.get("/is_agent_running")
414
+ @app.get("/is_agent_running",
415
+ summary="Check agent status",
416
+ description="Check if the agent is running",
417
+ response_description="Agent status")
222
418
  def agent_running():
419
+ """Check if the agent is running"""
223
420
  result = is_agent_running()
224
421
  return result
225
422
 
226
- @app.get("/is_server")
423
+ @app.get("/is_server",
424
+ summary="Check server status",
425
+ description="Check if running as server",
426
+ response_description="Server status")
227
427
  def server():
428
+ """Check if running as server"""
228
429
  result = is_server()
229
430
  return result
230
431
 
231
- @app.post("/pause_agent")
432
+ @app.post("/pause_agent",
433
+ summary="Pause agent",
434
+ description="Pause the agent",
435
+ response_description="Result of pausing agent")
232
436
  def agent_pause():
437
+ """Pause the agent"""
233
438
  result = pause_agent()
234
439
  return result
235
440
 
236
- @app.post("/resume_agent")
441
+ @app.post("/resume_agent",
442
+ summary="Resume agent",
443
+ description="Resume the agent",
444
+ response_description="Result of resuming agent")
237
445
  def agent_resume():
446
+ """Resume the agent"""
238
447
  result = resume_agent()
239
448
  return result
240
449
 
241
- @app.get("/get_ip_addresses")
242
- def ip_addresses(subnet: str=None, api_key: str = Depends(verify_api_key)):
450
+ @app.get("/get_ip_addresses",
451
+ summary="Get IP addresses",
452
+ description="Get available IP addresses",
453
+ response_description="List of IP addresses")
454
+ def ip_addresses(subnet: str = None, api_key: str = Depends(verify_api_key)):
455
+ """
456
+ Get IP addresses with the following parameters:
457
+
458
+ - **subnet**: Optional subnet to filter by
459
+ """
243
460
  result = get_ip_addresses(subnet=subnet)
244
461
  return result
245
462
 
246
- @app.get("/list_available_pools")
247
- def pool_connected(user_only: bool=False, api_key: str = Depends(verify_api_key)):
463
+ @app.get("/list_available_pools",
464
+ summary="List available pools",
465
+ description="Get list of available pools",
466
+ response_description="List of available pools")
467
+ def pool_connected(user_only: bool = False, api_key: str = Depends(verify_api_key)):
468
+ """
469
+ List available pools with the following parameters:
470
+
471
+ - **user_only**: Whether to show only user's pools
472
+ """
248
473
  result = list_available_pools(user_only=user_only)
249
474
  return result
250
475
 
251
- @app.post("/add_node_labels")
476
+ @app.post("/add_node_labels",
477
+ summary="Add node labels",
478
+ description="Add labels to a node",
479
+ response_description="Result of adding labels")
252
480
  def node_labels(request: NodeLabelsRequest, api_key: str = Depends(verify_api_key)):
481
+ """
482
+ Add node labels with the following parameters:
483
+
484
+ - **node_name**: Name of the node
485
+ - **labels**: Dictionary of labels to add
486
+ """
253
487
  result = add_node_labels(
254
488
  node_name=request.node_name,
255
489
  labels=request.labels
256
490
  )
257
491
  return result
258
492
 
259
- @app.post("/get_node_labels")
493
+ @app.post("/get_node_labels",
494
+ summary="Get node labels",
495
+ description="Get labels for specified nodes",
496
+ response_description="Node labels")
260
497
  def node_labels_get(request: GetNodeLabelsRequest, api_key: str = Depends(verify_api_key)):
498
+ """
499
+ Get node labels with the following parameters:
500
+
501
+ - **node_names**: List of node names to get labels for
502
+ """
261
503
  result = get_node_labels(
262
504
  node_names=request.node_names
263
505
  )
@@ -1,53 +1,53 @@
1
- from pydantic import BaseModel
1
+ from pydantic import BaseModel, Field
2
2
  from typing import List, Dict, Optional
3
3
 
4
4
  from kalavai_client.core import Job, TokenType
5
5
 
6
6
 
7
7
  class InvitesRequest(BaseModel):
8
- invitees: list[str]
8
+ invitees: list[str] = Field(description="List of user identifiers to invite to the pool")
9
9
 
10
10
  class CreatePoolRequest(BaseModel):
11
- cluster_name: str
12
- ip_address: str
13
- app_values: dict = None
14
- num_gpus: int = None
15
- node_name: str = None
16
- only_registered_users: bool = False
17
- location: str = None
18
- token_mode: TokenType = TokenType.USER
19
- description: str = ""
20
- frontend: bool = False
11
+ cluster_name: str = Field(description="Name of the cluster to create")
12
+ ip_address: str = Field(description="IP address for the pool")
13
+ app_values: dict = Field(None, description="Application configuration values")
14
+ num_gpus: int = Field(None, description="Number of GPUs to allocate")
15
+ node_name: str = Field(None, description="Name of the node")
16
+ only_registered_users: bool = Field(False, description="Whether to restrict access to registered users only")
17
+ location: str = Field(None, description="Geographic location of the pool")
18
+ token_mode: TokenType = Field(TokenType.USER, description="Token type for authentication")
19
+ description: str = Field("", description="Description of the pool")
20
+ frontend: bool = Field(False, description="Whether this is a frontend request")
21
21
 
22
22
  class NodesActionRequest(BaseModel):
23
- nodes: list[str]
23
+ nodes: list[str] = Field(description="List of node names to perform the action on")
24
24
 
25
25
  class JoinPoolRequest(BaseModel):
26
- token: str
27
- ip_address: str = None
28
- node_name: str = None
29
- num_gpus: int = None
30
- frontend: bool = False
26
+ token: str = Field(description="Token to join the pool")
27
+ ip_address: str = Field(None, description="IP address for the node")
28
+ node_name: str = Field(None, description="Name of the node")
29
+ num_gpus: int = Field(None, description="Number of GPUs to allocate")
30
+ frontend: bool = Field(False, description="Whether this is a frontend request")
31
31
  class JobDetailsRequest(BaseModel):
32
- jobs: list[Job]
32
+ jobs: list[Job] = Field(description="List of jobs to get details for")
33
33
 
34
34
 
35
35
  class StopPoolRequest(BaseModel):
36
- skip_node_deletion: bool = False
36
+ skip_node_deletion: bool = Field(False, description="Whether to skip node deletion when stopping the pool")
37
37
 
38
38
  class DeployJobRequest(BaseModel):
39
- template_name: str
40
- values: dict
41
- force_namespace: str = None
42
- target_labels: dict[str, str] = None
39
+ template_name: str = Field(description="Name of the job template to use")
40
+ values: dict = Field(description="Job configuration values")
41
+ force_namespace: str = Field(None, description="Optional namespace override")
42
+ target_labels: dict[str, str] = Field(None, description="Optional target node labels")
43
43
 
44
44
  class DeleteJobRequest(BaseModel):
45
- name: str
46
- force_namespace: str = None
45
+ name: str = Field(description="Name of the job to delete")
46
+ force_namespace: str = Field(None, description="Optional namespace override")
47
47
 
48
48
  class NodeLabelsRequest(BaseModel):
49
- node_name: str
50
- labels: Dict[str, str]
49
+ node_name: str = Field(description="Name of the node to add labels to")
50
+ labels: Dict[str, str] = Field(description="Dictionary of labels to add to the node")
51
51
 
52
52
  class GetNodeLabelsRequest(BaseModel):
53
- node_names: List[str]
53
+ node_names: List[str] = Field(description="List of node names to get labels for")
kalavai_client/cli.py CHANGED
@@ -186,7 +186,9 @@ def input_gpus(non_interactive=False):
186
186
  try:
187
187
  has_gpus = check_gpu_drivers()
188
188
  if has_gpus:
189
- max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
189
+ max_gpus = len(
190
+ [r for r in run_cmd("nvidia-smi -L").decode().split("\n") if len(r.strip())>0]
191
+ )
190
192
  if non_interactive:
191
193
  num_gpus = max_gpus
192
194
  else:
kalavai_client/cluster.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import platform
2
3
  import time
3
4
  from pathlib import Path
4
5
  from abc import ABC, abstractmethod
@@ -96,7 +97,7 @@ class dockerCluster(Cluster):
96
97
  # wait for container to be setup
97
98
  while True:
98
99
  try:
99
- run_cmd(f"docker cp {self.container_name}:/etc/rancher/k3s/k3s.yaml {self.kubeconfig_file} >/dev/null 2>&1")
100
+ run_cmd(f"docker cp {self.container_name}:/etc/rancher/k3s/k3s.yaml {self.kubeconfig_file}", hide_output=True)
100
101
  break
101
102
  except:
102
103
  pass
@@ -115,15 +116,15 @@ class dockerCluster(Cluster):
115
116
  def update_dependencies(self, dependencies_file=None, debug=False, retries=3):
116
117
  if dependencies_file is not None:
117
118
  self.dependencies_file = dependencies_file
118
- if debug:
119
- output = ""
120
- else:
121
- output = " >/dev/null 2>&1"
122
119
  while True:
123
120
  try:
124
121
  home = user_path("")
125
- run_cmd(f"docker run --rm --net=host -v {home}:{home} ghcr.io/helmfile/helmfile:v0.169.2 helmfile sync --file {self.dependencies_file} --kubeconfig {self.kubeconfig_file} {output}")
126
- #run_cmd(f"helmfile sync --file {self.dependencies_file} --kubeconfig {self.kubeconfig_file} {output}")
122
+ # convert path on host to path on container (will be different in windows os)
123
+ target_path = "/cache/kalavai"
124
+ kubeconfig_path = f"{target_path}/{Path(self.kubeconfig_file).name}"
125
+ dependencies_path = f"{target_path}/{Path(self.dependencies_file).name}"
126
+
127
+ run_cmd(f"docker run --rm --net=host -v {home}:{target_path} ghcr.io/helmfile/helmfile:v0.169.2 helmfile sync --file {dependencies_path} --kubeconfig {kubeconfig_path}", hide_output=not debug)
127
128
  break
128
129
  except Exception as e:
129
130
  if retries > 0:
@@ -142,11 +143,18 @@ class dockerCluster(Cluster):
142
143
  def is_agent_running(self):
143
144
  if not os.path.isfile(self.compose_file):
144
145
  return False
145
- status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()
146
- if not status:
146
+ try:
147
+ status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()
148
+ if not status:
149
+ return False
150
+ if "windows" in platform.system().lower():
151
+ status = (0 == os.system(f'docker exec {self.container_name} ps aux | findstr /n /c:"k3s server" /c:"k3s agent"'))
152
+ else:
153
+ status = (0 == os.system(f'docker exec {self.container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'))
154
+ return status
155
+ except Exception as e:
156
+ print(f"Error when checking agent. Is Docker installed and running?\n\n{str(e)}")
147
157
  return False
148
- status = (0 == os.system(f'docker exec {self.container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'))
149
- return status
150
158
 
151
159
  def is_seed_node(self):
152
160
  if not os.path.isfile(self.compose_file):
@@ -154,7 +162,7 @@ class dockerCluster(Cluster):
154
162
  if not self.is_agent_running():
155
163
  return False
156
164
  try:
157
- run_cmd(f"docker container exec {self.container_name} cat /var/lib/rancher/k3s/server/node-token >/dev/null 2>&1")
165
+ run_cmd(f"docker container exec {self.container_name} cat /var/lib/rancher/k3s/server/node-token", hide_output=True)
158
166
  return True
159
167
  except:
160
168
  return False
@@ -162,8 +170,12 @@ class dockerCluster(Cluster):
162
170
  def is_cluster_init(self):
163
171
  if not os.path.isfile(self.compose_file):
164
172
  return False
165
- status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --all").decode()
166
- return status
173
+ try:
174
+ status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --all").decode()
175
+ return status
176
+ except Exception as e:
177
+ print(f"Error when checking cluster. Is Docker installed and running?\n\n{str(e)}")
178
+ return False
167
179
 
168
180
  def pause_agent(self):
169
181
  status = False
@@ -177,7 +189,6 @@ class dockerCluster(Cluster):
177
189
  def restart_agent(self):
178
190
  try:
179
191
  run_cmd(f'docker compose -f {self.compose_file} start')
180
-
181
192
  except:
182
193
  pass
183
194
  time.sleep(5)
@@ -186,7 +197,6 @@ class dockerCluster(Cluster):
186
197
  def get_cluster_token(self):
187
198
  if self.is_seed_node():
188
199
  return run_cmd(f"docker container exec {self.container_name} cat /var/lib/rancher/k3s/server/node-token").decode()
189
- #return run_cmd("sudo k3s token create --kubeconfig /etc/rancher/k3s/k3s.yaml --ttl 0").decode()
190
200
  else:
191
201
  return None
192
202
 
@@ -231,7 +241,7 @@ class k3sCluster(Cluster):
231
241
  flannel_iface = f"--flannel-iface {self.default_flannel_iface}"
232
242
  else:
233
243
  flannel_iface = ""
234
- run_cmd(f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="server --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} --flannel-backend wireguard-native {node_labels}" sh - >/dev/null 2>&1')
244
+ run_cmd(f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="server --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} --flannel-backend wireguard-native {node_labels}" sh - ', hide_output=True)
235
245
  run_cmd(f"sudo cp /etc/rancher/k3s/k3s.yaml {self.kubeconfig_file}")
236
246
  run_cmd(f"sudo chown $USER {self.kubeconfig_file}")
237
247
 
@@ -245,8 +255,8 @@ class k3sCluster(Cluster):
245
255
  flannel_iface = f"--flannel-iface {self.default_flannel_iface}"
246
256
  else:
247
257
  flannel_iface = ""
248
- command = f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="agent --token {token} --server https://{url}:6443 --node-name {node_name} --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} {node_labels}" sh - >/dev/null 2>&1'
249
- run_cmd(command)
258
+ command = f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="agent --token {token} --server https://{url}:6443 --node-name {node_name} --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} {node_labels}" sh - '
259
+ run_cmd(command, hide_output=True)
250
260
 
251
261
 
252
262
  def update_dependencies(self, dependencies_file=None, debug=False, retries=3):
@@ -270,13 +280,13 @@ class k3sCluster(Cluster):
270
280
 
271
281
  def remove_agent(self):
272
282
  try:
273
- run_cmd('/usr/local/bin/k3s-uninstall.sh >/dev/null 2>&1')
274
- run_cmd('sudo rm -r /etc/rancher/node/ >/dev/null 2>&1')
283
+ run_cmd('/usr/local/bin/k3s-uninstall.sh', hide_output=True)
284
+ run_cmd('sudo rm -r /etc/rancher/node/', hide_output=True)
275
285
  return True
276
286
  except:
277
287
  pass
278
288
  try:
279
- run_cmd('/usr/local/bin/k3s-agent-uninstall.sh >/dev/null 2>&1')
289
+ run_cmd('/usr/local/bin/k3s-agent-uninstall.sh', hide_output=True)
280
290
  return True
281
291
  except:
282
292
  pass
@@ -296,12 +306,12 @@ class k3sCluster(Cluster):
296
306
  def pause_agent(self):
297
307
  status = False
298
308
  try:
299
- run_cmd('sudo systemctl stop k3s >/dev/null 2>&1')
309
+ run_cmd('sudo systemctl stop k3s', hide_output=True)
300
310
  status = True
301
311
  except:
302
312
  pass
303
313
  try:
304
- run_cmd('sudo systemctl stop k3s-agent >/dev/null 2>&1')
314
+ run_cmd('sudo systemctl stop k3s-agent', hide_output=True)
305
315
  status = True
306
316
  except:
307
317
  pass
@@ -309,11 +319,11 @@ class k3sCluster(Cluster):
309
319
 
310
320
  def restart_agent(self):
311
321
  try:
312
- run_cmd('sudo systemctl start k3s >/dev/null 2>&1')
322
+ run_cmd('sudo systemctl start k3s', hide_output=True)
313
323
  except:
314
324
  pass
315
325
  try:
316
- run_cmd('sudo systemctl start k3s-agent >/dev/null 2>&1')
326
+ run_cmd('sudo systemctl start k3s-agent', hide_output=True)
317
327
  except:
318
328
  pass
319
329
  return self.is_agent_running()
kalavai_client/core.py CHANGED
@@ -33,6 +33,7 @@ from kalavai_client.utils import (
33
33
  get_public_seeds,
34
34
  load_template,
35
35
  is_storage_compatible,
36
+ get_max_gpus,
36
37
  NODE_NAME_KEY,
37
38
  MANDATORY_TOKEN_FIELDS,
38
39
  PUBLIC_LOCATION_KEY,
@@ -157,7 +158,7 @@ def check_seed_compatibility():
157
158
  logs = []
158
159
  # docker
159
160
  try:
160
- run_cmd("docker version >/dev/null 2>&1")
161
+ run_cmd("docker ps", hide_output=True)
161
162
  except:
162
163
  logs.append("[red]Docker not installed. Install instructions:\n")
163
164
  logs.append(" Linux: https://docs.docker.com/engine/install/\n")
@@ -170,7 +171,7 @@ def check_worker_compatibility():
170
171
  logs = []
171
172
  # docker
172
173
  try:
173
- run_cmd("docker version >/dev/null 2>&1")
174
+ run_cmd("docker ps", hide_output=True)
174
175
  except:
175
176
  logs.append("[red]Docker not installed. Install instructions:\n")
176
177
  logs.append(" Linux: https://docs.docker.com/engine/install/\n")
@@ -594,16 +595,6 @@ def attach_to_pool(token, node_name=None):
594
595
 
595
596
  return cluster_name
596
597
 
597
- def get_max_gpus():
598
- try:
599
- has_gpus = check_gpu_drivers()
600
- if has_gpus:
601
- return int(run_cmd("nvidia-smi -L | wc -l").decode())
602
- else:
603
- return 0
604
- except:
605
- return 0
606
-
607
598
  def generate_worker_package(num_gpus=0, node_name=None, ip_address="0.0.0.0", storage_compatible=True):
608
599
  # get pool data from token
609
600
  token = get_pool_token(mode=TokenType.WORKER)
@@ -772,10 +763,9 @@ def create_pool(
772
763
  node_name=node_name,
773
764
  node_labels=node_labels
774
765
  )
775
-
766
+
776
767
  # start server
777
768
  CLUSTER.start_seed_node()
778
-
779
769
  while not CLUSTER.is_agent_running():
780
770
  time.sleep(10)
781
771
 
kalavai_client/utils.py CHANGED
@@ -73,17 +73,20 @@ KALAVAI_AUTH = KalavaiAuth(
73
73
 
74
74
  ####### Methods to check OS compatibility ########
75
75
  def check_gpu_drivers():
76
- value = run_cmd("command -v nvidia-smi")
77
- if len(value.decode("utf-8")) == 0:
78
- # no nvidia installed, no need to check nvidia any further
79
- return False
80
- else:
81
- # check drivers are set correctly
82
- try:
83
- value = run_cmd("nvidia-smi")
84
- return True
85
- except:
86
- raise ("Nvidia not configured properly. Please check your drivers are installed and configured")
76
+ value = run_cmd("nvidia-smi", hide_output=True)
77
+ return len(value.decode("utf-8")) == 0
78
+
79
+ def get_max_gpus():
80
+ try:
81
+ has_gpus = check_gpu_drivers()
82
+ if has_gpus:
83
+ return len(
84
+ [r for r in run_cmd("nvidia-smi -L").decode().split("\n") if len(r.strip())>0]
85
+ )
86
+ else:
87
+ return 0
88
+ except:
89
+ return 0
87
90
 
88
91
  def is_storage_compatible():
89
92
  """
@@ -92,6 +95,9 @@ def is_storage_compatible():
92
95
  Exclude: WSL
93
96
  """
94
97
  try:
98
+ import platform
99
+ if "windows" in platform.system().lower():
100
+ return True
95
101
  flagged = any([
96
102
  "microsoft" in run_cmd("cat /proc/version").decode().lower()
97
103
  ])
@@ -120,6 +126,7 @@ def generate_compose_config(role, node_name, write_to_file=True, node_ip_address
120
126
  "num_gpus": num_gpus,
121
127
  "k3s_path": f"{CONTAINER_HOST_PATH}/{rand_suffix}/k3s",
122
128
  "etc_path": f"{CONTAINER_HOST_PATH}/{rand_suffix}/etc",
129
+ "random_suffix": rand_suffix,
123
130
  "node_labels": node_labels,
124
131
  "flannel_iface": DEFAULT_FLANNEL_IFACE if vpn_token is not None else "",
125
132
  "user_id": load_user_id()
@@ -231,9 +238,17 @@ def validate_poolconfig(poolconfig_file):
231
238
  return False
232
239
  return True
233
240
 
234
- def run_cmd(command):
241
+ def run_cmd(command, hide_output=False):
235
242
  try:
236
- return_value = subprocess.check_output(command, shell=True, executable="/bin/bash")
243
+ import platform
244
+ if "windows" in platform.system().lower():
245
+ if hide_output:
246
+ command = command + " > $nul 2>&1"
247
+ return_value = subprocess.check_output(command, shell=True)
248
+ else:
249
+ if hide_output:
250
+ command = command + " >/dev/null 2>&1"
251
+ return_value = subprocess.check_output(command, shell=True, executable="/bin/bash")
237
252
  return return_value
238
253
  except OSError as error:
239
254
  return error # for exit code
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: kalavai-client
3
- Version: 0.6.12
3
+ Version: 0.6.14
4
4
  Summary: Client app for kalavai platform
5
5
  License: Apache-2.0
6
6
  Keywords: LLM,platform
@@ -1,25 +1,25 @@
1
- kalavai_client/__init__.py,sha256=RnWDr4VWcRjJDXe1q7REqBSUCBo44AvIa_8rGwJtT44,23
1
+ kalavai_client/__init__.py,sha256=nfdFjB1S39Wxvq7GI6NuNOdisKxExyZQSWX0Q2sAupU,23
2
2
  kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
3
3
  kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- kalavai_client/assets/apps.yaml,sha256=AhTA3VZI27y05xHoHJCA9nvGnk8sWMhFDruBI2is3LM,6365
4
+ kalavai_client/assets/apps.yaml,sha256=L0hi826JwWW0rDRD83wcFVJVNZoUnafPdsjpharBrHE,6365
5
5
  kalavai_client/assets/apps_values.yaml,sha256=WRew3bS1MztjzcJfphuJcKn0n2T1ICRupPpr_Csjt_s,1644
6
6
  kalavai_client/assets/docker-compose-gui.yaml,sha256=DGCyGYzz1kH6kkMbo62FJHe3F9vcAmA8DOHw-c_o0Kw,752
7
- kalavai_client/assets/docker-compose-template.yaml,sha256=Nz_JzeBnQCzPCyWP5cEQHFeZzPwQqqBJ3C_xrToWlMA,1654
7
+ kalavai_client/assets/docker-compose-template.yaml,sha256=w9Eux2-lQgkGFbNhwHwurlRJe13CVZPrAGOiFBfI5I0,1763
8
8
  kalavai_client/assets/nginx.conf,sha256=drVVCg8GHucz7hmt_BI6giAhK92OV71257NTs3LthwM,225
9
9
  kalavai_client/assets/pool_config_template.yaml,sha256=fFz4w2-fMKD5KvyzFdfcWD_jSneRlmnjLc8hCctweX0,576
10
10
  kalavai_client/assets/pool_config_values.yaml,sha256=VrM3XHQfQo6QLZ68qvagooUptaYgl1pszniY_JUtemk,233
11
11
  kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
12
12
  kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
13
13
  kalavai_client/auth.py,sha256=EB3PMvKUn5_KAQkezkEHEt-OMZXyfkZguIQlUFkEHcA,3243
14
- kalavai_client/bridge_api.py,sha256=ZLyFOOz_o4agm-7DrHzoSBFrH65y__hZUoEe4diBTOA,7557
15
- kalavai_client/bridge_models.py,sha256=GbIaqGFAVs-3ikVUQZldwTTc06SsxmP6iAifH0oVDro,1219
16
- kalavai_client/cli.py,sha256=tBn3l12XhOJF17cWeX84xm10HPG7rSzalyOyugzcLhw,46900
17
- kalavai_client/cluster.py,sha256=ojUBXp2bR3hVyikIEkiGDbXvQfhBXBSk_mCqLxvyP0c,12943
18
- kalavai_client/core.py,sha256=QOzNSxNJZ8tXhUTUPZUEFI3PX8UlVVnONYqVA61NU6U,34268
14
+ kalavai_client/bridge_api.py,sha256=-z0NBUSRJkVOfP807Fd-ZX2uEtKc6BCfrDD_umQ6sNg,15592
15
+ kalavai_client/bridge_models.py,sha256=775aXLTma3dv6KmKTmebAZ55ns6d9EmNno5e4blfoNY,2738
16
+ kalavai_client/cli.py,sha256=mmwLqqSYfl9k6vqveMcbHTq7g5FFd84YUUQCSH4J0k0,46967
17
+ kalavai_client/cluster.py,sha256=Z2PIXbZuSAv9xmw-MyZP1M41BpVMpirLzG51bqGA-zc,13548
18
+ kalavai_client/core.py,sha256=u8a4uYqGS0mMJh0ArcXG2hwp2uDUSuwM5ROGXRQkHZg,34051
19
19
  kalavai_client/env.py,sha256=YsfZj7LWf6ABquDsoIFFkXCFYwenpDk8zVnGsf7qv98,2823
20
- kalavai_client/utils.py,sha256=EmjWVLjTBILjw87_6ih_v5CbVqihsYHSKxfD-C_Z-es,12276
21
- kalavai_client-0.6.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
22
- kalavai_client-0.6.12.dist-info/METADATA,sha256=ShcxZWQofBxd_VVg6LbOGaPiPDrWSix_yLIbhn7MiJ4,13354
23
- kalavai_client-0.6.12.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
24
- kalavai_client-0.6.12.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
25
- kalavai_client-0.6.12.dist-info/RECORD,,
20
+ kalavai_client/utils.py,sha256=S80bLSICvWLhtQP-dmW0OF4coKwjxmhmPIja8UArTTE,12712
21
+ kalavai_client-0.6.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
22
+ kalavai_client-0.6.14.dist-info/METADATA,sha256=UkJ77kexOEA-_8c9DwnphWnk6vZl1-1g2EGPNLJ9VHI,13354
23
+ kalavai_client-0.6.14.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
24
+ kalavai_client-0.6.14.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
25
+ kalavai_client-0.6.14.dist-info/RECORD,,