kalavai-client 0.7.5__py3-none-any.whl → 0.7.7__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registries.

Potentially problematic release.


This version of kalavai-client might be problematic. Click here for more details.

kalavai_client/__init__.py CHANGED
@@ -1,2 +1,2 @@
1
1
 
2
- __version__ = "0.7.5"
2
+ __version__ = "0.7.7"
kalavai_client/assets/apps.yaml CHANGED
@@ -128,6 +128,8 @@ releases:
128
128
  value: {{prometheus_server_retention}}
129
129
  - name: server.persistentVolume.size
130
130
  value: {{prometheus_disk_size}}
131
+ - name: prometheus-node-exporter.hostRootFsMount.enabled
132
+ value: false
131
133
  - name: volcano-sh
132
134
  namespace: kalavai
133
135
  chart: volcano-sh/volcano
@@ -172,7 +174,7 @@ releases:
172
174
  - name: replicas
173
175
  value: 1
174
176
  - name: image_tag
175
- value: "v2025.09.2" #"v2025.07.34"
177
+ value: "v2025.10.2" #"v2025.07.34"
176
178
  - name: deployment.in_cluster
177
179
  value: "True"
178
180
  - name: deployment.kalavai_username_key
kalavai_client/bridge_api.py CHANGED
@@ -59,7 +59,6 @@ from kalavai_client.core import (
59
59
  add_node_labels,
60
60
  get_node_labels,
61
61
  generate_worker_package,
62
- get_deployment_values,
63
62
  TokenType
64
63
  )
65
64
  from kalavai_client.utils import (
@@ -447,20 +446,6 @@ def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)
447
446
  )
448
447
  return result
449
448
 
450
- @app.get("/get_deployment_values",
451
- operation_id="get_deployment_values",
452
- summary="Get deployment template values for a given model",
453
- description="Given a model id from Huggingface, return the deployment template values required to load the model instance in the pool, including number of workers, number of gpus and gpu backend.",
454
- tags=["job_management"],
455
- response_description="Deployment template values")
456
- def get_deployment_template_values(model_id: str, api_key: str = Depends(verify_api_key)):
457
- """
458
- Get the deployment template values for a given model id:
459
-
460
- - **model_id**: Model id from Huggingface type mode
461
- """
462
- return get_deployment_values(model_id=model_id)
463
-
464
449
  @app.post("/delete_job",
465
450
  operation_id="delete_job",
466
451
  summary="Terminate and remove a job from the pool",
kalavai_client/cli.py CHANGED
@@ -6,6 +6,7 @@ import uuid
6
6
  import time
7
7
  import socket
8
8
  from pathlib import Path
9
+ from typing import Annotated
9
10
 
10
11
  import yaml
11
12
 
@@ -62,8 +63,7 @@ from kalavai_client.core import (
62
63
  uncordon_nodes,
63
64
  TokenType,
64
65
  unregister_pool,
65
- update_pool,
66
- get_deployment_values
66
+ update_pool
67
67
  )
68
68
  from kalavai_client.utils import (
69
69
  check_gpu_drivers,
@@ -78,7 +78,8 @@ from kalavai_client.utils import (
78
78
  load_user_id,
79
79
  SERVER_IP_KEY,
80
80
  CLUSTER_NAME_KEY,
81
- KALAVAI_AUTH
81
+ KALAVAI_AUTH,
82
+ parse_key_value_pairs
82
83
  )
83
84
 
84
85
 
@@ -397,7 +398,19 @@ def pool__list(*others, user_only=False):
397
398
 
398
399
 
399
400
  @arguably.command
400
- def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None, platform="amd64", ip_address: str=None, location: str=None, app_values: str=None, pool_config_values: str=None, non_interactive: bool=False):
401
+ def pool__start(
402
+ *others,
403
+ pool_config_file=None,
404
+ apps: list=None,
405
+ mtu: str=None,
406
+ platform="amd64",
407
+ ip_address: str=None,
408
+ location: str=None,
409
+ app_values: str=None,
410
+ pool_config_values: str=None,
411
+ non_interactive: bool=False,
412
+ node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
413
+ ):
401
414
 
402
415
  """
403
416
  Start Kalavai pool and start/resume sharing resources.
@@ -409,6 +422,9 @@ def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None,
409
422
  if CLUSTER.is_cluster_init():
410
423
  console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
411
424
  return
425
+
426
+ if node_labels:
427
+ console.log(f"[blue]Configuration received: {node_labels}")
412
428
 
413
429
  # User acknowledgement
414
430
  if not non_interactive:
@@ -441,7 +457,8 @@ def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None,
441
457
  pool_config_file=pool_config_file,
442
458
  apps=apps,
443
459
  num_gpus=input_gpus(non_interactive=non_interactive),
444
- mtu=mtu
460
+ mtu=mtu,
461
+ node_labels=node_labels
445
462
  )
446
463
 
447
464
  if "warning" in result:
@@ -499,14 +516,32 @@ def pool__check_token(token, *others, public=False, verbose=False):
499
516
  return True
500
517
 
501
518
  @arguably.command
502
- def pool__join(token, *others, mtu=None, platform="amd64", node_name=None, non_interactive=False):
519
+ def pool__join(
520
+ token,
521
+ *others,
522
+ mtu=None,
523
+ platform="amd64",
524
+ node_name=None,
525
+ non_interactive=False,
526
+ node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
527
+ ):
503
528
  """
504
529
  Join Kalavai pool and start/resume sharing resources.
505
530
 
506
531
  Args:
532
+ token: Pool join token
507
533
  *others: all the other positional arguments go here
534
+ mtu: Maximum transmission unit
535
+ platform: Target platform (default: amd64)
536
+ node_name: Name for this node
537
+ non_interactive: Run in non-interactive mode
538
+ node_labels: Node labels as key=value pairs (e.g., "key1=value1,key2=value2")
508
539
  """
509
540
 
541
+ # Process node labels if provided
542
+ if node_labels:
543
+ console.log(f"[blue]Configuration received: {node_labels}")
544
+
510
545
  # check that k3s is not running already in the host
511
546
  # k3s service running or preinstalled
512
547
  if CLUSTER.is_agent_running():
@@ -554,7 +589,8 @@ def pool__join(token, *others, mtu=None, platform="amd64", node_name=None, non_i
554
589
  node_name=node_name,
555
590
  num_gpus=num_gpus,
556
591
  ip_address=ip_address,
557
- mtu=mtu
592
+ mtu=mtu,
593
+ node_labels=node_labels
558
594
  )
559
595
  if "error" in result:
560
596
  console.log(f"[red]Error when connecting: {result}")
@@ -1170,12 +1206,6 @@ def job__delete(name, *others, force_namespace: str=None):
1170
1206
  console.log(f"{result}")
1171
1207
 
1172
1208
 
1173
- @arguably.command
1174
- def job__model_requirements(model_id: str, *others):
1175
- values = get_deployment_values(model_id=model_id)
1176
- console.log(values)
1177
-
1178
-
1179
1209
  @arguably.command
1180
1210
  def job__estimate(
1181
1211
  *others,
kalavai_client/core.py CHANGED
@@ -104,70 +104,6 @@ class TokenType(Enum):
104
104
  WORKER = 2
105
105
 
106
106
 
107
- def get_deployment_values(model_id: str):
108
- """
109
- Given a model ID and the resources in the pool, identify key
110
- computing values required to deploy the model.
111
- - GPU_BACKEND: rocm or cuda
112
- - WORKERS: number of nodes to use
113
- -
114
- """
115
- # get hardcoded deployment values (per model)
116
- with open(MODEL_DEPLOYMENT_VALUES_MAPPING, "r") as f:
117
- mapping = yaml.safe_load(f)
118
-
119
- def _parse_memory_str(memory: str):
120
- memory = memory.replace("G", "")
121
- return int(memory)
122
-
123
- def _get_num_workers(memory_values: list[int], size):
124
- workers = 0
125
- available_memory = 0
126
- for gpu_mem in memory_values:
127
- available_memory += gpu_mem
128
- workers += 1
129
- if available_memory >= size:
130
- break
131
- return workers
132
-
133
- # get resources
134
- if model_id in mapping:
135
- model_size = mapping[model_id]["size"]
136
- # get gpus and extract available memory
137
- nvidia_gpu_mems = []
138
- amd_gpu_mems = []
139
- backends = set()
140
- for node_name, gpus in load_gpu_models():
141
- for gpu in gpus["gpus"]:
142
- if "nvidia" in gpu["model"].lower():
143
- nvidia_gpu_mems.append(_parse_memory_str(gpu["memory"]))
144
- backends.add("cuda")
145
- else:
146
- amd_gpu_mems.append(_parse_memory_str(gpu["memory"]))
147
- backends.add("rocm")
148
- nvidia_gpu_mems = sorted(nvidia_gpu_mems, reverse=False)
149
- amd_gpu_mems = sorted(amd_gpu_mems, reverse=False)
150
- # calculate num workers required
151
- if sum(nvidia_gpu_mems) >= model_size and sum(amd_gpu_mems) < model_size:
152
- gpu_backend = "cuda"
153
- num_workers = _get_num_workers(memory_values=nvidia_gpu_mems, size=model_size)
154
- elif sum(amd_gpu_mems) >= model_size and sum(nvidia_gpu_mems) < model_size:
155
- gpu_backend = "rocm"
156
- num_workers = _get_num_workers(memory_values=amd_gpu_mems, size=model_size)
157
- else:
158
- gpu_backend = random.choice(list(backends))
159
- num_workers = _get_num_workers(
160
- memory_values=amd_gpu_mems if gpu_backend == "rocm" else nvidia_gpu_mems,
161
- size=model_size
162
- )
163
- # populate selected template
164
- mapping[model_id][gpu_backend]["values"]["workers"] = num_workers
165
- mapping[model_id][gpu_backend]["values"]["pipeline_parallel_size"] = num_workers
166
-
167
- return mapping[model_id][gpu_backend]
168
- return None
169
-
170
-
171
107
  def set_schedulable(schedulable, node_names):
172
108
  """
173
109
  Delete job in the cluster
@@ -735,7 +671,8 @@ def join_pool(
735
671
  node_name=None,
736
672
  ip_address=None,
737
673
  target_platform="amd64",
738
- mtu="1420"
674
+ mtu="1420",
675
+ node_labels={}
739
676
  ):
740
677
  compatibility = check_worker_compatibility()
741
678
  if len(compatibility["issues"]) > 0:
@@ -746,6 +683,9 @@ def join_pool(
746
683
 
747
684
  if node_name is None:
748
685
  node_name = socket.gethostname()
686
+
687
+ if mtu is None:
688
+ mtu = "1420"
749
689
 
750
690
  # check token
751
691
  valid = check_token(token=token)
@@ -765,6 +705,7 @@ def join_pool(
765
705
 
766
706
  # join private network if provided
767
707
  node_labels = {
708
+ **node_labels,
768
709
  STORAGE_CLASS_LABEL: is_storage_compatible(),
769
710
  NODE_ROLE_LABEL: "worker"
770
711
  }
@@ -829,7 +770,8 @@ def create_pool(
829
770
  num_gpus: int=-1,
830
771
  node_name: str=None,
831
772
  mtu: str=None,
832
- apps: list=[]
773
+ apps: list=[],
774
+ node_labels: dict={}
833
775
  ):
834
776
 
835
777
  if not check_seed_compatibility():
@@ -844,6 +786,7 @@ def create_pool(
844
786
  user_id = load_user_id()
845
787
 
846
788
  node_labels = {
789
+ **node_labels,
847
790
  STORAGE_CLASS_LABEL: is_storage_compatible(),
848
791
  NODE_ROLE_LABEL: "server"
849
792
  }
kalavai_client/utils.py CHANGED
@@ -106,6 +106,32 @@ def is_storage_compatible():
106
106
  return False
107
107
  ################
108
108
 
109
+ def parse_key_value_pairs(input_str: str) -> dict:
110
+ """Parse key=value pairs from a string into a dictionary.
111
+
112
+ Args:
113
+ input_str: String containing key=value pairs separated by commas
114
+
115
+ Returns:
116
+ Dictionary with parsed key-value pairs
117
+
118
+ Raises:
119
+ ValueError: If any pair is not in key=value format
120
+ """
121
+ if not input_str.strip():
122
+ return {}
123
+
124
+ result = {}
125
+ for pair in input_str.split(','):
126
+ pair = pair.strip()
127
+ if not pair:
128
+ continue
129
+ if '=' not in pair:
130
+ raise ValueError(f"Invalid key=value pair: '{pair}'. Expected format: key=value")
131
+ key, value = pair.split('=', 1)
132
+ result[key.strip()] = value.strip()
133
+ return result
134
+
109
135
  def extract_auth_token(headers):
110
136
  """
111
137
  Extract auth token. Valid headers:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kalavai-client
3
- Version: 0.7.5
3
+ Version: 0.7.7
4
4
  Summary: Client app for kalavai platform
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -1,7 +1,7 @@
1
- kalavai_client/__init__.py,sha256=2yNV33mxa4mw0NRZdX1rF9R0X_uQaQl59WtFaMqS_ss,22
1
+ kalavai_client/__init__.py,sha256=_he-4_uHDJNaqLXyzYdBG_rgSv3K0P5DCywjUCh7v5c,22
2
2
  kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
3
3
  kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- kalavai_client/assets/apps.yaml,sha256=z5lLrBbR8KVP5Ou3pbD6cELYWATs9VJuRu-PPe1GMNk,7007
4
+ kalavai_client/assets/apps.yaml,sha256=juO6kIl8LjykWI-BPsALSdADc9cDE73t8wP2W6ll2co,7087
5
5
  kalavai_client/assets/default_pool_config.yaml,sha256=FLfT0i-xPgEZhleTOZmfyFYrB9fAEMOmZ0YGLyJr0OA,1879
6
6
  kalavai_client/assets/docker-compose-gui.yaml,sha256=OAVO0ohaCpDB9FGeih0yAbVNwUfDtaCzssZ25uiuJyA,787
7
7
  kalavai_client/assets/docker-compose-template.yaml,sha256=8giwNUVbVGxF_YppNMIsS7evWZ8qb4n2enK22u-x6Pk,1920
@@ -11,15 +11,15 @@ kalavai_client/assets/pool_config_template.yaml,sha256=MhBZQsEMKrBgbUVSKgIGmXWhy
11
11
  kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
12
12
  kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
13
13
  kalavai_client/auth.py,sha256=EB3PMvKUn5_KAQkezkEHEt-OMZXyfkZguIQlUFkEHcA,3243
14
- kalavai_client/bridge_api.py,sha256=6NGRIbu3CtWyT75XCFKkLSfllF54vGJmYBjnX9qiWX8,27657
14
+ kalavai_client/bridge_api.py,sha256=0TvAGgsyfMkbcVqoPKsjhXQLo06WetBH93mZH-pOM7U,26921
15
15
  kalavai_client/bridge_models.py,sha256=bq6vQNTI1py7e_1YgnBZhorFsAKoBqBVN7nRukCuQRE,2960
16
- kalavai_client/cli.py,sha256=7EdEHjJxoS0lgmWFyT72k7rl3-qJ7ODz9ju_wZGYB1I,48312
16
+ kalavai_client/cli.py,sha256=yGCOS-eI01E_EdxWFG9IgUZnSQyjaEV8NqrX2M-YcZw,49010
17
17
  kalavai_client/cluster.py,sha256=Z2PIXbZuSAv9xmw-MyZP1M41BpVMpirLzG51bqGA-zc,13548
18
- kalavai_client/core.py,sha256=BUlCOeQnXTAqwjD4rxWIshk9Sf_s-QJy9UB6cUbcECo,38906
18
+ kalavai_client/core.py,sha256=ElDeF0AjzoZaECUg1RbwolJItuX6b71i03Gr9gIXLfY,36529
19
19
  kalavai_client/env.py,sha256=0L5gfEo5KY8gflrW-rSADx10ffDa-8gXmmrGWztKUd8,3099
20
- kalavai_client/utils.py,sha256=L0FQJT8EucabPAbxGwzMqeawJdj4wTZqmmZ0OdsyVwM,13470
21
- kalavai_client-0.7.5.dist-info/METADATA,sha256=u04xcVwDMzNCp-xU_6TE791B5xQlSc2jnqnzRB587vI,13175
22
- kalavai_client-0.7.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
23
- kalavai_client-0.7.5.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
24
- kalavai_client-0.7.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
25
- kalavai_client-0.7.5.dist-info/RECORD,,
20
+ kalavai_client/utils.py,sha256=swMB-3elI20GihzZgqSFD5sQfQ-Nh2_TGbEnKNHx7EU,14230
21
+ kalavai_client-0.7.7.dist-info/METADATA,sha256=o6ywphe6vy6cDQI7RuRhBFjcW5-KZOYV_xMCFrPmpMc,13175
22
+ kalavai_client-0.7.7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
23
+ kalavai_client-0.7.7.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
24
+ kalavai_client-0.7.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
25
+ kalavai_client-0.7.7.dist-info/RECORD,,