kalavai-client 0.7.6__tar.gz → 0.7.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kalavai-client might be problematic; see the registry's advisory page for more details.

Files changed (25) hide show
  1. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/PKG-INFO +1 -1
  2. kalavai_client-0.7.7/kalavai_client/__init__.py +2 -0
  3. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/apps.yaml +1 -1
  4. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/bridge_api.py +0 -15
  5. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/cli.py +43 -13
  6. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/core.py +6 -66
  7. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/utils.py +26 -0
  8. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/pyproject.toml +1 -1
  9. kalavai_client-0.7.6/kalavai_client/__init__.py +0 -2
  10. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/LICENSE +0 -0
  11. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/README.md +0 -0
  12. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/__main__.py +0 -0
  13. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/__init__.py +0 -0
  14. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/default_pool_config.yaml +0 -0
  15. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/docker-compose-gui.yaml +0 -0
  16. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/docker-compose-template.yaml +0 -0
  17. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/model_deployment_values.yaml +0 -0
  18. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/nginx.conf +0 -0
  19. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/pool_config_template.yaml +0 -0
  20. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/user_workspace.yaml +0 -0
  21. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/assets/user_workspace_values.yaml +0 -0
  22. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/auth.py +0 -0
  23. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/bridge_models.py +0 -0
  24. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/cluster.py +0 -0
  25. {kalavai_client-0.7.6 → kalavai_client-0.7.7}/kalavai_client/env.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kalavai-client
3
- Version: 0.7.6
3
+ Version: 0.7.7
4
4
  Summary: Client app for kalavai platform
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -0,0 +1,2 @@
1
+
2
+ __version__ = "0.7.7"
@@ -174,7 +174,7 @@ releases:
174
174
  - name: replicas
175
175
  value: 1
176
176
  - name: image_tag
177
- value: "v2025.10.1" #"v2025.07.34"
177
+ value: "v2025.10.2" #"v2025.07.34"
178
178
  - name: deployment.in_cluster
179
179
  value: "True"
180
180
  - name: deployment.kalavai_username_key
@@ -59,7 +59,6 @@ from kalavai_client.core import (
59
59
  add_node_labels,
60
60
  get_node_labels,
61
61
  generate_worker_package,
62
- get_deployment_values,
63
62
  TokenType
64
63
  )
65
64
  from kalavai_client.utils import (
@@ -447,20 +446,6 @@ def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)
447
446
  )
448
447
  return result
449
448
 
450
- @app.get("/get_deployment_values",
451
- operation_id="get_deployment_values",
452
- summary="Get deployment template values for a given model",
453
- description="Given a model id from Huggingface, return the deployment template values required to load the model instance in the pool, including number of workers, number of gpus and gpu backend.",
454
- tags=["job_management"],
455
- response_description="Deployment template values")
456
- def get_deployment_template_values(model_id: str, api_key: str = Depends(verify_api_key)):
457
- """
458
- Get the deployment template values for a given model id:
459
-
460
- - **model_id**: Model id from Huggingface type mode
461
- """
462
- return get_deployment_values(model_id=model_id)
463
-
464
449
  @app.post("/delete_job",
465
450
  operation_id="delete_job",
466
451
  summary="Terminate and remove a job from the pool",
@@ -6,6 +6,7 @@ import uuid
6
6
  import time
7
7
  import socket
8
8
  from pathlib import Path
9
+ from typing import Annotated
9
10
 
10
11
  import yaml
11
12
 
@@ -62,8 +63,7 @@ from kalavai_client.core import (
62
63
  uncordon_nodes,
63
64
  TokenType,
64
65
  unregister_pool,
65
- update_pool,
66
- get_deployment_values
66
+ update_pool
67
67
  )
68
68
  from kalavai_client.utils import (
69
69
  check_gpu_drivers,
@@ -78,7 +78,8 @@ from kalavai_client.utils import (
78
78
  load_user_id,
79
79
  SERVER_IP_KEY,
80
80
  CLUSTER_NAME_KEY,
81
- KALAVAI_AUTH
81
+ KALAVAI_AUTH,
82
+ parse_key_value_pairs
82
83
  )
83
84
 
84
85
 
@@ -397,7 +398,19 @@ def pool__list(*others, user_only=False):
397
398
 
398
399
 
399
400
  @arguably.command
400
- def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None, platform="amd64", ip_address: str=None, location: str=None, app_values: str=None, pool_config_values: str=None, non_interactive: bool=False):
401
+ def pool__start(
402
+ *others,
403
+ pool_config_file=None,
404
+ apps: list=None,
405
+ mtu: str=None,
406
+ platform="amd64",
407
+ ip_address: str=None,
408
+ location: str=None,
409
+ app_values: str=None,
410
+ pool_config_values: str=None,
411
+ non_interactive: bool=False,
412
+ node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
413
+ ):
401
414
 
402
415
  """
403
416
  Start Kalavai pool and start/resume sharing resources.
@@ -409,6 +422,9 @@ def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None,
409
422
  if CLUSTER.is_cluster_init():
410
423
  console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
411
424
  return
425
+
426
+ if node_labels:
427
+ console.log(f"[blue]Configuration received: {node_labels}")
412
428
 
413
429
  # User acknowledgement
414
430
  if not non_interactive:
@@ -441,7 +457,8 @@ def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None,
441
457
  pool_config_file=pool_config_file,
442
458
  apps=apps,
443
459
  num_gpus=input_gpus(non_interactive=non_interactive),
444
- mtu=mtu
460
+ mtu=mtu,
461
+ node_labels=node_labels
445
462
  )
446
463
 
447
464
  if "warning" in result:
@@ -499,14 +516,32 @@ def pool__check_token(token, *others, public=False, verbose=False):
499
516
  return True
500
517
 
501
518
  @arguably.command
502
- def pool__join(token, *others, mtu=None, platform="amd64", node_name=None, non_interactive=False):
519
+ def pool__join(
520
+ token,
521
+ *others,
522
+ mtu=None,
523
+ platform="amd64",
524
+ node_name=None,
525
+ non_interactive=False,
526
+ node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
527
+ ):
503
528
  """
504
529
  Join Kalavai pool and start/resume sharing resources.
505
530
 
506
531
  Args:
532
+ token: Pool join token
507
533
  *others: all the other positional arguments go here
534
+ mtu: Maximum transmission unit
535
+ platform: Target platform (default: amd64)
536
+ node_name: Name for this node
537
+ non_interactive: Run in non-interactive mode
538
+ node_labels: Node labels as key=value pairs (e.g., "key1=value1,key2=value2")
508
539
  """
509
540
 
541
+ # Process node labels if provided
542
+ if node_labels:
543
+ console.log(f"[blue]Configuration received: {node_labels}")
544
+
510
545
  # check that k3s is not running already in the host
511
546
  # k3s service running or preinstalled
512
547
  if CLUSTER.is_agent_running():
@@ -554,7 +589,8 @@ def pool__join(token, *others, mtu=None, platform="amd64", node_name=None, non_i
554
589
  node_name=node_name,
555
590
  num_gpus=num_gpus,
556
591
  ip_address=ip_address,
557
- mtu=mtu
592
+ mtu=mtu,
593
+ node_labels=node_labels
558
594
  )
559
595
  if "error" in result:
560
596
  console.log(f"[red]Error when connecting: {result}")
@@ -1170,12 +1206,6 @@ def job__delete(name, *others, force_namespace: str=None):
1170
1206
  console.log(f"{result}")
1171
1207
 
1172
1208
 
1173
- @arguably.command
1174
- def job__model_requirements(model_id: str, *others):
1175
- values = get_deployment_values(model_id=model_id)
1176
- console.log(values)
1177
-
1178
-
1179
1209
  @arguably.command
1180
1210
  def job__estimate(
1181
1211
  *others,
@@ -104,70 +104,6 @@ class TokenType(Enum):
104
104
  WORKER = 2
105
105
 
106
106
 
107
def get_deployment_values(model_id: str):
    """
    Given a model ID and the resources in the pool, identify key
    computing values required to deploy the model.

    - GPU_BACKEND: rocm or cuda
    - WORKERS: number of nodes to use

    Returns the per-backend values dict from the deployment mapping (with
    ``workers`` and ``pipeline_parallel_size`` filled in), or ``None`` when
    the model id is not present in the mapping file.
    """
    # get hardcoded deployment values (per model)
    with open(MODEL_DEPLOYMENT_VALUES_MAPPING, "r") as f:
        mapping = yaml.safe_load(f)

    def _parse_memory_str(memory: str):
        # Strip the "G" unit suffix and return whole gigabytes as an int.
        memory = memory.replace("G", "")
        return int(memory)

    def _get_num_workers(memory_values: list[int], size):
        # Greedily accumulate GPU memory until the model size fits.
        # NOTE(review): if total memory < size this returns len(memory_values)
        # rather than signalling "does not fit" — presumably intentional; confirm.
        workers = 0
        available_memory = 0
        for gpu_mem in memory_values:
            available_memory += gpu_mem
            workers += 1
            if available_memory >= size:
                break
        return workers

    # get resources
    if model_id in mapping:
        model_size = mapping[model_id]["size"]
        # get gpus and extract available memory
        nvidia_gpu_mems = []
        amd_gpu_mems = []
        backends = set()
        for node_name, gpus in load_gpu_models():
            for gpu in gpus["gpus"]:
                # Any non-NVIDIA model string is treated as an AMD/rocm GPU.
                if "nvidia" in gpu["model"].lower():
                    nvidia_gpu_mems.append(_parse_memory_str(gpu["memory"]))
                    backends.add("cuda")
                else:
                    amd_gpu_mems.append(_parse_memory_str(gpu["memory"]))
                    backends.add("rocm")
        # Ascending order: smallest GPUs are consumed first by _get_num_workers.
        nvidia_gpu_mems = sorted(nvidia_gpu_mems, reverse=False)
        amd_gpu_mems = sorted(amd_gpu_mems, reverse=False)
        # calculate num workers required: pick the backend whose pooled memory
        # can hold the model; tie (both fit or neither fits) is broken at random.
        if sum(nvidia_gpu_mems) >= model_size and sum(amd_gpu_mems) < model_size:
            gpu_backend = "cuda"
            num_workers = _get_num_workers(memory_values=nvidia_gpu_mems, size=model_size)
        elif sum(amd_gpu_mems) >= model_size and sum(nvidia_gpu_mems) < model_size:
            gpu_backend = "rocm"
            num_workers = _get_num_workers(memory_values=amd_gpu_mems, size=model_size)
        else:
            # NOTE(review): if the pool reports no GPUs, `backends` is empty and
            # random.choice raises IndexError — confirm callers guard for this.
            gpu_backend = random.choice(list(backends))
            num_workers = _get_num_workers(
                memory_values=amd_gpu_mems if gpu_backend == "rocm" else nvidia_gpu_mems,
                size=model_size
            )
        # populate selected template
        mapping[model_id][gpu_backend]["values"]["workers"] = num_workers
        mapping[model_id][gpu_backend]["values"]["pipeline_parallel_size"] = num_workers

        return mapping[model_id][gpu_backend]
    return None
169
-
170
-
171
107
  def set_schedulable(schedulable, node_names):
172
108
  """
173
109
  Delete job in the cluster
@@ -735,7 +671,8 @@ def join_pool(
735
671
  node_name=None,
736
672
  ip_address=None,
737
673
  target_platform="amd64",
738
- mtu="1420"
674
+ mtu="1420",
675
+ node_labels={}
739
676
  ):
740
677
  compatibility = check_worker_compatibility()
741
678
  if len(compatibility["issues"]) > 0:
@@ -768,6 +705,7 @@ def join_pool(
768
705
 
769
706
  # join private network if provided
770
707
  node_labels = {
708
+ **node_labels,
771
709
  STORAGE_CLASS_LABEL: is_storage_compatible(),
772
710
  NODE_ROLE_LABEL: "worker"
773
711
  }
@@ -832,7 +770,8 @@ def create_pool(
832
770
  num_gpus: int=-1,
833
771
  node_name: str=None,
834
772
  mtu: str=None,
835
- apps: list=[]
773
+ apps: list=[],
774
+ node_labels: dict={}
836
775
  ):
837
776
 
838
777
  if not check_seed_compatibility():
@@ -847,6 +786,7 @@ def create_pool(
847
786
  user_id = load_user_id()
848
787
 
849
788
  node_labels = {
789
+ **node_labels,
850
790
  STORAGE_CLASS_LABEL: is_storage_compatible(),
851
791
  NODE_ROLE_LABEL: "server"
852
792
  }
@@ -106,6 +106,32 @@ def is_storage_compatible():
106
106
  return False
107
107
  ################
108
108
 
109
def parse_key_value_pairs(input_str: str) -> dict:
    """Turn a comma-separated list of key=value pairs into a dictionary.

    Args:
        input_str: String of key=value pairs separated by commas.

    Returns:
        Dictionary mapping each (stripped) key to its (stripped) value.

    Raises:
        ValueError: When a non-empty segment lacks the key=value format.
    """
    parsed: dict = {}
    # A blank / whitespace-only string means "no labels provided".
    if not input_str.strip():
        return parsed
    for chunk in input_str.split(','):
        entry = chunk.strip()
        # Tolerate stray commas (e.g. trailing comma) by skipping empties.
        if not entry:
            continue
        if '=' not in entry:
            raise ValueError(f"Invalid key=value pair: '{entry}'. Expected format: key=value")
        # Split on the first '=' only, so values may themselves contain '='.
        name, _, raw_value = entry.partition('=')
        parsed[name.strip()] = raw_value.strip()
    return parsed
134
+
109
135
  def extract_auth_token(headers):
110
136
  """
111
137
  Extract auth token. Valid headers:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "kalavai-client"
3
- version = "0.7.6"
3
+ version = "0.7.7"
4
4
  authors = [
5
5
  {name = "Carlos Fernandez Musoles", email = "carlos@kalavai.net"}
6
6
  ]
@@ -1,2 +0,0 @@
1
-
2
- __version__ = "0.7.6"
File without changes
File without changes