kalavai-client 0.7.6__tar.gz → 0.7.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kalavai-client might be problematic.

Files changed (25)
  1. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/PKG-INFO +1 -1
  2. kalavai_client-0.7.8/kalavai_client/__init__.py +2 -0
  3. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/apps.yaml +1 -1
  4. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/default_pool_config.yaml +1 -0
  5. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/bridge_api.py +0 -15
  6. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/cli.py +45 -13
  7. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/core.py +11 -67
  8. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/utils.py +40 -2
  9. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/pyproject.toml +1 -1
  10. kalavai_client-0.7.6/kalavai_client/__init__.py +0 -2
  11. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/LICENSE +0 -0
  12. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/README.md +0 -0
  13. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/__main__.py +0 -0
  14. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/__init__.py +0 -0
  15. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/docker-compose-gui.yaml +0 -0
  16. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/docker-compose-template.yaml +0 -0
  17. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/model_deployment_values.yaml +0 -0
  18. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/nginx.conf +0 -0
  19. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/pool_config_template.yaml +0 -0
  20. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/user_workspace.yaml +0 -0
  21. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/user_workspace_values.yaml +0 -0
  22. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/auth.py +0 -0
  23. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/bridge_models.py +0 -0
  24. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/cluster.py +0 -0
  25. {kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/env.py +0 -0
{kalavai_client-0.7.6 → kalavai_client-0.7.8}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: kalavai-client
- Version: 0.7.6
+ Version: 0.7.8
  Summary: Client app for kalavai platform
  License-Expression: Apache-2.0
  License-File: LICENSE
kalavai_client-0.7.8/kalavai_client/__init__.py
@@ -0,0 +1,2 @@
+
+ __version__ = "0.7.8"
{kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/apps.yaml
@@ -174,7 +174,7 @@ releases:
  - name: replicas
    value: 1
  - name: image_tag
-   value: "v2025.10.1" #"v2025.07.34"
+   value: "{{watcher_image_tag}}" #"v2025.07.34"
  - name: deployment.in_cluster
    value: "True"
  - name: deployment.kalavai_username_key
{kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/assets/default_pool_config.yaml
@@ -4,6 +4,7 @@ server:
    location: null
    name: "kalavai_cluster"
    mtu: 1280
+   watcher_image_tag: "v2025.10.3"

  core:
  # Deploy systems
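
Together, these two asset changes replace the hardcoded watcher image tag in apps.yaml with a {{watcher_image_tag}} placeholder and move the concrete default ("v2025.10.3") into the pool config. A minimal sketch of the resulting precedence, mirroring the fallback added to create_pool in core.py further down; the function name and file path are illustrative, not part of the package:

import yaml

def resolve_watcher_image_tag(explicit_tag, pool_config_file="default_pool_config.yaml"):
    # An explicit value (CLI flag or API argument) wins; otherwise fall back to the
    # server.watcher_image_tag default shipped with the pool config.
    with open(pool_config_file, "r") as f:
        config_values = yaml.safe_load(f)
    return config_values["server"]["watcher_image_tag"] if explicit_tag is None else explicit_tag

# resolve_watcher_image_tag(None) -> "v2025.10.3"
# resolve_watcher_image_tag("v2025.11.0") -> "v2025.11.0"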
{kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/bridge_api.py
@@ -59,7 +59,6 @@ from kalavai_client.core import (
      add_node_labels,
      get_node_labels,
      generate_worker_package,
-     get_deployment_values,
      TokenType
  )
  from kalavai_client.utils import (
@@ -447,20 +446,6 @@ def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)
      )
      return result

- @app.get("/get_deployment_values",
-     operation_id="get_deployment_values",
-     summary="Get deployment template values for a given model",
-     description="Given a model id from Huggingface, return the deployment template values required to load the model instance in the pool, including number of workers, number of gpus and gpu backend.",
-     tags=["job_management"],
-     response_description="Deployment template values")
- def get_deployment_template_values(model_id: str, api_key: str = Depends(verify_api_key)):
-     """
-     Get the deployment template values for a given model id:
-
-     - **model_id**: Model id from Huggingface type mode
-     """
-     return get_deployment_values(model_id=model_id)
-
  @app.post("/delete_job",
      operation_id="delete_job",
      summary="Terminate and remove a job from the pool",
{kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/cli.py
@@ -6,6 +6,7 @@ import uuid
  import time
  import socket
  from pathlib import Path
+ from typing import Annotated

  import yaml

@@ -62,8 +63,7 @@ from kalavai_client.core import (
      uncordon_nodes,
      TokenType,
      unregister_pool,
-     update_pool,
-     get_deployment_values
+     update_pool
  )
  from kalavai_client.utils import (
      check_gpu_drivers,
@@ -78,7 +78,8 @@ from kalavai_client.utils import (
      load_user_id,
      SERVER_IP_KEY,
      CLUSTER_NAME_KEY,
-     KALAVAI_AUTH
+     KALAVAI_AUTH,
+     parse_key_value_pairs
  )


@@ -397,7 +398,20 @@ def pool__list(*others, user_only=False):


  @arguably.command
- def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None, platform="amd64", ip_address: str=None, location: str=None, app_values: str=None, pool_config_values: str=None, non_interactive: bool=False):
+ def pool__start(
+     *others,
+     pool_config_file=None,
+     apps: list=None,
+     mtu: str=None,
+     watcher_image_tag: str=None,
+     platform="amd64",
+     ip_address: str=None,
+     location: str=None,
+     app_values: str=None,
+     pool_config_values: str=None,
+     non_interactive: bool=False,
+     node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
+ ):

      """
      Start Kalavai pool and start/resume sharing resources.
@@ -409,6 +423,9 @@ def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None,
      if CLUSTER.is_cluster_init():
          console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
          return
+
+     if node_labels:
+         console.log(f"[blue]Configuration received: {node_labels}")

      # User acknowledgement
      if not non_interactive:
@@ -438,10 +455,12 @@
          ip_address=ip_address,
          location=location,
          target_platform=platform,
+         watcher_image_tag=watcher_image_tag,
          pool_config_file=pool_config_file,
          apps=apps,
          num_gpus=input_gpus(non_interactive=non_interactive),
-         mtu=mtu
+         mtu=mtu,
+         node_labels=node_labels
      )

      if "warning" in result:
@@ -499,14 +518,32 @@ def pool__check_token(token, *others, public=False, verbose=False):
      return True

  @arguably.command
- def pool__join(token, *others, mtu=None, platform="amd64", node_name=None, non_interactive=False):
+ def pool__join(
+     token,
+     *others,
+     mtu=None,
+     platform="amd64",
+     node_name=None,
+     non_interactive=False,
+     node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
+ ):
      """
      Join Kalavai pool and start/resume sharing resources.

      Args:
+         token: Pool join token
          *others: all the other positional arguments go here
+         mtu: Maximum transmission unit
+         platform: Target platform (default: amd64)
+         node_name: Name for this node
+         non_interactive: Run in non-interactive mode
+         node_labels: Node labels as key=value pairs (e.g., "key1=value1,key2=value2")
      """

+     # Process node labels if provided
+     if node_labels:
+         console.log(f"[blue]Configuration received: {node_labels}")
+
      # check that k3s is not running already in the host
      # k3s service running or preinstalled
      if CLUSTER.is_agent_running():
@@ -554,7 +591,8 @@ def pool__join(token, *others, mtu=None, platform="amd64", node_name=None, non_i
          node_name=node_name,
          num_gpus=num_gpus,
          ip_address=ip_address,
-         mtu=mtu
+         mtu=mtu,
+         node_labels=node_labels
      )
      if "error" in result:
          console.log(f"[red]Error when connecting: {result}")
@@ -1170,12 +1208,6 @@ def job__delete(name, *others, force_namespace: str=None):
      console.log(f"{result}")


- @arguably.command
- def job__model_requirements(model_id: str, *others):
-     values = get_deployment_values(model_id=model_id)
-     console.log(values)
-
-
  @arguably.command
  def job__estimate(
      *others,
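
Both commands pick up node labels through the same pattern: the parameter is typed as Annotated[dict, arguably.arg.handler(parse_key_value_pairs)], so arguably runs the raw "key=value,key=value" string through the parser before the command body sees it. A minimal standalone sketch of that pattern, assuming the arguably framework already used by cli.py; the command and flag below are illustrative and not part of the package:

from typing import Annotated

import arguably

from kalavai_client.utils import parse_key_value_pairs

@arguably.command
def demo__labels(
    *others,
    node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
):
    # Invoked as e.g. `demo labels --node-labels "gpu=rtx4090,zone=home"`,
    # node_labels arrives here already parsed: {"gpu": "rtx4090", "zone": "home"}
    print(node_labels)

if __name__ == "__main__":
    arguably.run()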
{kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/core.py
@@ -47,6 +47,7 @@ from kalavai_client.utils import (
      WRITE_AUTH_KEY,
      WATCHER_PORT_KEY,
      WATCHER_SERVICE_KEY,
+     WATCHER_IMAGE_TAG_KEY,
      USER_NODE_LABEL_KEY,
      ALLOW_UNREGISTERED_USER_KEY,
      KALAVAI_AUTH
@@ -104,70 +105,6 @@ class TokenType(Enum):
      WORKER = 2


- def get_deployment_values(model_id: str):
-     """
-     Given a model ID and the resources in the pool, identify key
-     computing values required to deploy the model.
-     - GPU_BACKEND: rocm or cuda
-     - WORKERS: number of nodes to use
-     -
-     """
-     # get hardcoded deployment values (per model)
-     with open(MODEL_DEPLOYMENT_VALUES_MAPPING, "r") as f:
-         mapping = yaml.safe_load(f)
-
-     def _parse_memory_str(memory: str):
-         memory = memory.replace("G", "")
-         return int(memory)
-
-     def _get_num_workers(memory_values: list[int], size):
-         workers = 0
-         available_memory = 0
-         for gpu_mem in memory_values:
-             available_memory += gpu_mem
-             workers += 1
-             if available_memory >= size:
-                 break
-         return workers
-
-     # get resources
-     if model_id in mapping:
-         model_size = mapping[model_id]["size"]
-         # get gpus and extract available memory
-         nvidia_gpu_mems = []
-         amd_gpu_mems = []
-         backends = set()
-         for node_name, gpus in load_gpu_models():
-             for gpu in gpus["gpus"]:
-                 if "nvidia" in gpu["model"].lower():
-                     nvidia_gpu_mems.append(_parse_memory_str(gpu["memory"]))
-                     backends.add("cuda")
-                 else:
-                     amd_gpu_mems.append(_parse_memory_str(gpu["memory"]))
-                     backends.add("rocm")
-         nvidia_gpu_mems = sorted(nvidia_gpu_mems, reverse=False)
-         amd_gpu_mems = sorted(amd_gpu_mems, reverse=False)
-         # calculate num workers required
-         if sum(nvidia_gpu_mems) >= model_size and sum(amd_gpu_mems) < model_size:
-             gpu_backend = "cuda"
-             num_workers = _get_num_workers(memory_values=nvidia_gpu_mems, size=model_size)
-         elif sum(amd_gpu_mems) >= model_size and sum(nvidia_gpu_mems) < model_size:
-             gpu_backend = "rocm"
-             num_workers = _get_num_workers(memory_values=amd_gpu_mems, size=model_size)
-         else:
-             gpu_backend = random.choice(list(backends))
-             num_workers = _get_num_workers(
-                 memory_values=amd_gpu_mems if gpu_backend == "rocm" else nvidia_gpu_mems,
-                 size=model_size
-             )
-         # populate selected template
-         mapping[model_id][gpu_backend]["values"]["workers"] = num_workers
-         mapping[model_id][gpu_backend]["values"]["pipeline_parallel_size"] = num_workers
-
-         return mapping[model_id][gpu_backend]
-     return None
-
-
  def set_schedulable(schedulable, node_names):
      """
      Delete job in the cluster
@@ -735,7 +672,8 @@ def join_pool(
      node_name=None,
      ip_address=None,
      target_platform="amd64",
-     mtu="1420"
+     mtu="1420",
+     node_labels={}
  ):
      compatibility = check_worker_compatibility()
      if len(compatibility["issues"]) > 0:
@@ -768,6 +706,7 @@

      # join private network if provided
      node_labels = {
+         **node_labels,
          STORAGE_CLASS_LABEL: is_storage_compatible(),
          NODE_ROLE_LABEL: "worker"
      }
@@ -826,13 +765,15 @@ def create_pool(
      ip_address: str=None,
      location: str=None,
      target_platform: str="amd64",
+     watcher_image_tag: str=None,
      pool_config_file: str=None,
      description: str="",
      token_mode: TokenType=TokenType.USER,
      num_gpus: int=-1,
      node_name: str=None,
      mtu: str=None,
-     apps: list=[]
+     apps: list=[],
+     node_labels: dict={}
  ):

      if not check_seed_compatibility():
@@ -847,6 +788,7 @@
      user_id = load_user_id()

      node_labels = {
+         **node_labels,
          STORAGE_CLASS_LABEL: is_storage_compatible(),
          NODE_ROLE_LABEL: "server"
      }
@@ -859,6 +801,7 @@
          config_values = yaml.safe_load(f)
          # use default values if not provided
          try:
+             watcher_image_tag = config_values["server"]["watcher_image_tag"] if watcher_image_tag is None else watcher_image_tag
              cluster_name = config_values["server"]["name"] if cluster_name is None else cluster_name
              ip_address = config_values["server"]["ip_address"] if ip_address is None else ip_address
              location = config_values["server"]["location"] if location is None else location
@@ -916,7 +859,8 @@
          WATCHER_PORT_KEY: DEFAULT_WATCHER_PORT,
          WATCHER_SERVICE_KEY: watcher_service,
          USER_NODE_LABEL_KEY: USER_NODE_LABEL,
-         ALLOW_UNREGISTERED_USER_KEY: True, # Change this if only registered users are allowed
+         WATCHER_IMAGE_TAG_KEY: watcher_image_tag,
+         ALLOW_UNREGISTERED_USER_KEY: True # Change this if only registered users are allowed
      }

      store_server_info(
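
In both join_pool and create_pool the caller-supplied node_labels are spread first and the built-in labels second, so the reserved storage and role labels always win on a key clash. A small sketch of that merge order using ordinary dict semantics; the literal keys stand in for STORAGE_CLASS_LABEL and NODE_ROLE_LABEL and are illustrative only:

user_labels = {"zone": "home", "kalavai.node_role": "server"}  # tries to override a reserved label
node_labels = {
    **user_labels,                       # user labels first...
    "kalavai.storage_enabled": "True",   # ...then the reserved ones, which take precedence
    "kalavai.node_role": "worker",
}
# node_labels == {"zone": "home", "kalavai.storage_enabled": "True", "kalavai.node_role": "worker"}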
{kalavai_client-0.7.6 → kalavai_client-0.7.8}/kalavai_client/utils.py
@@ -44,6 +44,7 @@ USER_API_KEY = "user_api_key"
  READONLY_AUTH_KEY = "watcher_readonly_key"
  WATCHER_SERVICE_KEY = "watcher_service"
  WATCHER_PORT_KEY = "watcher_port"
+ WATCHER_IMAGE_TAG_KEY = "watcher_image_tag"
  ENDPOINT_PORTS_KEY = "endpoint_ports"
  TEMPLATE_ID_FIELD = "id_field"
  TEMPLATE_ID_KEY = "deployment_id"
@@ -106,6 +107,32 @@ def is_storage_compatible():
      return False
  ################

+ def parse_key_value_pairs(input_str: str) -> dict:
+     """Parse key=value pairs from a string into a dictionary.
+
+     Args:
+         input_str: String containing key=value pairs separated by commas
+
+     Returns:
+         Dictionary with parsed key-value pairs
+
+     Raises:
+         ValueError: If any pair is not in key=value format
+     """
+     if not input_str.strip():
+         return {}
+
+     result = {}
+     for pair in input_str.split(','):
+         pair = pair.strip()
+         if not pair:
+             continue
+         if '=' not in pair:
+             raise ValueError(f"Invalid key=value pair: '{pair}'. Expected format: key=value")
+         key, value = pair.split('=', 1)
+         result[key.strip()] = value.strip()
+     return result
+
  def extract_auth_token(headers):
      """
      Extract auth token. Valid headers:
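
Since parse_key_value_pairs is added in full above, its behaviour can be pinned down with a few calls; a short usage sketch:

from kalavai_client.utils import parse_key_value_pairs

parse_key_value_pairs("gpu=rtx4090, zone=home")  # {'gpu': 'rtx4090', 'zone': 'home'}
parse_key_value_pairs("")                        # {} (blank input is accepted)
parse_key_value_pairs("not-a-pair")              # raises ValueError: Invalid key=value pair: 'not-a-pair'. ...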
@@ -129,8 +156,19 @@ def extract_auth_token(headers):
      return {"error": str(e)}


- def generate_compose_config(role, node_name, mtu="1420", target_platform="amd64", write_to_file=True, node_ip_address="0.0.0.0", num_gpus=0, node_labels=None, pool_ip=None, vpn_token=None, pool_token=None):
-
+ def generate_compose_config(
+     role,
+     node_name,
+     mtu="1420",
+     target_platform="amd64",
+     write_to_file=True,
+     node_ip_address="0.0.0.0",
+     num_gpus=0,
+     node_labels=None,
+     pool_ip=None,
+     vpn_token=None,
+     pool_token=None
+ ):
      if node_labels is not None:
          node_labels = " ".join([f"--node-label {key}={value}" for key, value in node_labels.items()])
      rand_suffix = uuid.uuid4().hex[:8]
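
The hunk above also shows where the labels end up: generate_compose_config flattens the dict into repeated --node-label flags for the generated compose file (cli.py refers to a k3s agent, which accepts that flag, though that mapping is an inference here). A tiny sketch of the transformation, with illustrative label values:

labels = {"kalavai.node_role": "worker", "zone": "home"}
flag_string = " ".join([f"--node-label {key}={value}" for key, value in labels.items()])
# flag_string == "--node-label kalavai.node_role=worker --node-label zone=home"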
{kalavai_client-0.7.6 → kalavai_client-0.7.8}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "kalavai-client"
- version = "0.7.6"
+ version = "0.7.8"
  authors = [
      {name = "Carlos Fernandez Musoles", email = "carlos@kalavai.net"}
  ]
kalavai_client-0.7.6/kalavai_client/__init__.py
@@ -1,2 +0,0 @@
-
- __version__ = "0.7.6"
The remaining files (11 to 25 in the list above) are unchanged between versions.