kalavai-client 0.7.6__py3-none-any.whl → 0.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kalavai-client might be problematic. Click here for more details.
- kalavai_client/__init__.py +1 -1
- kalavai_client/assets/apps.yaml +1 -1
- kalavai_client/bridge_api.py +0 -15
- kalavai_client/cli.py +43 -13
- kalavai_client/core.py +6 -66
- kalavai_client/utils.py +26 -0
- {kalavai_client-0.7.6.dist-info → kalavai_client-0.7.7.dist-info}/METADATA +1 -1
- {kalavai_client-0.7.6.dist-info → kalavai_client-0.7.7.dist-info}/RECORD +11 -11
- {kalavai_client-0.7.6.dist-info → kalavai_client-0.7.7.dist-info}/WHEEL +0 -0
- {kalavai_client-0.7.6.dist-info → kalavai_client-0.7.7.dist-info}/entry_points.txt +0 -0
- {kalavai_client-0.7.6.dist-info → kalavai_client-0.7.7.dist-info}/licenses/LICENSE +0 -0
kalavai_client/__init__.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
|
|
2
|
-
__version__ = "0.7.6"
|
|
2
|
+
__version__ = "0.7.7"
|
kalavai_client/assets/apps.yaml
CHANGED
kalavai_client/bridge_api.py
CHANGED
|
@@ -59,7 +59,6 @@ from kalavai_client.core import (
|
|
|
59
59
|
add_node_labels,
|
|
60
60
|
get_node_labels,
|
|
61
61
|
generate_worker_package,
|
|
62
|
-
get_deployment_values,
|
|
63
62
|
TokenType
|
|
64
63
|
)
|
|
65
64
|
from kalavai_client.utils import (
|
|
@@ -447,20 +446,6 @@ def job_deploy(request: DeployJobRequest, api_key: str = Depends(verify_api_key)
|
|
|
447
446
|
)
|
|
448
447
|
return result
|
|
449
448
|
|
|
450
|
-
@app.get("/get_deployment_values",
|
|
451
|
-
operation_id="get_deployment_values",
|
|
452
|
-
summary="Get deployment template values for a given model",
|
|
453
|
-
description="Given a model id from Huggingface, return the deployment template values required to load the model instance in the pool, including number of workers, number of gpus and gpu backend.",
|
|
454
|
-
tags=["job_management"],
|
|
455
|
-
response_description="Deployment template values")
|
|
456
|
-
def get_deployment_template_values(model_id: str, api_key: str = Depends(verify_api_key)):
|
|
457
|
-
"""
|
|
458
|
-
Get the deployment template values for a given model id:
|
|
459
|
-
|
|
460
|
-
- **model_id**: Model id from Huggingface type mode
|
|
461
|
-
"""
|
|
462
|
-
return get_deployment_values(model_id=model_id)
|
|
463
|
-
|
|
464
449
|
@app.post("/delete_job",
|
|
465
450
|
operation_id="delete_job",
|
|
466
451
|
summary="Terminate and remove a job from the pool",
|
kalavai_client/cli.py
CHANGED
|
@@ -6,6 +6,7 @@ import uuid
|
|
|
6
6
|
import time
|
|
7
7
|
import socket
|
|
8
8
|
from pathlib import Path
|
|
9
|
+
from typing import Annotated
|
|
9
10
|
|
|
10
11
|
import yaml
|
|
11
12
|
|
|
@@ -62,8 +63,7 @@ from kalavai_client.core import (
|
|
|
62
63
|
uncordon_nodes,
|
|
63
64
|
TokenType,
|
|
64
65
|
unregister_pool,
|
|
65
|
-
update_pool,
|
|
66
|
-
get_deployment_values
|
|
66
|
+
update_pool
|
|
67
67
|
)
|
|
68
68
|
from kalavai_client.utils import (
|
|
69
69
|
check_gpu_drivers,
|
|
@@ -78,7 +78,8 @@ from kalavai_client.utils import (
|
|
|
78
78
|
load_user_id,
|
|
79
79
|
SERVER_IP_KEY,
|
|
80
80
|
CLUSTER_NAME_KEY,
|
|
81
|
-
KALAVAI_AUTH
|
|
81
|
+
KALAVAI_AUTH,
|
|
82
|
+
parse_key_value_pairs
|
|
82
83
|
)
|
|
83
84
|
|
|
84
85
|
|
|
@@ -397,7 +398,19 @@ def pool__list(*others, user_only=False):
|
|
|
397
398
|
|
|
398
399
|
|
|
399
400
|
@arguably.command
|
|
400
|
-
def pool__start(
|
|
401
|
+
def pool__start(
|
|
402
|
+
*others,
|
|
403
|
+
pool_config_file=None,
|
|
404
|
+
apps: list=None,
|
|
405
|
+
mtu: str=None,
|
|
406
|
+
platform="amd64",
|
|
407
|
+
ip_address: str=None,
|
|
408
|
+
location: str=None,
|
|
409
|
+
app_values: str=None,
|
|
410
|
+
pool_config_values: str=None,
|
|
411
|
+
non_interactive: bool=False,
|
|
412
|
+
node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
|
|
413
|
+
):
|
|
401
414
|
|
|
402
415
|
"""
|
|
403
416
|
Start Kalavai pool and start/resume sharing resources.
|
|
@@ -409,6 +422,9 @@ def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None,
|
|
|
409
422
|
if CLUSTER.is_cluster_init():
|
|
410
423
|
console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
|
|
411
424
|
return
|
|
425
|
+
|
|
426
|
+
if node_labels:
|
|
427
|
+
console.log(f"[blue]Configuration received: {node_labels}")
|
|
412
428
|
|
|
413
429
|
# User acknowledgement
|
|
414
430
|
if not non_interactive:
|
|
@@ -441,7 +457,8 @@ def pool__start(*others, pool_config_file=None, apps: list=None, mtu: str=None,
|
|
|
441
457
|
pool_config_file=pool_config_file,
|
|
442
458
|
apps=apps,
|
|
443
459
|
num_gpus=input_gpus(non_interactive=non_interactive),
|
|
444
|
-
mtu=mtu
|
|
460
|
+
mtu=mtu,
|
|
461
|
+
node_labels=node_labels
|
|
445
462
|
)
|
|
446
463
|
|
|
447
464
|
if "warning" in result:
|
|
@@ -499,14 +516,32 @@ def pool__check_token(token, *others, public=False, verbose=False):
|
|
|
499
516
|
return True
|
|
500
517
|
|
|
501
518
|
@arguably.command
|
|
502
|
-
def pool__join(
|
|
519
|
+
def pool__join(
|
|
520
|
+
token,
|
|
521
|
+
*others,
|
|
522
|
+
mtu=None,
|
|
523
|
+
platform="amd64",
|
|
524
|
+
node_name=None,
|
|
525
|
+
non_interactive=False,
|
|
526
|
+
node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
|
|
527
|
+
):
|
|
503
528
|
"""
|
|
504
529
|
Join Kalavai pool and start/resume sharing resources.
|
|
505
530
|
|
|
506
531
|
Args:
|
|
532
|
+
token: Pool join token
|
|
507
533
|
*others: all the other positional arguments go here
|
|
534
|
+
mtu: Maximum transmission unit
|
|
535
|
+
platform: Target platform (default: amd64)
|
|
536
|
+
node_name: Name for this node
|
|
537
|
+
non_interactive: Run in non-interactive mode
|
|
538
|
+
node_labels: Node labels as key=value pairs (e.g., "key1=value1,key2=value2")
|
|
508
539
|
"""
|
|
509
540
|
|
|
541
|
+
# Process node labels if provided
|
|
542
|
+
if node_labels:
|
|
543
|
+
console.log(f"[blue]Configuration received: {node_labels}")
|
|
544
|
+
|
|
510
545
|
# check that k3s is not running already in the host
|
|
511
546
|
# k3s service running or preinstalled
|
|
512
547
|
if CLUSTER.is_agent_running():
|
|
@@ -554,7 +589,8 @@ def pool__join(token, *others, mtu=None, platform="amd64", node_name=None, non_i
|
|
|
554
589
|
node_name=node_name,
|
|
555
590
|
num_gpus=num_gpus,
|
|
556
591
|
ip_address=ip_address,
|
|
557
|
-
mtu=mtu
|
|
592
|
+
mtu=mtu,
|
|
593
|
+
node_labels=node_labels
|
|
558
594
|
)
|
|
559
595
|
if "error" in result:
|
|
560
596
|
console.log(f"[red]Error when connecting: {result}")
|
|
@@ -1170,12 +1206,6 @@ def job__delete(name, *others, force_namespace: str=None):
|
|
|
1170
1206
|
console.log(f"{result}")
|
|
1171
1207
|
|
|
1172
1208
|
|
|
1173
|
-
@arguably.command
|
|
1174
|
-
def job__model_requirements(model_id: str, *others):
|
|
1175
|
-
values = get_deployment_values(model_id=model_id)
|
|
1176
|
-
console.log(values)
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
1209
|
@arguably.command
|
|
1180
1210
|
def job__estimate(
|
|
1181
1211
|
*others,
|
kalavai_client/core.py
CHANGED
|
@@ -104,70 +104,6 @@ class TokenType(Enum):
|
|
|
104
104
|
WORKER = 2
|
|
105
105
|
|
|
106
106
|
|
|
107
|
-
def get_deployment_values(model_id: str):
|
|
108
|
-
"""
|
|
109
|
-
Given a model ID and the resources in the pool, identify key
|
|
110
|
-
computing values required to deploy the model.
|
|
111
|
-
- GPU_BACKEND: rocm or cuda
|
|
112
|
-
- WORKERS: number of nodes to use
|
|
113
|
-
-
|
|
114
|
-
"""
|
|
115
|
-
# get hardcoded deployment values (per model)
|
|
116
|
-
with open(MODEL_DEPLOYMENT_VALUES_MAPPING, "r") as f:
|
|
117
|
-
mapping = yaml.safe_load(f)
|
|
118
|
-
|
|
119
|
-
def _parse_memory_str(memory: str):
|
|
120
|
-
memory = memory.replace("G", "")
|
|
121
|
-
return int(memory)
|
|
122
|
-
|
|
123
|
-
def _get_num_workers(memory_values: list[int], size):
|
|
124
|
-
workers = 0
|
|
125
|
-
available_memory = 0
|
|
126
|
-
for gpu_mem in memory_values:
|
|
127
|
-
available_memory += gpu_mem
|
|
128
|
-
workers += 1
|
|
129
|
-
if available_memory >= size:
|
|
130
|
-
break
|
|
131
|
-
return workers
|
|
132
|
-
|
|
133
|
-
# get resources
|
|
134
|
-
if model_id in mapping:
|
|
135
|
-
model_size = mapping[model_id]["size"]
|
|
136
|
-
# get gpus and extract available memory
|
|
137
|
-
nvidia_gpu_mems = []
|
|
138
|
-
amd_gpu_mems = []
|
|
139
|
-
backends = set()
|
|
140
|
-
for node_name, gpus in load_gpu_models():
|
|
141
|
-
for gpu in gpus["gpus"]:
|
|
142
|
-
if "nvidia" in gpu["model"].lower():
|
|
143
|
-
nvidia_gpu_mems.append(_parse_memory_str(gpu["memory"]))
|
|
144
|
-
backends.add("cuda")
|
|
145
|
-
else:
|
|
146
|
-
amd_gpu_mems.append(_parse_memory_str(gpu["memory"]))
|
|
147
|
-
backends.add("rocm")
|
|
148
|
-
nvidia_gpu_mems = sorted(nvidia_gpu_mems, reverse=False)
|
|
149
|
-
amd_gpu_mems = sorted(amd_gpu_mems, reverse=False)
|
|
150
|
-
# calculate num workers required
|
|
151
|
-
if sum(nvidia_gpu_mems) >= model_size and sum(amd_gpu_mems) < model_size:
|
|
152
|
-
gpu_backend = "cuda"
|
|
153
|
-
num_workers = _get_num_workers(memory_values=nvidia_gpu_mems, size=model_size)
|
|
154
|
-
elif sum(amd_gpu_mems) >= model_size and sum(nvidia_gpu_mems) < model_size:
|
|
155
|
-
gpu_backend = "rocm"
|
|
156
|
-
num_workers = _get_num_workers(memory_values=amd_gpu_mems, size=model_size)
|
|
157
|
-
else:
|
|
158
|
-
gpu_backend = random.choice(list(backends))
|
|
159
|
-
num_workers = _get_num_workers(
|
|
160
|
-
memory_values=amd_gpu_mems if gpu_backend == "rocm" else nvidia_gpu_mems,
|
|
161
|
-
size=model_size
|
|
162
|
-
)
|
|
163
|
-
# populate selected template
|
|
164
|
-
mapping[model_id][gpu_backend]["values"]["workers"] = num_workers
|
|
165
|
-
mapping[model_id][gpu_backend]["values"]["pipeline_parallel_size"] = num_workers
|
|
166
|
-
|
|
167
|
-
return mapping[model_id][gpu_backend]
|
|
168
|
-
return None
|
|
169
|
-
|
|
170
|
-
|
|
171
107
|
def set_schedulable(schedulable, node_names):
|
|
172
108
|
"""
|
|
173
109
|
Delete job in the cluster
|
|
@@ -735,7 +671,8 @@ def join_pool(
|
|
|
735
671
|
node_name=None,
|
|
736
672
|
ip_address=None,
|
|
737
673
|
target_platform="amd64",
|
|
738
|
-
mtu="1420"
|
|
674
|
+
mtu="1420",
|
|
675
|
+
node_labels={}
|
|
739
676
|
):
|
|
740
677
|
compatibility = check_worker_compatibility()
|
|
741
678
|
if len(compatibility["issues"]) > 0:
|
|
@@ -768,6 +705,7 @@ def join_pool(
|
|
|
768
705
|
|
|
769
706
|
# join private network if provided
|
|
770
707
|
node_labels = {
|
|
708
|
+
**node_labels,
|
|
771
709
|
STORAGE_CLASS_LABEL: is_storage_compatible(),
|
|
772
710
|
NODE_ROLE_LABEL: "worker"
|
|
773
711
|
}
|
|
@@ -832,7 +770,8 @@ def create_pool(
|
|
|
832
770
|
num_gpus: int=-1,
|
|
833
771
|
node_name: str=None,
|
|
834
772
|
mtu: str=None,
|
|
835
|
-
apps: list=[]
|
|
773
|
+
apps: list=[],
|
|
774
|
+
node_labels: dict={}
|
|
836
775
|
):
|
|
837
776
|
|
|
838
777
|
if not check_seed_compatibility():
|
|
@@ -847,6 +786,7 @@ def create_pool(
|
|
|
847
786
|
user_id = load_user_id()
|
|
848
787
|
|
|
849
788
|
node_labels = {
|
|
789
|
+
**node_labels,
|
|
850
790
|
STORAGE_CLASS_LABEL: is_storage_compatible(),
|
|
851
791
|
NODE_ROLE_LABEL: "server"
|
|
852
792
|
}
|
kalavai_client/utils.py
CHANGED
|
@@ -106,6 +106,32 @@ def is_storage_compatible():
|
|
|
106
106
|
return False
|
|
107
107
|
################
|
|
108
108
|
|
|
109
|
+
def parse_key_value_pairs(input_str: str) -> dict:
|
|
110
|
+
"""Parse key=value pairs from a string into a dictionary.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
input_str: String containing key=value pairs separated by commas
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
Dictionary with parsed key-value pairs
|
|
117
|
+
|
|
118
|
+
Raises:
|
|
119
|
+
ValueError: If any pair is not in key=value format
|
|
120
|
+
"""
|
|
121
|
+
if not input_str.strip():
|
|
122
|
+
return {}
|
|
123
|
+
|
|
124
|
+
result = {}
|
|
125
|
+
for pair in input_str.split(','):
|
|
126
|
+
pair = pair.strip()
|
|
127
|
+
if not pair:
|
|
128
|
+
continue
|
|
129
|
+
if '=' not in pair:
|
|
130
|
+
raise ValueError(f"Invalid key=value pair: '{pair}'. Expected format: key=value")
|
|
131
|
+
key, value = pair.split('=', 1)
|
|
132
|
+
result[key.strip()] = value.strip()
|
|
133
|
+
return result
|
|
134
|
+
|
|
109
135
|
def extract_auth_token(headers):
|
|
110
136
|
"""
|
|
111
137
|
Extract auth token. Valid headers:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
kalavai_client/__init__.py,sha256=
|
|
1
|
+
kalavai_client/__init__.py,sha256=_he-4_uHDJNaqLXyzYdBG_rgSv3K0P5DCywjUCh7v5c,22
|
|
2
2
|
kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
|
|
3
3
|
kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
kalavai_client/assets/apps.yaml,sha256=
|
|
4
|
+
kalavai_client/assets/apps.yaml,sha256=juO6kIl8LjykWI-BPsALSdADc9cDE73t8wP2W6ll2co,7087
|
|
5
5
|
kalavai_client/assets/default_pool_config.yaml,sha256=FLfT0i-xPgEZhleTOZmfyFYrB9fAEMOmZ0YGLyJr0OA,1879
|
|
6
6
|
kalavai_client/assets/docker-compose-gui.yaml,sha256=OAVO0ohaCpDB9FGeih0yAbVNwUfDtaCzssZ25uiuJyA,787
|
|
7
7
|
kalavai_client/assets/docker-compose-template.yaml,sha256=8giwNUVbVGxF_YppNMIsS7evWZ8qb4n2enK22u-x6Pk,1920
|
|
@@ -11,15 +11,15 @@ kalavai_client/assets/pool_config_template.yaml,sha256=MhBZQsEMKrBgbUVSKgIGmXWhy
|
|
|
11
11
|
kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
|
|
12
12
|
kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
|
|
13
13
|
kalavai_client/auth.py,sha256=EB3PMvKUn5_KAQkezkEHEt-OMZXyfkZguIQlUFkEHcA,3243
|
|
14
|
-
kalavai_client/bridge_api.py,sha256=
|
|
14
|
+
kalavai_client/bridge_api.py,sha256=0TvAGgsyfMkbcVqoPKsjhXQLo06WetBH93mZH-pOM7U,26921
|
|
15
15
|
kalavai_client/bridge_models.py,sha256=bq6vQNTI1py7e_1YgnBZhorFsAKoBqBVN7nRukCuQRE,2960
|
|
16
|
-
kalavai_client/cli.py,sha256=
|
|
16
|
+
kalavai_client/cli.py,sha256=yGCOS-eI01E_EdxWFG9IgUZnSQyjaEV8NqrX2M-YcZw,49010
|
|
17
17
|
kalavai_client/cluster.py,sha256=Z2PIXbZuSAv9xmw-MyZP1M41BpVMpirLzG51bqGA-zc,13548
|
|
18
|
-
kalavai_client/core.py,sha256=
|
|
18
|
+
kalavai_client/core.py,sha256=ElDeF0AjzoZaECUg1RbwolJItuX6b71i03Gr9gIXLfY,36529
|
|
19
19
|
kalavai_client/env.py,sha256=0L5gfEo5KY8gflrW-rSADx10ffDa-8gXmmrGWztKUd8,3099
|
|
20
|
-
kalavai_client/utils.py,sha256=
|
|
21
|
-
kalavai_client-0.7.
|
|
22
|
-
kalavai_client-0.7.
|
|
23
|
-
kalavai_client-0.7.
|
|
24
|
-
kalavai_client-0.7.
|
|
25
|
-
kalavai_client-0.7.
|
|
20
|
+
kalavai_client/utils.py,sha256=swMB-3elI20GihzZgqSFD5sQfQ-Nh2_TGbEnKNHx7EU,14230
|
|
21
|
+
kalavai_client-0.7.7.dist-info/METADATA,sha256=o6ywphe6vy6cDQI7RuRhBFjcW5-KZOYV_xMCFrPmpMc,13175
|
|
22
|
+
kalavai_client-0.7.7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
23
|
+
kalavai_client-0.7.7.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
|
|
24
|
+
kalavai_client-0.7.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
25
|
+
kalavai_client-0.7.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|