kalavai-client 0.5.12__py3-none-any.whl → 0.5.13__py3-none-any.whl
- kalavai_client/__init__.py +1 -1
- kalavai_client/assets/apps.yaml +1 -1
- kalavai_client/cli.py +119 -216
- kalavai_client/cluster.py +3 -0
- kalavai_client/core.py +227 -0
- kalavai_client/env.py +19 -0
- kalavai_client/utils.py +3 -11
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/METADATA +5 -2
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/RECORD +12 -10
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/LICENSE +0 -0
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/WHEEL +0 -0
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/entry_points.txt +0 -0
kalavai_client/__init__.py
CHANGED
@@ -1,2 +1,2 @@
 
-__version__ = "0.5.12"
+__version__ = "0.5.13"
kalavai_client/assets/apps.yaml
CHANGED
kalavai_client/cli.py
CHANGED
@@ -15,6 +15,21 @@ import netifaces as ni
 import arguably
 from rich.console import Console
 
+from kalavai_client.env import (
+    USER_COOKIE,
+    USER_LOCAL_SERVER_FILE,
+    TEMPLATE_LABEL,
+    user_path
+)
+from kalavai_client.core import (
+    fetch_resources,
+    fetch_job_names,
+    fetch_job_details,
+    fetch_devices,
+    fetch_job_logs,
+    fetch_gpus,
+    load_gpu_models
+)
 from kalavai_client.utils import (
     check_gpu_drivers,
     run_cmd,
@@ -27,10 +42,8 @@ from kalavai_client.utils import (
     generate_table,
     request_to_server,
     resource_path,
-    user_path,
     safe_remove,
     leave_vpn,
-    join_vpn,
     get_vpn_details,
     load_server_info,
     user_login,
@@ -68,7 +81,6 @@ LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
 VERSION = 1
 RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
 CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
-TEMPLATE_LABEL = "kalavai.job.name"
 RAY_LABEL = "kalavai.ray.name"
 PVC_NAME_LABEL = "kalavai.storage.name"
 DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
@@ -98,9 +110,7 @@ USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
 USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
 USER_HELM_APPS_FILE = user_path("apps.yaml")
 USER_KUBECONFIG_FILE = user_path("kubeconfig")
-USER_LOCAL_SERVER_FILE = user_path(".server")
 USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
-USER_COOKIE = user_path(".user_cookie.pkl")
 
 
 console = Console()
@@ -118,7 +128,7 @@ CLUSTER = dockerCluster(
 ######################
 ## HELPER FUNCTIONS ##
 ######################
-
+
 def check_seed_compatibility():
     """Check required packages to start pools"""
     logs = []
@@ -288,21 +298,11 @@ def select_ip_address(subnet=None):
         console.log("[red] Input error")
     return ips[option]
 
-def fetch_gpus():
-    data = request_to_server(
-        method="post",
-        endpoint="/v1/get_node_gpus",
-        data={},
-        server_creds=USER_LOCAL_SERVER_FILE,
-        user_cookie=USER_COOKIE
-    )
-    return data.items()
-
 def select_gpus(message):
     console.log(f"[yellow]{message}")
     gpu_models = ["Any/None"]
     gpu_models_full = ["Any/None"]
-    available_gpus = fetch_gpus()
+    available_gpus = load_gpu_models()
     for _, gpus in available_gpus:
         for gpu in gpus["gpus"]:
             #status = "free" if "ready" in gpu else "busy"
@@ -737,7 +737,7 @@ def pool__check_token(token, *others, public=False):
 
 
 @arguably.command
-def pool__join(token, *others, node_name=None, ip_address: str=None):
+def pool__join(token, *others, node_name=None):
     """
     Join Kalavai pool and start/resume sharing resources.
 
@@ -860,7 +860,7 @@ def pool__join(token, *others, node_name=None, ip_address: str=None):
             CLUSTER.start_worker_node()
         except Exception as e:
             console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
-
+            pool__stop()
             exit()
 
     # ensure we are connected
@@ -869,6 +869,22 @@ def pool__join(token, *others, node_name=None, ip_address: str=None):
         time.sleep(30)
         if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
             break
+
+    # send note to server to let them know the node is coming online
+    if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
+        console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with [yellow]--node-name'")
+        pool__stop()
+        return
+
+    # check the node has connected successfully
+    try:
+        while not CLUSTER.is_agent_running():
+            console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
+            time.sleep(30)
+    except KeyboardInterrupt:
+        console.log("[red]Installation aborted. Leaving pool.")
+        pool__stop()
+        return
 
     init_user_workspace()
 
@@ -963,29 +979,24 @@ def pool__gpus(*others, available=False):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-        )
-
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-
+    gpus = fetch_gpus(available=available)
+    if "error" in gpus:
+        console.log(f"[red]Error when fetching gpus: {gpus}")
+        return
+
+    columns = ["Node", "Ready", "GPU(s)", "Available", "Total"]
+    rows = []
+    for gpu in gpus:
+        rows.append([
+            gpu.node,
+            str(gpu.ready),
+            gpu.model,
+            str(gpu.available),
+            str(gpu.total)
+        ])
+    console.print(
+        generate_table(columns=columns, rows=rows,end_sections=[n for n in range(len(rows))])
+    )
 
 @arguably.command
 def pool__resources(*others):
@@ -998,45 +1009,33 @@ def pool__resources(*others):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-    try:
-        total = request_to_server(
-            method="get",
-            endpoint="/v1/get_cluster_total_resources",
-            data={},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        available = request_to_server(
-            method="get",
-            endpoint="/v1/get_cluster_available_resources",
-            data={},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        columns = []
-        total_values = []
-        available_values = []
-        for col in total.keys():
-            if col in RESOURCE_EXCLUDE:
-                continue
-            columns.append(col)
-            total_values.append(str(total[col]))
-            available_values.append(str(available[col]))
-
-        columns = [""] + columns
-        total_values = ["Total"] + total_values
-        available_values = ["Available"] + available_values
-
-        rows = [
-            tuple(available_values),
-            tuple(total_values)
-        ]
-        console.print(
-            generate_table(columns=columns, rows=rows, end_sections=[0, 1])
-        )
-
-    except Exception as e:
+    data = fetch_resources()
+    if "error" in data:
         console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+        return
+
+    columns = []
+    total_values = []
+    available_values = []
+    for col in data["total"].keys():
+        if col in RESOURCE_EXCLUDE:
+            continue
+        columns.append(col)
+        total_values.append(str(data["total"][col]))
+        available_values.append(str(data["available"][col]))
+
+    columns = [""] + columns
+    total_values = ["Total"] + total_values
+    available_values = ["Available"] + available_values
+
+    rows = [
+        tuple(available_values),
+        tuple(total_values)
+    ]
+    console.print(
+        generate_table(columns=columns, rows=rows, end_sections=[0, 1])
+    )
+
 
 @arguably.command
 def pool__update(*others):
@@ -1333,22 +1332,18 @@ def node__list(*others):
         return
 
     try:
-            method="get",
-            endpoint="/v1/get_nodes",
-            data={},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
+        devices = fetch_devices()
         rows = []
-        columns = ["Node name"]
+        columns = ["Node name", "Memory Pressure", "Disk pressure", "PID pressure", "Ready", "Unschedulable"]
+        for device in devices:
+            rows.append([
+                device.name,
+                str(device.memory_pressure),
+                str(device.disk_pressure),
+                str(device.pid_pressure),
+                str(device.ready),
+                str(device.unschedulable)
+            ])
 
     console.log("Nodes with 'unschedulable=True' will not receive workload")
     console.log("To make a node unschedulable (i.e. won't receive workloads) use [yellow]kalavai node cordon <node name>")
@@ -1540,7 +1535,9 @@ def job__test(local_template_dir, *others, values, defaults, force_namespace: st
         console.log(f"[red]--values ({values}) is not a valid local file")
         return
     with open(values, "r") as f:
+        raw_values = yaml.load(f, Loader=yaml.SafeLoader)
+        values_dict = {variable["name"]: variable['value'] for variable in raw_values}
+
     # load defaults
     if not os.path.isfile(defaults):
         console.log(f"[red]--defaults ({defaults}) is not a valid local file")
@@ -1648,7 +1645,7 @@ def job__estimate(billion_parameters, *others, precision=32):
 
     average_vram = 8
     required_memory = float(billion_parameters) * (precision / 8) / 1.2
-    available_gpus = fetch_gpus()
+    available_gpus = load_gpu_models()
     vrams = []
     for _, gpus in available_gpus:
         for model in gpus["gpus"]:
@@ -1705,7 +1702,7 @@ def job__status(name, *others):
         return
 
 @arguably.command
-def job__list(*others, detailed=False):
+def job__list(*others):
     """
     List jobs in the cluster
     """
@@ -1715,106 +1712,22 @@ def job__list(*others, detailed=False):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-        result = request_to_server(
-            method="post",
-            endpoint="/v1/get_objects_of_type",
-            data=data,
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        all_deployments = defaultdict(list)
-        for ns, ds in result.items():
-            all_deployments[ns].extend([d["metadata"]["labels"][TEMPLATE_LABEL] for d in ds["items"]])
-        #deployments = {ns: d["metadata"]["labels"][TEMPLATE_LABEL] for ns, ds in result.items() for d in ds["items"]}
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-        return
-    if len(all_deployments.keys()) == 0:
+    all_deployments = fetch_job_names()
+    if "error" in all_deployments:
+        console.log(f"[red]Error when connecting to kalavai service: {all_deployments}")
+        return
+
+    if len(all_deployments) == 0:
         console.log("[green]No deployments found.")
         return
 
+    details = fetch_job_details(jobs=all_deployments)
+    if "error" in details:
+        console.log(f"[red]{details}")
+        return
     columns = ["Owner", "Deployment", "Workers", "Endpoint"]
-    rows = []
-    for namespace, deployments in all_deployments.items():
-        for deployment in deployments:
-            try:
-                # get status for deployment
-                if detailed:
-                    data = {
-                        "group": "batch.volcano.sh",
-                        "api_version": "v1alpha1",
-                        "plural": "jobs",
-                        # "group": "leaderworkerset.x-k8s.io",
-                        # "api_version": "v1",
-                        # "plural": "leaderworkersets",
-                        "name": deployment
-                    }
-                    result = request_to_server(
-                        method="post",
-                        endpoint="/v1/get_status_for_object",
-                        data=data,
-                        server_creds=USER_LOCAL_SERVER_FILE,
-                        user_cookie=USER_COOKIE
-                    )
-                    ss = [] # flatten results ({namespace: statuses})
-                    [ss.extend(values) for values in result.values()]
-                    if len(ss) > 0:
-                        last = ss[-1]
-                        statuses = f"[{last['lastTransitionTime']}] {last['status']}"
-                    else:
-                        statuses = "Unknown"
-                # get pod statuses
-                data = {
-                    "label": TEMPLATE_LABEL,
-                    "value": deployment
-                }
-                result = request_to_server(
-                    method="post",
-                    endpoint="/v1/get_pods_status_for_label",
-                    data=data,
-                    server_creds=USER_LOCAL_SERVER_FILE,
-                    user_cookie=USER_COOKIE
-                )
-                workers_status = defaultdict(int)
-                for ns, ss in result.items():
-                    if ns != namespace: # same job name, different namespace
-                        continue
-                    for _, values in ss.items():
-                        workers_status[values["status"]] += 1
-                workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
-                # get URL details
-                data = {
-                    "label": TEMPLATE_LABEL,
-                    "value": deployment,
-                    "types": ["NodePort"]
-                }
-                result = request_to_server(
-                    method="post",
-                    endpoint="/v1/get_ports_for_services",
-                    data=data,
-                    server_creds=USER_LOCAL_SERVER_FILE,
-                    user_cookie=USER_COOKIE
-                )
-                node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
-
-                urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
-                row = [namespace, deployment, workers, "\n".join(urls)]
-                if detailed:
-                    row.append(statuses)
-                rows.append(row)
-
-            except Exception as e:
-                console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-                return
+    rows = [[job.owner, job.name, job.workers, job.endpoint] for job in details]
+
     console.print(
         generate_table(columns=columns, rows=rows, end_sections=range(len(rows)))
     )
@@ -1836,26 +1749,19 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
 
     if force_namespace is not None:
         console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
+
+    all_logs = fetch_job_logs(
+        job_name=name,
+        pod_name=pod_name,
+        force_namespace=force_namespace,
+        tail=tail)
+    if "error" in all_logs:
+        console.log(f"[red]{all_logs}")
+        return
     while True:
         try:
-            # send tail as parameter (fetch only last _tail_ lines)
-            result = request_to_server(
-                method="post",
-                endpoint="/v1/get_logs_for_label",
-                data=data,
-                server_creds=USER_LOCAL_SERVER_FILE,
-                user_cookie=USER_COOKIE
-            )
             if not stream:
-                for pod, logs in result.items():
+                for pod, logs in all_logs.items():
                     if pod_name is not None and pod_name != pod:
                         continue
                     console.log(f"[yellow]Pod {pod}")
@@ -1863,7 +1769,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
                     break
             else:
                 os.system("clear")
-                for pod, logs in result.items():
+                for pod, logs in all_logs.items():
                     if pod_name is not None and pod_name != pod:
                         continue
                     print(f"Pod {pod}")
@@ -1871,10 +1777,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
             time.sleep(1)
         except KeyboardInterrupt:
             break
-        except Exception as e:
-            console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-            console.log(f"Check if {name} is running with [yellow]kalavai job list")
-            return
+
 
 @arguably.command
 def job__manifest(*others, name, force_namespace: str=None):
kalavai_client/cluster.py
CHANGED
@@ -133,6 +133,9 @@ class dockerCluster(Cluster):
         if not os.path.isfile(self.compose_file):
             return False
         status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()
+        if not status:
+            return False
+        status = (0 == os.system(f'docker exec {self.container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'))
         return status
 
     def is_seed_node(self):
kalavai_client/core.py
ADDED
@@ -0,0 +1,227 @@
+from collections import defaultdict
+import math
+
+from pydantic import BaseModel
+
+from kalavai_client.utils import (
+    request_to_server,
+    load_server_info
+)
+from kalavai_client.env import (
+    USER_COOKIE,
+    USER_LOCAL_SERVER_FILE,
+    TEMPLATE_LABEL,
+    SERVER_IP_KEY
+)
+
+class Job(BaseModel):
+    owner: str = None
+    name: str = None
+    workers: str = None
+    endpoint: str = None
+
+class DeviceStatus(BaseModel):
+    name: str
+    memory_pressure: bool
+    disk_pressure: bool
+    pid_pressure: bool
+    ready: bool
+    unschedulable: bool
+
+class GPU(BaseModel):
+    node: str
+    available: int
+    total: int
+    ready: bool
+    model: str
+
+
+def fetch_resources():
+    try:
+        total = request_to_server(
+            method="get",
+            endpoint="/v1/get_cluster_total_resources",
+            data={},
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        available = request_to_server(
+            method="get",
+            endpoint="/v1/get_cluster_available_resources",
+            data={},
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+    except Exception as e:
+        return {"error": str(e)}
+
+    return {"total": total, "available": available}
+
+def fetch_job_names():
+    data = {
+        "group": "batch.volcano.sh",
+        "api_version": "v1alpha1",
+        "plural": "jobs"
+    }
+    try:
+        jobs = request_to_server(
+            method="post",
+            endpoint="/v1/get_objects_of_type",
+            data=data,
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        all_jobs = []
+        for ns, ds in jobs.items():
+            all_jobs.extend([Job(owner=ns, name=d["metadata"]["labels"][TEMPLATE_LABEL]) for d in ds["items"]])
+    except Exception as e:
+        return {"error": str(e)}
+
+    return all_jobs
+
+def fetch_job_details(jobs: list[Job]):
+    """Get job details. A job is a dict:
+    {
+        "namespace": ns,
+        "name": name
+    }
+    """
+    job_details = []
+    for job in jobs:
+        namespace = job.owner
+        deployment = job.name
+        try:
+            # get pod statuses
+            data = {
+                "label": TEMPLATE_LABEL,
+                "value": deployment
+            }
+            result = request_to_server(
+                method="post",
+                endpoint="/v1/get_pods_status_for_label",
+                data=data,
+                server_creds=USER_LOCAL_SERVER_FILE,
+                user_cookie=USER_COOKIE
+            )
+            workers_status = defaultdict(int)
+            for ns, ss in result.items():
+                if ns != namespace: # same job name, different namespace
+                    continue
+                for _, values in ss.items():
+                    workers_status[values["status"]] += 1
+            workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
+            # get URL details
+            data = {
+                "label": TEMPLATE_LABEL,
+                "value": deployment,
+                "types": ["NodePort"]
+            }
+            result = request_to_server(
+                method="post",
+                endpoint="/v1/get_ports_for_services",
+                data=data,
+                server_creds=USER_LOCAL_SERVER_FILE,
+                user_cookie=USER_COOKIE
+            )
+            node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
+
+            urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
+            job_details.append(
+                Job(owner=namespace,
+                    name=deployment,
+                    workers=workers,
+                    endpoint="\n".join(urls))
+            )
+
+        except Exception as e:
+            return {"error": str(e)}
+
+    return job_details
+
+def fetch_devices():
+    """Load devices status info for all hosts"""
+    try:
+        data = request_to_server(
+            method="get",
+            endpoint="/v1/get_nodes",
+            data={},
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        devices = []
+        for node, status in data.items():
+            devices.append(
+                DeviceStatus(
+                    name=node,
+                    memory_pressure=status["MemoryPressure"],
+                    disk_pressure=status["DiskPressure"],
+                    pid_pressure=status["PIDPressure"],
+                    ready=status["Ready"],
+                    unschedulable=status["unschedulable"]
+                )
+            )
+        return devices
+
+    except Exception as e:
+        return {"error": str(e)}
+
+def fetch_job_logs(job_name, force_namespace=None, pod_name=None, tail=100):
+    data = {
+        "label": TEMPLATE_LABEL,
+        "value": job_name,
+        "tail": tail
+    }
+    if force_namespace is not None:
+        data["force_namespace"] = force_namespace
+    try:
+        # send tail as parameter (fetch only last _tail_ lines)
+        all_logs = request_to_server(
+            method="post",
+            endpoint="/v1/get_logs_for_label",
+            data=data,
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        return {pod: logs for pod, logs in all_logs.items() if pod_name is None or pod_name == pod}
+
+    except Exception as e:
+        return {"error": str(e)}
+
+
+def load_gpu_models():
+    data = request_to_server(
+        method="post",
+        endpoint="/v1/get_node_gpus",
+        data={},
+        server_creds=USER_LOCAL_SERVER_FILE,
+        user_cookie=USER_COOKIE
+    )
+    return data.items()
+
+def fetch_gpus(available=False):
+    try:
+        data = load_gpu_models()
+        all_gpus = []
+        for node, gpus in data:
+            row_gpus = []
+            for gpu in gpus["gpus"]:
+                status = gpu["ready"] if "ready" in gpu else True
+                if available and not status:
+                    continue
+                row_gpus.append( (f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
+            if len(row_gpus) > 0:
+                models, statuses = zip(*row_gpus)
+                #rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
+                all_gpus.extend([
+                    GPU(
+                        node=node,
+                        ready=status,
+                        model=model,
+                        available=gpus["available"],
+                        total=gpus["capacity"]
+                    ) for model, status in zip(models, statuses)
+                ])
+        return all_gpus
+
+    except Exception as e:
+        return {"error": str(e)}
kalavai_client/env.py
ADDED
@@ -0,0 +1,19 @@
+import os
+from pathlib import Path
+
+
+def user_path(relative_path, create_path=False):
+    """Transform a relative path into the user's cache folder path"""
+    base = os.path.expanduser("~")
+    kalavai_user_path = os.path.join(base, ".cache/kalavai")
+    full_path = os.path.join(kalavai_user_path, relative_path)
+    if create_path:
+        Path(full_path).mkdir(parents=True, exist_ok=True)
+
+    return full_path
+
+
+USER_LOCAL_SERVER_FILE = user_path(".server")
+USER_COOKIE = user_path(".user_cookie.pkl")
+TEMPLATE_LABEL = "kalavai.job.name"
+SERVER_IP_KEY = "server_ip"
kalavai_client/utils.py
CHANGED
@@ -14,6 +14,9 @@ import yaml
 
 
 from kalavai_client.auth import KalavaiAuthClient
+from kalavai_client.env import (
+    SERVER_IP_KEY
+)
 
 
 GITHUB_ORG = "kalavai-net"
@@ -22,7 +25,6 @@ GITHUB_TEMPLATE_PATH = "templates"
 USER_NODE_LABEL_KEY = "user_node_label"
 CLUSTER_IP_KEY = "cluster_ip"
 CLUSTER_TOKEN_KEY = "cluster_token"
-SERVER_IP_KEY = "server_ip"
 NODE_NAME_KEY = "node_name"
 PUBLIC_LOCATION_KEY = "public_location"
 CLUSTER_NAME_KEY = "cluster_name"
@@ -397,16 +399,6 @@ def resource_path(relative_path: str):
         return None
     return resource
 
-def user_path(relative_path, create_path=False):
-    """Transform a relative path into the user's cache folder path"""
-    base = os.path.expanduser("~")
-    kalavai_user_path = os.path.join(base, ".cache/kalavai")
-    full_path = os.path.join(kalavai_user_path, relative_path)
-    if create_path:
-        Path(full_path).mkdir(parents=True, exist_ok=True)
-
-    return full_path
-
 def safe_remove(filepath, force=True):
     if not os.path.exists(filepath):
         return
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: kalavai-client
-Version: 0.5.12
+Version: 0.5.13
 Summary: Client app for kalavai platform
 License: Apache-2.0
 Keywords: LLM,platform
@@ -89,6 +89,9 @@ https://github.com/user-attachments/assets/0d2316f3-79ea-46ac-b41e-8ef720f52672
 
 ### News updates
 
+<img src="docs/docs/assets/images/DeepSeek-Emblem.png" width="100">
+
+- 6 February 2025: 🔥🔥🔥 Access **DeepSeek R1 model for free** when you join our [public LLM pool](https://kalavai-net.github.io/kalavai-client/public_llm_pool/)
 - 31 January 2025: `kalavai-client` is now a [PyPI package](https://pypi.org/project/kalavai-client/), easier to install than ever!
 - 27 January 2025: Support for accessing pools from remote computers
 - 9 January 2025: Added support for [Aphrodite Engine](https://github.com/aphrodite-engine/aphrodite-engine) models
@@ -140,7 +143,7 @@ From release **v0.5.0, you can now install `kalavai-client` in non-worker comput
 For workers sharing resources with the pool:
 
 - A laptop, desktop or Virtual Machine
-- Docker engine installed (for [linux](https://docs.docker.com/engine/install/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
+- Docker engine installed (for [linux](https://docs.docker.com/engine/install/ubuntu/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
 
 > **Support for Windows and MacOS workers is experimental**: kalavai workers run on docker containers that require access to the host network interfaces, thus systems that do not support containers natively (Windows and MacOS) may have difficulties finding each other.
 
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
-kalavai_client/__init__.py,sha256=
+kalavai_client/__init__.py,sha256=OKM-UDxm0absUf9IgE89lC_PpDG9RbBD4It-hbz8ORM,23
 kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
 kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kalavai_client/assets/apps.yaml,sha256=
+kalavai_client/assets/apps.yaml,sha256=yC-vtYTPE960KUQihTk5pee8xZz9RD8Reuyh1nSpRWk,5981
 kalavai_client/assets/apps_values.yaml,sha256=CjKVelPQHd-hm-DTMEuya92feKiphU9mh3HrosLYYPE,1676
 kalavai_client/assets/docker-compose-template.yaml,sha256=mo8LUam9-AzB_0w72wTyMyreKr4Ns-pxZGc4GVWcUHA,2747
 kalavai_client/assets/nginx.conf,sha256=drVVCg8GHucz7hmt_BI6giAhK92OV71257NTs3LthwM,225
@@ -10,11 +10,13 @@ kalavai_client/assets/pool_config_values.yaml,sha256=VrM3XHQfQo6QLZ68qvagooUptaY
 kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
 kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
 kalavai_client/auth.py,sha256=QsBh28L2LwjBBK6pTUE4Xu36lLDTyetyU1YfS1Hbb6g,1717
-kalavai_client/cli.py,sha256=
-kalavai_client/cluster.py,sha256=
-kalavai_client/
-kalavai_client
-kalavai_client
-kalavai_client-0.5.
-kalavai_client-0.5.
-kalavai_client-0.5.
+kalavai_client/cli.py,sha256=_oUZAYV397_-BQAHsTcyK0pkyK5iusdyYrJU5z6lecM,66312
+kalavai_client/cluster.py,sha256=z9HCD6ZUemjabcDenszQhqB_IUiVi_vpFbfAkKwHiEU,12292
+kalavai_client/core.py,sha256=Vb-5MBHjpuR590FIDOnytJpP1Xjt7hYqehPV2rh6P68,6863
+kalavai_client/env.py,sha256=RAi37vJtIGfPR25PNxZYMZNkkEKR4AyUPN_htFiFesM,575
+kalavai_client/utils.py,sha256=kQk_1QOs8u08rcfhkcfo_oC-cZzww0cij-1R_jK1ER8,12185
+kalavai_client-0.5.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+kalavai_client-0.5.13.dist-info/METADATA,sha256=fQus2g5Q39Wu_HglzdiOGnalva3GcSG48o5iyyikbt4,14347
+kalavai_client-0.5.13.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+kalavai_client-0.5.13.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
+kalavai_client-0.5.13.dist-info/RECORD,,
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/LICENSE
File without changes
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/WHEEL
File without changes
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.13.dist-info}/entry_points.txt
File without changes