kalavai-client 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl
- kalavai_client/__init__.py +1 -1
- kalavai_client/assets/apps.yaml +1 -1
- kalavai_client/cli.py +119 -217
- kalavai_client/cluster.py +5 -1
- kalavai_client/core.py +227 -0
- kalavai_client/env.py +19 -0
- kalavai_client/utils.py +3 -11
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/METADATA +5 -2
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/RECORD +12 -10
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/LICENSE +0 -0
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/WHEEL +0 -0
- {kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/entry_points.txt +0 -0
kalavai_client/__init__.py
CHANGED
@@ -1,2 +1,2 @@
 
-__version__ = "0.5.12"
+__version__ = "0.5.14"
kalavai_client/assets/apps.yaml
CHANGED
kalavai_client/cli.py
CHANGED
@@ -15,10 +15,24 @@ import netifaces as ni
 import arguably
 from rich.console import Console
 
+from kalavai_client.env import (
+    USER_COOKIE,
+    USER_LOCAL_SERVER_FILE,
+    TEMPLATE_LABEL,
+    user_path
+)
+from kalavai_client.core import (
+    fetch_resources,
+    fetch_job_names,
+    fetch_job_details,
+    fetch_devices,
+    fetch_job_logs,
+    fetch_gpus,
+    load_gpu_models
+)
 from kalavai_client.utils import (
     check_gpu_drivers,
     run_cmd,
-    user_path,
     decode_dict,
     generate_join_token,
     user_confirm,
@@ -27,10 +41,8 @@ from kalavai_client.utils import (
     generate_table,
     request_to_server,
     resource_path,
-    user_path,
     safe_remove,
     leave_vpn,
-    join_vpn,
     get_vpn_details,
     load_server_info,
     user_login,
@@ -68,7 +80,6 @@ LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
 VERSION = 1
 RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
 CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
-TEMPLATE_LABEL = "kalavai.job.name"
 RAY_LABEL = "kalavai.ray.name"
 PVC_NAME_LABEL = "kalavai.storage.name"
 DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
@@ -98,9 +109,7 @@ USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
 USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
 USER_HELM_APPS_FILE = user_path("apps.yaml")
 USER_KUBECONFIG_FILE = user_path("kubeconfig")
-USER_LOCAL_SERVER_FILE = user_path(".server")
 USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
-USER_COOKIE = user_path(".user_cookie.pkl")
 
 
 console = Console()
@@ -118,7 +127,7 @@ CLUSTER = dockerCluster(
 ######################
 ## HELPER FUNCTIONS ##
 ######################
-
+
 def check_seed_compatibility():
     """Check required packages to start pools"""
     logs = []
@@ -288,21 +297,11 @@ def select_ip_address(subnet=None):
         console.log("[red] Input error")
     return ips[option]
 
-def fetch_gpus():
-    data = request_to_server(
-        method="post",
-        endpoint="/v1/get_node_gpus",
-        data={},
-        server_creds=USER_LOCAL_SERVER_FILE,
-        user_cookie=USER_COOKIE
-    )
-    return data.items()
-
 def select_gpus(message):
     console.log(f"[yellow]{message}")
     gpu_models = ["Any/None"]
     gpu_models_full = ["Any/None"]
-    available_gpus = fetch_gpus()
+    available_gpus = load_gpu_models()
     for _, gpus in available_gpus:
         for gpu in gpus["gpus"]:
             #status = "free" if "ready" in gpu else "busy"
@@ -737,7 +736,7 @@ def pool__check_token(token, *others, public=False):
 
 
 @arguably.command
-def pool__join(token, *others, node_name=None, ip_address: str=None):
+def pool__join(token, *others, node_name=None):
     """
     Join Kalavai pool and start/resume sharing resources.
 
@@ -860,7 +859,7 @@ def pool__join(token, *others, node_name=None, ip_address: str=None):
         CLUSTER.start_worker_node()
     except Exception as e:
         console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
-
+        pool__stop()
         exit()
 
     # ensure we are connected
@@ -869,6 +868,22 @@ def pool__join(token, *others, node_name=None, ip_address: str=None):
         time.sleep(30)
         if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
             break
+
+    # send note to server to let them know the node is coming online
+    if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
+        console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with [yellow]--node-name'")
+        pool__stop()
+        return
+
+    # check the node has connected successfully
+    try:
+        while not CLUSTER.is_agent_running():
+            console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
+            time.sleep(30)
+    except KeyboardInterrupt:
+        console.log("[red]Installation aborted. Leaving pool.")
+        pool__stop()
+        return
 
     init_user_workspace()
 
@@ -963,29 +978,24 @@ def pool__gpus(*others, available=False):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-    try:
-        data = fetch_gpus()
-        columns = ["Node", "Ready", "GPU(s)", "Available", "Total"]
-        rows = []
-        for node, gpus in data:
-            row_gpus = []
-            for gpu in gpus["gpus"]:
-                #status = "free" if "ready" in gpu else "busy"
-                status = gpu["ready"] if "ready" in gpu else True
-                if available and not status:
-                    continue
-                row_gpus.append( (f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
-            if len(row_gpus) > 0:
-                models, statuses = zip(*row_gpus)
-                rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
-
-        console.print(
-            generate_table(columns=columns, rows=rows, end_sections=[n for n in range(len(rows))])
-        )
-
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-
+    gpus = fetch_gpus(available=available)
+    if "error" in gpus:
+        console.log(f"[red]Error when fetching gpus: {gpus}")
+        return
+
+    columns = ["Node", "Ready", "GPU(s)", "Available", "Total"]
+    rows = []
+    for gpu in gpus:
+        rows.append([
+            gpu.node,
+            str(gpu.ready),
+            gpu.model,
+            str(gpu.available),
+            str(gpu.total)
+        ])
+    console.print(
+        generate_table(columns=columns, rows=rows,end_sections=[n for n in range(len(rows))])
+    )
 
 @arguably.command
 def pool__resources(*others):
@@ -998,45 +1008,33 @@ def pool__resources(*others):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-    try:
-        total = request_to_server(
-            method="get",
-            endpoint="/v1/get_cluster_total_resources",
-            data={},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        available = request_to_server(
-            method="get",
-            endpoint="/v1/get_cluster_available_resources",
-            data={},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        columns = []
-        total_values = []
-        available_values = []
-        for col in total.keys():
-            if col in RESOURCE_EXCLUDE:
-                continue
-            columns.append(col)
-            total_values.append(str(total[col]))
-            available_values.append(str(available[col]))
-
-        columns = [""] + columns
-        total_values = ["Total"] + total_values
-        available_values = ["Available"] + available_values
-
-        rows = [
-            tuple(available_values),
-            tuple(total_values)
-        ]
-        console.print(
-            generate_table(columns=columns, rows=rows, end_sections=[0, 1])
-        )
-
-    except Exception as e:
+    data = fetch_resources()
+    if "error" in data:
         console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+        return
+
+    columns = []
+    total_values = []
+    available_values = []
+    for col in data["total"].keys():
+        if col in RESOURCE_EXCLUDE:
+            continue
+        columns.append(col)
+        total_values.append(str(data["total"][col]))
+        available_values.append(str(data["available"][col]))
+
+    columns = [""] + columns
+    total_values = ["Total"] + total_values
+    available_values = ["Available"] + available_values
+
+    rows = [
+        tuple(available_values),
+        tuple(total_values)
+    ]
+    console.print(
+        generate_table(columns=columns, rows=rows, end_sections=[0, 1])
+    )
+
 
 @arguably.command
 def pool__update(*others):
@@ -1333,22 +1331,18 @@ def node__list(*others):
         return
 
     try:
-        data = request_to_server(
-            method="get",
-            endpoint="/v1/get_nodes",
-            data={},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
+        devices = fetch_devices()
         rows = []
-        columns = ["Node name"]
-        for node, status in data.items():
-            row = [node]
-            for key, value in status.items():
-                if key not in columns:
-                    columns.append(key)
-                row.append(str(value))
-            rows.append(row)
+        columns = ["Node name", "Memory Pressure", "Disk pressure", "PID pressure", "Ready", "Unschedulable"]
+        for device in devices:
+            rows.append([
+                device.name,
+                str(device.memory_pressure),
+                str(device.disk_pressure),
+                str(device.pid_pressure),
+                str(device.ready),
+                str(device.unschedulable)
+            ])
 
         console.log("Nodes with 'unschedulable=True' will not receive workload")
         console.log("To make a node unschedulable (i.e. won't receive workloads) use [yellow]kalavai node cordon <node name>")
@@ -1540,7 +1534,9 @@ def job__test(local_template_dir, *others, values, defaults, force_namespace: st
         console.log(f"[red]--values ({values}) is not a valid local file")
         return
     with open(values, "r") as f:
-        values_dict = yaml.load(f, Loader=yaml.SafeLoader)
+        raw_values = yaml.load(f, Loader=yaml.SafeLoader)
+        values_dict = {variable["name"]: variable['value'] for variable in raw_values}
+
     # load defaults
     if not os.path.isfile(defaults):
         console.log(f"[red]--defaults ({defaults}) is not a valid local file")
@@ -1648,7 +1644,7 @@ def job__estimate(billion_parameters, *others, precision=32):
 
     average_vram = 8
     required_memory = float(billion_parameters) * (precision / 8) / 1.2
-    available_gpus = fetch_gpus()
+    available_gpus = load_gpu_models()
     vrams = []
     for _, gpus in available_gpus:
         for model in gpus["gpus"]:
@@ -1705,7 +1701,7 @@ def job__status(name, *others):
     return
 
 @arguably.command
-def job__list(*others, detailed=False):
+def job__list(*others):
     """
     List jobs in the cluster
     """
@@ -1715,106 +1711,22 @@ def job__list(*others, detailed=False):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-    try:
-        data = {
-            "group": "batch.volcano.sh",
-            "api_version": "v1alpha1",
-            "plural": "jobs"
-        }
-        result = request_to_server(
-            method="post",
-            endpoint="/v1/get_objects_of_type",
-            data=data,
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        all_deployments = defaultdict(list)
-        for ns, ds in result.items():
-            all_deployments[ns].extend([d["metadata"]["labels"][TEMPLATE_LABEL] for d in ds["items"]])
-        #deployments = {ns: d["metadata"]["labels"][TEMPLATE_LABEL] for ns, ds in result.items() for d in ds["items"]}
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-        return
-    if len(all_deployments.keys()) == 0:
+    all_deployments = fetch_job_names()
+    if "error" in all_deployments:
+        console.log(f"[red]Error when connecting to kalavai service: {all_deployments}")
+        return
+
+    if len(all_deployments) == 0:
         console.log("[green]No deployments found.")
         return
 
+    details = fetch_job_details(jobs=all_deployments)
+    if "error" in details:
+        console.log(f"[red]{details}")
+        return
     columns = ["Owner", "Deployment", "Workers", "Endpoint"]
-    if detailed:
-        columns.append("Status")
-    rows = []
-    for namespace, deployments in all_deployments.items():
-        for deployment in deployments:
-            try:
-                # get status for deployment
-                if detailed:
-                    data = {
-                        "group": "batch.volcano.sh",
-                        "api_version": "v1alpha1",
-                        "plural": "jobs",
-                        # "group": "leaderworkerset.x-k8s.io",
-                        # "api_version": "v1",
-                        # "plural": "leaderworkersets",
-                        "name": deployment
-                    }
-                    result = request_to_server(
-                        method="post",
-                        endpoint="/v1/get_status_for_object",
-                        data=data,
-                        server_creds=USER_LOCAL_SERVER_FILE,
-                        user_cookie=USER_COOKIE
-                    )
-                    ss = [] # flatten results ({namespace: statuses})
-                    [ss.extend(values) for values in result.values()]
-                    if len(ss) > 0:
-                        last = ss[-1]
-                        statuses = f"[{last['lastTransitionTime']}] {last['status']}"
-                    else:
-                        statuses = "Unknown"
-                # get pod statuses
-                data = {
-                    "label": TEMPLATE_LABEL,
-                    "value": deployment
-                }
-                result = request_to_server(
-                    method="post",
-                    endpoint="/v1/get_pods_status_for_label",
-                    data=data,
-                    server_creds=USER_LOCAL_SERVER_FILE,
-                    user_cookie=USER_COOKIE
-                )
-                workers_status = defaultdict(int)
-                for ns, ss in result.items():
-                    if ns != namespace: # same job name, different namespace
-                        continue
-                    for _, values in ss.items():
-                        workers_status[values["status"]] += 1
-                workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
-                # get URL details
-                data = {
-                    "label": TEMPLATE_LABEL,
-                    "value": deployment,
-                    "types": ["NodePort"]
-                }
-                result = request_to_server(
-                    method="post",
-                    endpoint="/v1/get_ports_for_services",
-                    data=data,
-                    server_creds=USER_LOCAL_SERVER_FILE,
-                    user_cookie=USER_COOKIE
-                )
-                node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
-
-                urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
-                row = [namespace, deployment, workers, "\n".join(urls)]
-                if detailed:
-                    row.append(statuses)
-                rows.append(row)
-
-            except Exception as e:
-                console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-                return
-
+    rows = [[job.owner, job.name, job.workers, job.endpoint] for job in details]
+
     console.print(
         generate_table(columns=columns, rows=rows, end_sections=range(len(rows)))
     )
@@ -1836,26 +1748,19 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
 
     if force_namespace is not None:
         console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
-    data = {
-        "label": TEMPLATE_LABEL,
-        "value": name,
-        "tail": tail
-    }
-
-    if force_namespace is not None:
-        data["force_namespace"] = force_namespace
+
+    all_logs = fetch_job_logs(
+        job_name=name,
+        pod_name=pod_name,
+        force_namespace=force_namespace,
+        tail=tail)
+    if "error" in all_logs:
+        console.log(f"[red]{all_logs}")
+        return
     while True:
         try:
-            # send tail as parameter (fetch only last _tail_ lines)
-            result = request_to_server(
-                method="post",
-                endpoint="/v1/get_logs_for_label",
-                data=data,
-                server_creds=USER_LOCAL_SERVER_FILE,
-                user_cookie=USER_COOKIE
-            )
             if not stream:
-                for pod, logs in result.items():
+                for pod, logs in all_logs.items():
                     if pod_name is not None and pod_name != pod:
                         continue
                     console.log(f"[yellow]Pod {pod}")
@@ -1863,7 +1768,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
                 break
             else:
                 os.system("clear")
-                for pod, logs in result.items():
+                for pod, logs in all_logs.items():
                     if pod_name is not None and pod_name != pod:
                         continue
                     print(f"Pod {pod}")
@@ -1871,10 +1776,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
             time.sleep(1)
         except KeyboardInterrupt:
             break
-        except Exception as e:
-            console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-            console.log(f"Check if {name} is running with [yellow]kalavai job list")
-            return
+
 
 @arguably.command
 def job__manifest(*others, name, force_namespace: str=None):
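A pattern worth noting across these `cli.py` changes: the server-facing logic moves into `kalavai_client.core`, and the new core functions return a plain `{"error": ...}` dict on failure instead of raising, so each CLI command checks for the error key before rendering. A minimal sketch of the convention (the function names here are illustrative, not part of the package):

```python
def fetch_something():
    """Core-style helper: catch everything, surface errors as data."""
    try:
        result = {"value": 42}  # stand-in for a request_to_server() call
        return result
    except Exception as e:
        return {"error": str(e)}

def cli_command():
    """CLI-style caller: check for the error key, then render."""
    data = fetch_something()
    if "error" in data:
        print(f"Error when connecting to kalavai service: {data}")
        return
    print(f"OK: {data['value']}")

if __name__ == "__main__":
    cli_command()
```

The `in` check works on both shapes the core functions return: key lookup when the result is the error dict, and a (false) membership test when it is a list of pydantic models, as with `fetch_gpus` or `fetch_job_names`.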
kalavai_client/cluster.py
CHANGED
@@ -7,10 +7,11 @@ from kalavai_client.utils import (
     run_cmd,
     check_gpu_drivers,
     validate_poolconfig,
-    user_path,
     populate_template
 )
+from kalavai_client.env import user_path
+
 
 class Cluster(ABC):
     @abstractmethod
@@ -133,6 +134,9 @@ class dockerCluster(Cluster):
         if not os.path.isfile(self.compose_file):
             return False
         status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()
+        if not status:
+            return False
+        status = (0 == os.system(f'docker exec {self.container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'))
         return status
 
     def is_seed_node(self):
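The `is_agent_running` change turns a single compose-level check into a two-stage probe: the worker container must be listed as running *and* a `k3s server` or `k3s agent` process must be alive inside it, which catches containers that are up while their k3s runner has died. A standalone sketch of the same probe (file and container names are illustrative):

```python
import os
import subprocess

def agent_running(compose_file: str, container_name: str) -> bool:
    # Stage 1: is the compose service reported as running?
    services = subprocess.run(
        ["docker", "compose", "-f", compose_file, "ps", "--services", "--status=running"],
        capture_output=True, text=True, check=True
    ).stdout
    if container_name not in services:
        return False
    # Stage 2: is a k3s server/agent process alive inside the container?
    probe = f'docker exec {container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'
    return os.system(probe) == 0  # grep exits 0 only on a match
```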
kalavai_client/core.py
ADDED
@@ -0,0 +1,227 @@
+from collections import defaultdict
+import math
+
+from pydantic import BaseModel
+
+from kalavai_client.utils import (
+    request_to_server,
+    load_server_info
+)
+from kalavai_client.env import (
+    USER_COOKIE,
+    USER_LOCAL_SERVER_FILE,
+    TEMPLATE_LABEL,
+    SERVER_IP_KEY
+)
+
+class Job(BaseModel):
+    owner: str = None
+    name: str = None
+    workers: str = None
+    endpoint: str = None
+
+class DeviceStatus(BaseModel):
+    name: str
+    memory_pressure: bool
+    disk_pressure: bool
+    pid_pressure: bool
+    ready: bool
+    unschedulable: bool
+
+class GPU(BaseModel):
+    node: str
+    available: int
+    total: int
+    ready: bool
+    model: str
+
+
+def fetch_resources():
+    try:
+        total = request_to_server(
+            method="get",
+            endpoint="/v1/get_cluster_total_resources",
+            data={},
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        available = request_to_server(
+            method="get",
+            endpoint="/v1/get_cluster_available_resources",
+            data={},
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+    except Exception as e:
+        return {"error": str(e)}
+
+    return {"total": total, "available": available}
+
+def fetch_job_names():
+    data = {
+        "group": "batch.volcano.sh",
+        "api_version": "v1alpha1",
+        "plural": "jobs"
+    }
+    try:
+        jobs = request_to_server(
+            method="post",
+            endpoint="/v1/get_objects_of_type",
+            data=data,
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        all_jobs = []
+        for ns, ds in jobs.items():
+            all_jobs.extend([Job(owner=ns, name=d["metadata"]["labels"][TEMPLATE_LABEL]) for d in ds["items"]])
+    except Exception as e:
+        return {"error": str(e)}
+
+    return all_jobs
+
+def fetch_job_details(jobs: list[Job]):
+    """Get job details. A job is a dict:
+    {
+        "namespace": ns,
+        "name": name
+    }
+    """
+    job_details = []
+    for job in jobs:
+        namespace = job.owner
+        deployment = job.name
+        try:
+            # get pod statuses
+            data = {
+                "label": TEMPLATE_LABEL,
+                "value": deployment
+            }
+            result = request_to_server(
+                method="post",
+                endpoint="/v1/get_pods_status_for_label",
+                data=data,
+                server_creds=USER_LOCAL_SERVER_FILE,
+                user_cookie=USER_COOKIE
+            )
+            workers_status = defaultdict(int)
+            for ns, ss in result.items():
+                if ns != namespace: # same job name, different namespace
+                    continue
+                for _, values in ss.items():
+                    workers_status[values["status"]] += 1
+            workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
+            # get URL details
+            data = {
+                "label": TEMPLATE_LABEL,
+                "value": deployment,
+                "types": ["NodePort"]
+            }
+            result = request_to_server(
+                method="post",
+                endpoint="/v1/get_ports_for_services",
+                data=data,
+                server_creds=USER_LOCAL_SERVER_FILE,
+                user_cookie=USER_COOKIE
+            )
+            node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
+
+            urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
+            job_details.append(
+                Job(owner=namespace,
+                    name=deployment,
+                    workers=workers,
+                    endpoint="\n".join(urls))
+            )
+
+        except Exception as e:
+            return {"error": str(e)}
+
+    return job_details
+
+def fetch_devices():
+    """Load devices status info for all hosts"""
+    try:
+        data = request_to_server(
+            method="get",
+            endpoint="/v1/get_nodes",
+            data={},
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        devices = []
+        for node, status in data.items():
+            devices.append(
+                DeviceStatus(
+                    name=node,
+                    memory_pressure=status["MemoryPressure"],
+                    disk_pressure=status["DiskPressure"],
+                    pid_pressure=status["PIDPressure"],
+                    ready=status["Ready"],
+                    unschedulable=status["unschedulable"]
+                )
+            )
+        return devices
+
+    except Exception as e:
+        return {"error": str(e)}
+
+def fetch_job_logs(job_name, force_namespace=None, pod_name=None, tail=100):
+    data = {
+        "label": TEMPLATE_LABEL,
+        "value": job_name,
+        "tail": tail
+    }
+    if force_namespace is not None:
+        data["force_namespace"] = force_namespace
+    try:
+        # send tail as parameter (fetch only last _tail_ lines)
+        all_logs = request_to_server(
+            method="post",
+            endpoint="/v1/get_logs_for_label",
+            data=data,
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        return {pod: logs for pod, logs in all_logs.items() if pod_name is None or pod_name == pod}
+
+    except Exception as e:
+        return {"error": str(e)}
+
+
+def load_gpu_models():
+    data = request_to_server(
+        method="post",
+        endpoint="/v1/get_node_gpus",
+        data={},
+        server_creds=USER_LOCAL_SERVER_FILE,
+        user_cookie=USER_COOKIE
+    )
+    return data.items()
+
+def fetch_gpus(available=False):
+    try:
+        data = load_gpu_models()
+        all_gpus = []
+        for node, gpus in data:
+            row_gpus = []
+            for gpu in gpus["gpus"]:
+                status = gpu["ready"] if "ready" in gpu else True
+                if available and not status:
+                    continue
+                row_gpus.append( (f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
+            if len(row_gpus) > 0:
+                models, statuses = zip(*row_gpus)
+                #rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
+                all_gpus.extend([
+                    GPU(
+                        node=node,
+                        ready=status,
+                        model=model,
+                        available=gpus["available"],
+                        total=gpus["capacity"]
+                    ) for model, status in zip(models, statuses)
+                ])
+        return all_gpus
+
+    except Exception as e:
+        return {"error": str(e)}
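The new `core.py` gives the client a programmatic surface that the CLI (and, presumably, other frontends) can share. Assuming a pool has already been joined, so the `.server` credentials and user cookie exist under `~/.cache/kalavai`, a hypothetical session against the new module:

```python
from kalavai_client.core import fetch_devices, fetch_gpus, fetch_resources

resources = fetch_resources()
if "error" in resources:
    raise SystemExit(f"cannot reach the watcher service: {resources['error']}")
print("total:", resources["total"])
print("available:", resources["available"])

# results come back as pydantic models, so fields are attribute access
gpus = fetch_gpus(available=True)
if "error" not in gpus:
    for gpu in gpus:
        print(f"{gpu.node}: {gpu.model} ({gpu.available}/{gpu.total} free)")

devices = fetch_devices()
if "error" not in devices:
    for device in devices:
        print(f"{device.name}: ready={device.ready}, unschedulable={device.unschedulable}")
```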
kalavai_client/env.py
ADDED
@@ -0,0 +1,19 @@
+import os
+from pathlib import Path
+
+
+def user_path(relative_path, create_path=False):
+    """Transform a relative path into the user's cache folder path"""
+    base = os.path.expanduser("~")
+    kalavai_user_path = os.path.join(base, ".cache/kalavai")
+    full_path = os.path.join(kalavai_user_path, relative_path)
+    if create_path:
+        Path(full_path).mkdir(parents=True, exist_ok=True)
+
+    return full_path
+
+
+USER_LOCAL_SERVER_FILE = user_path(".server")
+USER_COOKIE = user_path(".user_cookie.pkl")
+TEMPLATE_LABEL = "kalavai.job.name"
+SERVER_IP_KEY = "server_ip"
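Extracting `user_path` and the user-level constants into a dependency-free `env.py` untangles the `cli.py`/`cluster.py`/`utils.py` imports: every module can now resolve state under `~/.cache/kalavai` without importing `utils`. For example:

```python
from kalavai_client.env import USER_COOKIE, user_path

# resolves to ~/.cache/kalavai/kubeconfig without creating anything
print(user_path("kubeconfig"))

# resolves to ~/.cache/kalavai/templates and creates the directory
print(user_path("templates", create_path=True))

# module-level constants are pre-resolved paths
print(USER_COOKIE)  # ~/.cache/kalavai/.user_cookie.pkl
```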
kalavai_client/utils.py
CHANGED
@@ -14,6 +14,9 @@ import yaml
 
 
 from kalavai_client.auth import KalavaiAuthClient
+from kalavai_client.env import (
+    SERVER_IP_KEY
+)
 
 
 GITHUB_ORG = "kalavai-net"
@@ -22,7 +25,6 @@ GITHUB_TEMPLATE_PATH = "templates"
 USER_NODE_LABEL_KEY = "user_node_label"
 CLUSTER_IP_KEY = "cluster_ip"
 CLUSTER_TOKEN_KEY = "cluster_token"
-SERVER_IP_KEY = "server_ip"
 NODE_NAME_KEY = "node_name"
 PUBLIC_LOCATION_KEY = "public_location"
 CLUSTER_NAME_KEY = "cluster_name"
@@ -397,16 +399,6 @@ def resource_path(relative_path: str):
         return None
     return resource
 
-def user_path(relative_path, create_path=False):
-    """Transform a relative path into the user's cache folder path"""
-    base = os.path.expanduser("~")
-    kalavai_user_path = os.path.join(base, ".cache/kalavai")
-    full_path = os.path.join(kalavai_user_path, relative_path)
-    if create_path:
-        Path(full_path).mkdir(parents=True, exist_ok=True)
-
-    return full_path
-
 def safe_remove(filepath, force=True):
     if not os.path.exists(filepath):
         return
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: kalavai-client
-Version: 0.5.12
+Version: 0.5.14
 Summary: Client app for kalavai platform
 License: Apache-2.0
 Keywords: LLM,platform
@@ -89,6 +89,9 @@ https://github.com/user-attachments/assets/0d2316f3-79ea-46ac-b41e-8ef720f52672
 
 ### News updates
 
+<img src="docs/docs/assets/images/DeepSeek-Emblem.png" width="100">
+
+- 6 February 2025: 🔥🔥🔥 Access **DeepSeek R1 model for free** when you join our [public LLM pool](https://kalavai-net.github.io/kalavai-client/public_llm_pool/)
 - 31 January 2025: `kalavai-client` is now a [PyPI package](https://pypi.org/project/kalavai-client/), easier to install than ever!
 - 27 January 2025: Support for accessing pools from remote computers
 - 9 January 2025: Added support for [Aphrodite Engine](https://github.com/aphrodite-engine/aphrodite-engine) models
@@ -140,7 +143,7 @@ From release **v0.5.0, you can now install `kalavai-client` in non-worker compu
 For workers sharing resources with the pool:
 
 - A laptop, desktop or Virtual Machine
-- Docker engine installed (for [linux](https://docs.docker.com/engine/install/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
+- Docker engine installed (for [linux](https://docs.docker.com/engine/install/ubuntu/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
 
 > **Support for Windows and MacOS workers is experimental**: kalavai workers run on docker containers that require access to the host network interfaces, thus systems that do not support containers natively (Windows and MacOS) may have difficulties finding each other.
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
-kalavai_client/__init__.py,sha256=
+kalavai_client/__init__.py,sha256=w3lW-XLGLyRg9TivuSzsLBUjs3nPX74ewdf47ZPX-as,23
 kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
 kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kalavai_client/assets/apps.yaml,sha256=
+kalavai_client/assets/apps.yaml,sha256=yC-vtYTPE960KUQihTk5pee8xZz9RD8Reuyh1nSpRWk,5981
 kalavai_client/assets/apps_values.yaml,sha256=CjKVelPQHd-hm-DTMEuya92feKiphU9mh3HrosLYYPE,1676
 kalavai_client/assets/docker-compose-template.yaml,sha256=mo8LUam9-AzB_0w72wTyMyreKr4Ns-pxZGc4GVWcUHA,2747
 kalavai_client/assets/nginx.conf,sha256=drVVCg8GHucz7hmt_BI6giAhK92OV71257NTs3LthwM,225
@@ -10,11 +10,13 @@ kalavai_client/assets/pool_config_values.yaml,sha256=VrM3XHQfQo6QLZ68qvagooUptaY
 kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
 kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
 kalavai_client/auth.py,sha256=QsBh28L2LwjBBK6pTUE4Xu36lLDTyetyU1YfS1Hbb6g,1717
-kalavai_client/cli.py,sha256=
-kalavai_client/cluster.py,sha256=
-kalavai_client/utils.py,sha256=
-kalavai_client-0.5.12.dist-info/LICENSE,sha256=
-kalavai_client-0.5.12.dist-info/METADATA,sha256=
-kalavai_client-0.5.12.dist-info/WHEEL,sha256=
-kalavai_client-0.5.12.dist-info/entry_points.txt,sha256=
-kalavai_client-0.5.12.dist-info/RECORD,,
+kalavai_client/cli.py,sha256=tozKzLsDWFhztC5lcorryBcdhzSYyqb5XGWQsUtQ45A,66297
+kalavai_client/cluster.py,sha256=odzfJFrkKNvZpFFiTA4pg-CeRdPnIe2UyIpSItCjK6A,12319
+kalavai_client/core.py,sha256=Vb-5MBHjpuR590FIDOnytJpP1Xjt7hYqehPV2rh6P68,6863
+kalavai_client/env.py,sha256=RAi37vJtIGfPR25PNxZYMZNkkEKR4AyUPN_htFiFesM,575
+kalavai_client/utils.py,sha256=kQk_1QOs8u08rcfhkcfo_oC-cZzww0cij-1R_jK1ER8,12185
+kalavai_client-0.5.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+kalavai_client-0.5.14.dist-info/METADATA,sha256=au17zuHNT9eVKuhxpsp_Gqmx6YYksrkSRhnzEqAkZmk,14347
+kalavai_client-0.5.14.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+kalavai_client-0.5.14.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
+kalavai_client-0.5.14.dist-info/RECORD,,
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/LICENSE
File without changes
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/WHEEL
File without changes
{kalavai_client-0.5.12.dist-info → kalavai_client-0.5.14.dist-info}/entry_points.txt
File without changes