kalavai_client-0.5.10-py3-none-any.whl → kalavai_client-0.5.13-py3-none-any.whl

kalavai_client/__init__.py CHANGED
@@ -1,2 +1,2 @@
 
- __version__ = "0.5.10"
+ __version__ = "0.5.13"
kalavai_client/assets/apps.yaml CHANGED
@@ -139,7 +139,7 @@ releases:
  - name: replicas
  value: 2
  - name: image_tag
- value: "v2025.01"
+ value: "v2025.01.9"
  - name: deployment.in_cluster
  value: "True"
  - name: deployment.use_auth_key
kalavai_client/assets/docker-compose-template.yaml CHANGED
@@ -1,8 +1,9 @@
  services:
  {% if vpn %}
  {{vpn_name}}:
- image: gravitl/netclient:v0.30.0
+ image: gravitl/netclient:v0.24.3
  container_name: {{vpn_name}}
+ #privileged: true
  cap_add:
  - NET_ADMIN
  - SYS_MODULE
kalavai_client/cli.py CHANGED
@@ -15,6 +15,21 @@ import netifaces as ni
  import arguably
  from rich.console import Console
 
+ from kalavai_client.env import (
+ USER_COOKIE,
+ USER_LOCAL_SERVER_FILE,
+ TEMPLATE_LABEL,
+ user_path
+ )
+ from kalavai_client.core import (
+ fetch_resources,
+ fetch_job_names,
+ fetch_job_details,
+ fetch_devices,
+ fetch_job_logs,
+ fetch_gpus,
+ load_gpu_models
+ )
  from kalavai_client.utils import (
  check_gpu_drivers,
  run_cmd,
@@ -27,10 +42,8 @@ from kalavai_client.utils import (
  generate_table,
  request_to_server,
  resource_path,
- user_path,
  safe_remove,
  leave_vpn,
- join_vpn,
  get_vpn_details,
  load_server_info,
  user_login,
@@ -68,7 +81,6 @@ LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
  VERSION = 1
  RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
  CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
- TEMPLATE_LABEL = "kalavai.job.name"
  RAY_LABEL = "kalavai.ray.name"
  PVC_NAME_LABEL = "kalavai.storage.name"
  DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
@@ -98,9 +110,7 @@ USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
  USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
  USER_HELM_APPS_FILE = user_path("apps.yaml")
  USER_KUBECONFIG_FILE = user_path("kubeconfig")
- USER_LOCAL_SERVER_FILE = user_path(".server")
  USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
- USER_COOKIE = user_path(".user_cookie.pkl")
 
 
  console = Console()
@@ -118,7 +128,7 @@ CLUSTER = dockerCluster(
  ######################
  ## HELPER FUNCTIONS ##
  ######################
-
+
  def check_seed_compatibility():
  """Check required packages to start pools"""
  logs = []
@@ -288,21 +298,11 @@ def select_ip_address(subnet=None):
  console.log("[red] Input error")
  return ips[option]
 
- def fetch_gpus():
- data = request_to_server(
- method="post",
- endpoint="/v1/get_node_gpus",
- data={},
- server_creds=USER_LOCAL_SERVER_FILE,
- user_cookie=USER_COOKIE
- )
- return data.items()
-
  def select_gpus(message):
  console.log(f"[yellow]{message}")
  gpu_models = ["Any/None"]
  gpu_models_full = ["Any/None"]
- available_gpus = fetch_gpus()
+ available_gpus = load_gpu_models()
  for _, gpus in available_gpus:
  for gpu in gpus["gpus"]:
  #status = "free" if "ready" in gpu else "busy"
@@ -337,18 +337,19 @@ def select_token_type():
  break
  return {"admin": choice == 0, "user": choice == 1, "worker": choice == 2}
 
- def generate_compose_config(role, node_name, is_public, node_labels=None, pool_ip=None, vpn_token=None, pool_token=None):
+ def generate_compose_config(role, node_name, is_public, use_gpus=True, node_labels=None, pool_ip=None, vpn_token=None, pool_token=None):
  num_gpus = 0
- try:
- has_gpus = check_gpu_drivers()
- if has_gpus:
- max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
- num_gpus = user_confirm(
- question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
- options=range(max_gpus+1)
- )
- except:
- console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
+ if use_gpus:
+ try:
+ has_gpus = check_gpu_drivers()
+ if has_gpus:
+ max_gpus = int(run_cmd("nvidia-smi -L | wc -l").decode())
+ num_gpus = user_confirm(
+ question=f"{max_gpus} NVIDIA GPU(s) detected. How many GPUs would you like to include?",
+ options=range(max_gpus+1)
+ )
+ except:
+ console.log(f"[red]WARNING: error when fetching NVIDIA GPU info. GPUs will not be used on this local machine")
  if node_labels is not None:
  node_labels = " ".join([f"--node-label {key}={value}" for key, value in node_labels.items()])
  compose_values = {
@@ -736,7 +737,7 @@ def pool__check_token(token, *others, public=False):
 
 
  @arguably.command
- def pool__join(token, *others, node_name=None, ip_address: str=None):
+ def pool__join(token, *others, node_name=None):
  """
  Join Kalavai pool and start/resume sharing resources.
 
@@ -859,7 +860,7 @@ def pool__join(token, *others, node_name=None, ip_address: str=None):
  CLUSTER.start_worker_node()
  except Exception as e:
  console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
- leave_vpn(container_name=DEFAULT_VPN_CONTAINER_NAME)
+ pool__stop()
  exit()
 
  # ensure we are connected
@@ -868,6 +869,22 @@ def pool__join(token, *others, node_name=None, ip_address: str=None):
  time.sleep(30)
  if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
  break
+
+ # send note to server to let them know the node is coming online
+ if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
+ console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with [yellow]--node-name'")
+ pool__stop()
+ return
+
+ # check the node has connected successfully
+ try:
+ while not CLUSTER.is_agent_running():
+ console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
+ time.sleep(30)
+ except KeyboardInterrupt:
+ console.log("[red]Installation aborted. Leaving pool.")
+ pool__stop()
+ return
 
  init_user_workspace()
 
@@ -962,29 +979,24 @@ def pool__gpus(*others, available=False):
  console.log(f"[red]Problems with your pool: {str(e)}")
  return
 
- try:
- data = fetch_gpus()
- columns, rows = [], []
- for node, gpus in data:
- row_gpus = []
- for gpu in gpus["gpus"]:
- status = gpu["ready"] if "ready" in gpu else True
- if available and not status:
- continue
- row_gpus.append( (f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
- if len(row_gpus) > 0:
- models, statuses = zip(*row_gpus)
- rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
-
- columns = ["Ready", "GPU(s)", "Available", "Total"]
- columns = ["Node"] + columns
- console.print(
- generate_table(columns=columns, rows=rows,end_sections=[n for n in range(len(rows))])
- )
-
- except Exception as e:
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-
+ gpus = fetch_gpus(available=available)
+ if "error" in gpus:
+ console.log(f"[red]Error when fetching gpus: {gpus}")
+ return
+
+ columns = ["Node", "Ready", "GPU(s)", "Available", "Total"]
+ rows = []
+ for gpu in gpus:
+ rows.append([
+ gpu.node,
+ str(gpu.ready),
+ gpu.model,
+ str(gpu.available),
+ str(gpu.total)
+ ])
+ console.print(
+ generate_table(columns=columns, rows=rows,end_sections=[n for n in range(len(rows))])
+ )
 
  @arguably.command
  def pool__resources(*others):
@@ -997,45 +1009,33 @@ def pool__resources(*others):
  console.log(f"[red]Problems with your pool: {str(e)}")
  return
 
- try:
- total = request_to_server(
- method="get",
- endpoint="/v1/get_cluster_total_resources",
- data={},
- server_creds=USER_LOCAL_SERVER_FILE,
- user_cookie=USER_COOKIE
- )
- available = request_to_server(
- method="get",
- endpoint="/v1/get_cluster_available_resources",
- data={},
- server_creds=USER_LOCAL_SERVER_FILE,
- user_cookie=USER_COOKIE
- )
- columns = []
- total_values = []
- available_values = []
- for col in total.keys():
- if col in RESOURCE_EXCLUDE:
- continue
- columns.append(col)
- total_values.append(str(total[col]))
- available_values.append(str(available[col]))
-
- columns = [""] + columns
- total_values = ["Total"] + total_values
- available_values = ["Available"] + available_values
-
- rows = [
- tuple(available_values),
- tuple(total_values)
- ]
- console.print(
- generate_table(columns=columns, rows=rows, end_sections=[0, 1])
- )
-
- except Exception as e:
+ data = fetch_resources()
+ if "error" in data:
  console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+ return
+
+ columns = []
+ total_values = []
+ available_values = []
+ for col in data["total"].keys():
+ if col in RESOURCE_EXCLUDE:
+ continue
+ columns.append(col)
+ total_values.append(str(data["total"][col]))
+ available_values.append(str(data["available"][col]))
+
+ columns = [""] + columns
+ total_values = ["Total"] + total_values
+ available_values = ["Available"] + available_values
+
+ rows = [
+ tuple(available_values),
+ tuple(total_values)
+ ]
+ console.print(
+ generate_table(columns=columns, rows=rows, end_sections=[0, 1])
+ )
+
 
  @arguably.command
  def pool__update(*others):
@@ -1102,6 +1102,10 @@ def pool__attach(token, *others, node_name=None):
  """
  Set creds in token on the local instance
  """
+
+ if node_name is None:
+ node_name = socket.gethostname()
+
  # check that is not attached to another instance
  if os.path.exists(USER_LOCAL_SERVER_FILE):
  option = user_confirm(
@@ -1160,6 +1164,7 @@ def pool__attach(token, *others, node_name=None):
 
  # Generate docker compose recipe
  generate_compose_config(
+ use_gpus=False,
  role="",
  vpn_token=vpn["key"],
  node_name=node_name,
@@ -1327,22 +1332,18 @@ def node__list(*others):
  return
 
  try:
- data = request_to_server(
- method="get",
- endpoint="/v1/get_nodes",
- data={},
- server_creds=USER_LOCAL_SERVER_FILE,
- user_cookie=USER_COOKIE
- )
+ devices = fetch_devices()
  rows = []
- columns = ["Node name"]
- for node, status in data.items():
- row = [node]
- for key, value in status.items():
- if key not in columns:
- columns.append(key)
- row.append(str(value))
- rows.append(tuple(row))
+ columns = ["Node name", "Memory Pressure", "Disk pressure", "PID pressure", "Ready", "Unschedulable"]
+ for device in devices:
+ rows.append([
+ device.name,
+ str(device.memory_pressure),
+ str(device.disk_pressure),
+ str(device.pid_pressure),
+ str(device.ready),
+ str(device.unschedulable)
+ ])
 
  console.log("Nodes with 'unschedulable=True' will not receive workload")
  console.log("To make a node unschedulable (i.e. won't receive workloads) use [yellow]kalavai node cordon <node name>")
@@ -1510,6 +1511,64 @@ def job__run(template_name, *others, values: str=None, force_namespace: str=None
  console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
  return
 
+ @arguably.command
+ def job__test(local_template_dir, *others, values, defaults, force_namespace: str=None):
+ """
+ Helper to test local templates, useful for development
+ """
+ try:
+ CLUSTER.validate_cluster()
+ except Exception as e:
+ console.log(f"[red]Problems with your pool: {str(e)}")
+ return
+
+ if not os.path.isdir(local_template_dir):
+ console.log(f"[red]--local_template_dir ({local_template_dir}) is not a directory")
+ return
+
+ # load template
+ with open(os.path.join(local_template_dir, "template.yaml"), "r") as f:
+ template_str = f.read()
+
+ # load values
+ if not os.path.isfile(values):
+ console.log(f"[red]--values ({values}) is not a valid local file")
+ return
+ with open(values, "r") as f:
+ raw_values = yaml.load(f, Loader=yaml.SafeLoader)
+ values_dict = {variable["name"]: variable['value'] for variable in raw_values}
+
+ # load defaults
+ if not os.path.isfile(defaults):
+ console.log(f"[red]--defaults ({defaults}) is not a valid local file")
+ return
+ with open(defaults, "r") as f:
+ defaults = f.read()
+
+ # submit custom deployment
+ data = {
+ "template": template_str,
+ "template_values": values_dict,
+ "default_values": defaults
+ }
+ if force_namespace is not None:
+ data["force_namespace"] = force_namespace
+
+ try:
+ result = request_to_server(
+ method="post",
+ endpoint="/v1/deploy_custom_job",
+ data=data,
+ server_creds=USER_LOCAL_SERVER_FILE,
+ user_cookie=USER_COOKIE
+ )
+ console.log("Deployment result:")
+ print(
+ json.dumps(result,indent=3)
+ )
+ except Exception as e:
+ console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+
 
  @arguably.command
  def job__defaults(template_name, *others):
@@ -1586,7 +1645,7 @@ def job__estimate(billion_parameters, *others, precision=32):
 
  average_vram = 8
  required_memory = float(billion_parameters) * (precision / 8) / 1.2
- available_gpus = fetch_gpus()
+ available_gpus = load_gpu_models()
  vrams = []
  for _, gpus in available_gpus:
  for model in gpus["gpus"]:
@@ -1643,7 +1702,7 @@ def job__status(name, *others):
  return
 
  @arguably.command
- def job__list(*others, detailed=False):
+ def job__list(*others):
  """
  List jobs in the cluster
  """
@@ -1653,106 +1712,22 @@ def job__list(*others, detailed=False):
  console.log(f"[red]Problems with your pool: {str(e)}")
  return
 
- data = {
- "group": "batch.volcano.sh",
- "api_version": "v1alpha1",
- "plural": "jobs"
- }
- try:
- result = request_to_server(
- method="post",
- endpoint="/v1/get_objects_of_type",
- data=data,
- server_creds=USER_LOCAL_SERVER_FILE,
- user_cookie=USER_COOKIE
- )
- all_deployments = defaultdict(list)
- for ns, ds in result.items():
- all_deployments[ns].extend([d["metadata"]["labels"][TEMPLATE_LABEL] for d in ds["items"]])
- #deployments = {ns: d["metadata"]["labels"][TEMPLATE_LABEL] for ns, ds in result.items() for d in ds["items"]}
- except Exception as e:
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
- return
- if len(all_deployments.keys()) == 0:
+ all_deployments = fetch_job_names()
+ if "error" in all_deployments:
+ console.log(f"[red]Error when connecting to kalavai service: {all_deployments}")
+ return
+
+ if len(all_deployments) == 0:
  console.log("[green]No deployments found.")
  return
 
+ details = fetch_job_details(jobs=all_deployments)
+ if "error" in details:
+ console.log(f"[red]{details}")
+ return
  columns = ["Owner", "Deployment", "Workers", "Endpoint"]
- if detailed:
- columns.append("Status")
- rows = []
- for namespace, deployments in all_deployments.items():
- for deployment in deployments:
- try:
- # get status for deployment
- if detailed:
- data = {
- "group": "batch.volcano.sh",
- "api_version": "v1alpha1",
- "plural": "jobs",
- # "group": "leaderworkerset.x-k8s.io",
- # "api_version": "v1",
- # "plural": "leaderworkersets",
- "name": deployment
- }
- result = request_to_server(
- method="post",
- endpoint="/v1/get_status_for_object",
- data=data,
- server_creds=USER_LOCAL_SERVER_FILE,
- user_cookie=USER_COOKIE
- )
- ss = [] # flatten results ({namespace: statuses})
- [ss.extend(values) for values in result.values()]
- if len(ss) > 0:
- last = ss[-1]
- statuses = f"[{last['lastTransitionTime']}] {last['status']}"
- else:
- statuses = "Unknown"
- # get pod statuses
- data = {
- "label": TEMPLATE_LABEL,
- "value": deployment
- }
- result = request_to_server(
- method="post",
- endpoint="/v1/get_pods_status_for_label",
- data=data,
- server_creds=USER_LOCAL_SERVER_FILE,
- user_cookie=USER_COOKIE
- )
- workers_status = defaultdict(int)
- for ns, ss in result.items():
- if ns != namespace: # same job name, different namespace
- continue
- for _, values in ss.items():
- workers_status[values["status"]] += 1
- workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
- # get URL details
- data = {
- "label": TEMPLATE_LABEL,
- "value": deployment,
- "types": ["NodePort"]
- }
- result = request_to_server(
- method="post",
- endpoint="/v1/get_ports_for_services",
- data=data,
- server_creds=USER_LOCAL_SERVER_FILE,
- user_cookie=USER_COOKIE
- )
- node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
-
- urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
- row = [namespace, deployment, workers, "\n".join(urls)]
- if detailed:
- row.append(statuses)
- rows.append(row)
-
- except Exception as e:
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
- return
-
+ rows = [[job.owner, job.name, job.workers, job.endpoint] for job in details]
+
  console.print(
  generate_table(columns=columns, rows=rows, end_sections=range(len(rows)))
  )
@@ -1774,26 +1749,19 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
 
  if force_namespace is not None:
  console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
-
- data = {
- "label": TEMPLATE_LABEL,
- "value": name,
- "tail": tail
- }
- if force_namespace is not None:
- data["force_namespace"] = force_namespace
+
+ all_logs = fetch_job_logs(
+ job_name=name,
+ pod_name=pod_name,
+ force_namespace=force_namespace,
+ tail=tail)
+ if "error" in all_logs:
+ console.log(f"[red]{all_logs}")
+ return
  while True:
  try:
- # send tail as parameter (fetch only last _tail_ lines)
- result = request_to_server(
- method="post",
- endpoint="/v1/get_logs_for_label",
- data=data,
- server_creds=USER_LOCAL_SERVER_FILE,
- user_cookie=USER_COOKIE
- )
  if not stream:
- for pod, logs in result.items():
+ for pod, logs in all_logs.items():
  if pod_name is not None and pod_name != pod:
  continue
  console.log(f"[yellow]Pod {pod}")
@@ -1801,7 +1769,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
  break
  else:
  os.system("clear")
- for pod, logs in result.items():
+ for pod, logs in all_logs.items():
  if pod_name is not None and pod_name != pod:
  continue
  print(f"Pod {pod}")
@@ -1809,10 +1777,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
  time.sleep(1)
  except KeyboardInterrupt:
  break
- except Exception as e:
- console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
- console.log(f"Check if {name} is running with [yellow]kalavai job list")
- return
+
 
  @arguably.command
  def job__manifest(*others, name, force_namespace: str=None):
kalavai_client/cluster.py CHANGED
@@ -133,6 +133,9 @@ class dockerCluster(Cluster):
  if not os.path.isfile(self.compose_file):
  return False
  status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()
+ if not status:
+ return False
+ status = (0 == os.system(f'docker exec {self.container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'))
  return status
 
  def is_seed_node(self):
kalavai_client/core.py ADDED
@@ -0,0 +1,227 @@
+ from collections import defaultdict
+ import math
+
+ from pydantic import BaseModel
+
+ from kalavai_client.utils import (
+ request_to_server,
+ load_server_info
+ )
+ from kalavai_client.env import (
+ USER_COOKIE,
+ USER_LOCAL_SERVER_FILE,
+ TEMPLATE_LABEL,
+ SERVER_IP_KEY
+ )
+
+ class Job(BaseModel):
+ owner: str = None
+ name: str = None
+ workers: str = None
+ endpoint: str = None
+
+ class DeviceStatus(BaseModel):
+ name: str
+ memory_pressure: bool
+ disk_pressure: bool
+ pid_pressure: bool
+ ready: bool
+ unschedulable: bool
+
+ class GPU(BaseModel):
+ node: str
+ available: int
+ total: int
+ ready: bool
+ model: str
+
+
+ def fetch_resources():
+ try:
+ total = request_to_server(
+ method="get",
+ endpoint="/v1/get_cluster_total_resources",
+ data={},
+ server_creds=USER_LOCAL_SERVER_FILE,
+ user_cookie=USER_COOKIE
+ )
+ available = request_to_server(
+ method="get",
+ endpoint="/v1/get_cluster_available_resources",
+ data={},
+ server_creds=USER_LOCAL_SERVER_FILE,
+ user_cookie=USER_COOKIE
+ )
+ except Exception as e:
+ return {"error": str(e)}
+
+ return {"total": total, "available": available}
+
+ def fetch_job_names():
+ data = {
+ "group": "batch.volcano.sh",
+ "api_version": "v1alpha1",
+ "plural": "jobs"
+ }
+ try:
+ jobs = request_to_server(
+ method="post",
+ endpoint="/v1/get_objects_of_type",
+ data=data,
+ server_creds=USER_LOCAL_SERVER_FILE,
+ user_cookie=USER_COOKIE
+ )
+ all_jobs = []
+ for ns, ds in jobs.items():
+ all_jobs.extend([Job(owner=ns, name=d["metadata"]["labels"][TEMPLATE_LABEL]) for d in ds["items"]])
+ except Exception as e:
+ return {"error": str(e)}
+
+ return all_jobs
+
+ def fetch_job_details(jobs: list[Job]):
+ """Get job details. A job is a dict:
+ {
+ "namespace": ns,
+ "name": name
+ }
+ """
+ job_details = []
+ for job in jobs:
+ namespace = job.owner
+ deployment = job.name
+ try:
+ # get pod statuses
+ data = {
+ "label": TEMPLATE_LABEL,
+ "value": deployment
+ }
+ result = request_to_server(
+ method="post",
+ endpoint="/v1/get_pods_status_for_label",
+ data=data,
+ server_creds=USER_LOCAL_SERVER_FILE,
+ user_cookie=USER_COOKIE
+ )
+ workers_status = defaultdict(int)
+ for ns, ss in result.items():
+ if ns != namespace: # same job name, different namespace
+ continue
+ for _, values in ss.items():
+ workers_status[values["status"]] += 1
+ workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
+ # get URL details
+ data = {
+ "label": TEMPLATE_LABEL,
+ "value": deployment,
+ "types": ["NodePort"]
+ }
+ result = request_to_server(
+ method="post",
+ endpoint="/v1/get_ports_for_services",
+ data=data,
+ server_creds=USER_LOCAL_SERVER_FILE,
+ user_cookie=USER_COOKIE
+ )
+ node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
+
+ urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
+ job_details.append(
+ Job(owner=namespace,
+ name=deployment,
+ workers=workers,
+ endpoint="\n".join(urls))
+ )
+
+ except Exception as e:
+ return {"error": str(e)}
+
+ return job_details
+
+ def fetch_devices():
+ """Load devices status info for all hosts"""
+ try:
+ data = request_to_server(
+ method="get",
+ endpoint="/v1/get_nodes",
+ data={},
+ server_creds=USER_LOCAL_SERVER_FILE,
+ user_cookie=USER_COOKIE
+ )
+ devices = []
+ for node, status in data.items():
+ devices.append(
+ DeviceStatus(
+ name=node,
+ memory_pressure=status["MemoryPressure"],
+ disk_pressure=status["DiskPressure"],
+ pid_pressure=status["PIDPressure"],
+ ready=status["Ready"],
+ unschedulable=status["unschedulable"]
+ )
+ )
+ return devices
+
+ except Exception as e:
+ return {"error": str(e)}
+
+ def fetch_job_logs(job_name, force_namespace=None, pod_name=None, tail=100):
+ data = {
+ "label": TEMPLATE_LABEL,
+ "value": job_name,
+ "tail": tail
+ }
+ if force_namespace is not None:
+ data["force_namespace"] = force_namespace
+ try:
+ # send tail as parameter (fetch only last _tail_ lines)
+ all_logs = request_to_server(
+ method="post",
+ endpoint="/v1/get_logs_for_label",
+ data=data,
+ server_creds=USER_LOCAL_SERVER_FILE,
+ user_cookie=USER_COOKIE
+ )
+ return {pod: logs for pod, logs in all_logs.items() if pod_name is None or pod_name == pod}
+
+ except Exception as e:
+ return {"error": str(e)}
+
+
+ def load_gpu_models():
+ data = request_to_server(
+ method="post",
+ endpoint="/v1/get_node_gpus",
+ data={},
+ server_creds=USER_LOCAL_SERVER_FILE,
+ user_cookie=USER_COOKIE
+ )
+ return data.items()
+
+ def fetch_gpus(available=False):
+ try:
+ data = load_gpu_models()
+ all_gpus = []
+ for node, gpus in data:
+ row_gpus = []
+ for gpu in gpus["gpus"]:
+ status = gpu["ready"] if "ready" in gpu else True
+ if available and not status:
+ continue
+ row_gpus.append( (f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
+ if len(row_gpus) > 0:
+ models, statuses = zip(*row_gpus)
+ #rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
+ all_gpus.extend([
+ GPU(
+ node=node,
+ ready=status,
+ model=model,
+ available=gpus["available"],
+ total=gpus["capacity"]
+ ) for model, status in zip(models, statuses)
+ ])
+ return all_gpus
+
+ except Exception as e:
+ return {"error": str(e)}
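Note on the new module: each helper above either returns plain data (lists of pydantic models, or a dict of totals) or a {"error": "..."} dict when the watcher cannot be reached, rather than raising. A minimal usage sketch, not part of the wheel, assuming a pool has already been joined so the credentials under ~/.cache/kalavai exist; the "error" membership checks mirror the ones the CLI uses:

    # sketch: inspect pool GPUs and devices via kalavai_client.core
    from kalavai_client.core import fetch_gpus, fetch_devices

    gpus = fetch_gpus(available=False)  # list[GPU] on success, {"error": ...} on failure
    if "error" in gpus:
        print(f"could not fetch GPUs: {gpus}")
    else:
        for gpu in gpus:
            print(gpu.node, gpu.model, f"{gpu.available}/{gpu.total}", "ready" if gpu.ready else "busy")

    devices = fetch_devices()  # list[DeviceStatus] on success, {"error": ...} on failure
    if "error" not in devices:
        for device in devices:
            print(device.name, "cordoned" if device.unschedulable else "schedulable")
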
kalavai_client/env.py ADDED
@@ -0,0 +1,19 @@
+ import os
+ from pathlib import Path
+
+
+ def user_path(relative_path, create_path=False):
+ """Transform a relative path into the user's cache folder path"""
+ base = os.path.expanduser("~")
+ kalavai_user_path = os.path.join(base, ".cache/kalavai")
+ full_path = os.path.join(kalavai_user_path, relative_path)
+ if create_path:
+ Path(full_path).mkdir(parents=True, exist_ok=True)
+
+ return full_path
+
+
+ USER_LOCAL_SERVER_FILE = user_path(".server")
+ USER_COOKIE = user_path(".user_cookie.pkl")
+ TEMPLATE_LABEL = "kalavai.job.name"
+ SERVER_IP_KEY = "server_ip"
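Centralising user_path and the shared constants here lets cli.py, core.py and utils.py all import the same values from one small, dependency-free module (previously user_path lived in utils.py and the constants in cli.py). A short sketch of the import pattern the rest of the package follows; the USER_TEMPLATES_FOLDER line is an illustrative example of deriving further per-user paths:

    # shared locations and labels come from kalavai_client.env
    from kalavai_client.env import (
        USER_COOKIE,             # ~/.cache/kalavai/.user_cookie.pkl
        USER_LOCAL_SERVER_FILE,  # ~/.cache/kalavai/.server
        TEMPLATE_LABEL,          # "kalavai.job.name"
        user_path,
    )

    # derive additional per-user paths as needed
    USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
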
kalavai_client/utils.py CHANGED
@@ -14,6 +14,9 @@ import yaml
 
 
  from kalavai_client.auth import KalavaiAuthClient
+ from kalavai_client.env import (
+ SERVER_IP_KEY
+ )
 
 
  GITHUB_ORG = "kalavai-net"
@@ -22,7 +25,6 @@ GITHUB_TEMPLATE_PATH = "templates"
  USER_NODE_LABEL_KEY = "user_node_label"
  CLUSTER_IP_KEY = "cluster_ip"
  CLUSTER_TOKEN_KEY = "cluster_token"
- SERVER_IP_KEY = "server_ip"
  NODE_NAME_KEY = "node_name"
  PUBLIC_LOCATION_KEY = "public_location"
  CLUSTER_NAME_KEY = "cluster_name"
@@ -284,8 +286,11 @@ def request_to_server(
  json=data,
  headers=headers
  )
- result = response.json()
- return result
+ try:
+ result = response.json()
+ return result
+ except Exception as e:
+ raise ValueError(f"Error with HTTP request: {response.text}\n{str(e)}")
 
 
  def generate_table(columns, rows, end_sections=None):
@@ -394,16 +399,6 @@ def resource_path(relative_path: str):
  return None
  return resource
 
- def user_path(relative_path, create_path=False):
- """Transform a relative path into the user's cache folder path"""
- base = os.path.expanduser("~")
- kalavai_user_path = os.path.join(base, ".cache/kalavai")
- full_path = os.path.join(kalavai_user_path, relative_path)
- if create_path:
- Path(full_path).mkdir(parents=True, exist_ok=True)
-
- return full_path
-
  def safe_remove(filepath, force=True):
  if not os.path.exists(filepath):
  return
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: kalavai-client
- Version: 0.5.10
+ Version: 0.5.13
  Summary: Client app for kalavai platform
  License: Apache-2.0
  Keywords: LLM,platform
@@ -89,6 +89,9 @@ https://github.com/user-attachments/assets/0d2316f3-79ea-46ac-b41e-8ef720f52672
 
  ### News updates
 
+ <img src="docs/docs/assets/images/DeepSeek-Emblem.png" width="100">
+
+ - 6 February 2025: 🔥🔥🔥 Access **DeepSeek R1 model for free** when you join our [public LLM pool](https://kalavai-net.github.io/kalavai-client/public_llm_pool/)
  - 31 January 2025: `kalavai-client` is now a [PyPI package](https://pypi.org/project/kalavai-client/), easier to install than ever!
  - 27 January 2025: Support for accessing pools from remote computers
  - 9 January 2025: Added support for [Aphrodite Engine](https://github.com/aphrodite-engine/aphrodite-engine) models
@@ -140,7 +143,7 @@ From release **v0.5.0, you can now install `kalavai-client` in non-worker comput
  For workers sharing resources with the pool:
 
  - A laptop, desktop or Virtual Machine
- - Docker engine installed (for [linux](https://docs.docker.com/engine/install/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
+ - Docker engine installed (for [linux](https://docs.docker.com/engine/install/ubuntu/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
 
  > **Support for Windows and MacOS workers is experimental**: kalavai workers run on docker containers that require access to the host network interfaces, thus systems that do not support containers natively (Windows and MacOS) may have difficulties finding each other.
 
@@ -0,0 +1,22 @@
+ kalavai_client/__init__.py,sha256=OKM-UDxm0absUf9IgE89lC_PpDG9RbBD4It-hbz8ORM,23
+ kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
+ kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ kalavai_client/assets/apps.yaml,sha256=yC-vtYTPE960KUQihTk5pee8xZz9RD8Reuyh1nSpRWk,5981
+ kalavai_client/assets/apps_values.yaml,sha256=CjKVelPQHd-hm-DTMEuya92feKiphU9mh3HrosLYYPE,1676
+ kalavai_client/assets/docker-compose-template.yaml,sha256=mo8LUam9-AzB_0w72wTyMyreKr4Ns-pxZGc4GVWcUHA,2747
+ kalavai_client/assets/nginx.conf,sha256=drVVCg8GHucz7hmt_BI6giAhK92OV71257NTs3LthwM,225
+ kalavai_client/assets/pool_config_template.yaml,sha256=fFz4w2-fMKD5KvyzFdfcWD_jSneRlmnjLc8hCctweX0,576
+ kalavai_client/assets/pool_config_values.yaml,sha256=VrM3XHQfQo6QLZ68qvagooUptaYgl1pszniY_JUtemk,233
+ kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
+ kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
+ kalavai_client/auth.py,sha256=QsBh28L2LwjBBK6pTUE4Xu36lLDTyetyU1YfS1Hbb6g,1717
+ kalavai_client/cli.py,sha256=_oUZAYV397_-BQAHsTcyK0pkyK5iusdyYrJU5z6lecM,66312
+ kalavai_client/cluster.py,sha256=z9HCD6ZUemjabcDenszQhqB_IUiVi_vpFbfAkKwHiEU,12292
+ kalavai_client/core.py,sha256=Vb-5MBHjpuR590FIDOnytJpP1Xjt7hYqehPV2rh6P68,6863
+ kalavai_client/env.py,sha256=RAi37vJtIGfPR25PNxZYMZNkkEKR4AyUPN_htFiFesM,575
+ kalavai_client/utils.py,sha256=kQk_1QOs8u08rcfhkcfo_oC-cZzww0cij-1R_jK1ER8,12185
+ kalavai_client-0.5.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ kalavai_client-0.5.13.dist-info/METADATA,sha256=fQus2g5Q39Wu_HglzdiOGnalva3GcSG48o5iyyikbt4,14347
+ kalavai_client-0.5.13.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+ kalavai_client-0.5.13.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
+ kalavai_client-0.5.13.dist-info/RECORD,,
@@ -1,20 +0,0 @@
- kalavai_client/__init__.py,sha256=PoEyxKkxe5kSNJwy0utYwv_ANRK98k-p4OLs19lJaBA,23
- kalavai_client/__main__.py,sha256=WQUfxvRsBJH5gsCJg8pLz95QnZIj7Ol8psTO77m0QE0,73
- kalavai_client/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- kalavai_client/assets/apps.yaml,sha256=_8BgT9F611c8uZJvhTE_0CLbQqnLaUQosqxZjzOslXQ,5979
- kalavai_client/assets/apps_values.yaml,sha256=CjKVelPQHd-hm-DTMEuya92feKiphU9mh3HrosLYYPE,1676
- kalavai_client/assets/docker-compose-template.yaml,sha256=gJ0NkhcG2c-gZPmSd385dadrXkZrWruTJkiaxcaKkQ0,2725
- kalavai_client/assets/nginx.conf,sha256=drVVCg8GHucz7hmt_BI6giAhK92OV71257NTs3LthwM,225
- kalavai_client/assets/pool_config_template.yaml,sha256=fFz4w2-fMKD5KvyzFdfcWD_jSneRlmnjLc8hCctweX0,576
- kalavai_client/assets/pool_config_values.yaml,sha256=VrM3XHQfQo6QLZ68qvagooUptaYgl1pszniY_JUtemk,233
- kalavai_client/assets/user_workspace.yaml,sha256=wDvlMYknOPABAEo0dsQwU7bac8iubjAG9tdkFbJZ5Go,476
- kalavai_client/assets/user_workspace_values.yaml,sha256=G0HOzQUxrDMCwuW9kbWUZaKMzDDPVwDwzBHCL2Xi2ZM,542
- kalavai_client/auth.py,sha256=QsBh28L2LwjBBK6pTUE4Xu36lLDTyetyU1YfS1Hbb6g,1717
- kalavai_client/cli.py,sha256=4qTZuYNFhsdJbnER-MSBJHPNgJc_lWzbWR0Bj2YeQe0,68889
- kalavai_client/cluster.py,sha256=fULTAad4KXEGeWZmp4_VBoBwT5eED_HOBUsXIKmf0CU,12119
- kalavai_client/utils.py,sha256=RTJNgY7ho52Q5WzV68ZK5uHNYRAwXkelIr1PmBLBJsk,12420
- kalavai_client-0.5.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- kalavai_client-0.5.10.dist-info/METADATA,sha256=u_E2mWeRmmVE2pWCToOVwwxdgu--UWkzhkImQb67qy4,14101
- kalavai_client-0.5.10.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
- kalavai_client-0.5.10.dist-info/entry_points.txt,sha256=9T6D45gxwzfVbglMm1r6XPdXuuZdHfy_7fCeu2jUphc,50
- kalavai_client-0.5.10.dist-info/RECORD,,