kalavai-client 0.5.12__tar.gz → 0.5.14__tar.gz

Files changed (22)
  1. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/PKG-INFO +5 -2
  2. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/README.md +4 -1
  3. kalavai_client-0.5.14/kalavai_client/__init__.py +2 -0
  4. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/assets/apps.yaml +1 -1
  5. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/cli.py +119 -217
  6. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/cluster.py +5 -1
  7. kalavai_client-0.5.14/kalavai_client/core.py +227 -0
  8. kalavai_client-0.5.14/kalavai_client/env.py +19 -0
  9. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/utils.py +3 -11
  10. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/pyproject.toml +1 -1
  11. kalavai_client-0.5.12/kalavai_client/__init__.py +0 -2
  12. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/LICENSE +0 -0
  13. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/__main__.py +0 -0
  14. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/assets/__init__.py +0 -0
  15. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/assets/apps_values.yaml +0 -0
  16. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/assets/docker-compose-template.yaml +0 -0
  17. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/assets/nginx.conf +0 -0
  18. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/assets/pool_config_template.yaml +0 -0
  19. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/assets/pool_config_values.yaml +0 -0
  20. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/assets/user_workspace.yaml +0 -0
  21. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/assets/user_workspace_values.yaml +0 -0
  22. {kalavai_client-0.5.12 → kalavai_client-0.5.14}/kalavai_client/auth.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: kalavai-client
- Version: 0.5.12
+ Version: 0.5.14
  Summary: Client app for kalavai platform
  License: Apache-2.0
  Keywords: LLM,platform
@@ -89,6 +89,9 @@ https://github.com/user-attachments/assets/0d2316f3-79ea-46ac-b41e-8ef720f52672

  ### News updates

+ <img src="docs/docs/assets/images/DeepSeek-Emblem.png" width="100">
+
+ - 6 February 2025: 🔥🔥🔥 Access **DeepSeek R1 model for free** when you join our [public LLM pool](https://kalavai-net.github.io/kalavai-client/public_llm_pool/)
  - 31 January 2025: `kalavai-client` is now a [PyPI package](https://pypi.org/project/kalavai-client/), easier to install than ever!
  - 27 January 2025: Support for accessing pools from remote computers
  - 9 January 2025: Added support for [Aphrodite Engine](https://github.com/aphrodite-engine/aphrodite-engine) models
@@ -140,7 +143,7 @@ From release **v0.5.0, you can now install `kalavai-client` in non-worker comput
  For workers sharing resources with the pool:

  - A laptop, desktop or Virtual Machine
- - Docker engine installed (for [linux](https://docs.docker.com/engine/install/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
+ - Docker engine installed (for [linux](https://docs.docker.com/engine/install/ubuntu/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).

  > **Support for Windows and MacOS workers is experimental**: kalavai workers run on docker containers that require access to the host network interfaces, thus systems that do not support containers natively (Windows and MacOS) may have difficulties finding each other.

@@ -46,6 +46,9 @@ https://github.com/user-attachments/assets/0d2316f3-79ea-46ac-b41e-8ef720f52672

  ### News updates

+ <img src="docs/docs/assets/images/DeepSeek-Emblem.png" width="100">
+
+ - 6 February 2025: 🔥🔥🔥 Access **DeepSeek R1 model for free** when you join our [public LLM pool](https://kalavai-net.github.io/kalavai-client/public_llm_pool/)
  - 31 January 2025: `kalavai-client` is now a [PyPI package](https://pypi.org/project/kalavai-client/), easier to install than ever!
  - 27 January 2025: Support for accessing pools from remote computers
  - 9 January 2025: Added support for [Aphrodite Engine](https://github.com/aphrodite-engine/aphrodite-engine) models
@@ -97,7 +100,7 @@ From release **v0.5.0, you can now install `kalavai-client` in non-worker comput
  For workers sharing resources with the pool:

  - A laptop, desktop or Virtual Machine
- - Docker engine installed (for [linux](https://docs.docker.com/engine/install/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
+ - Docker engine installed (for [linux](https://docs.docker.com/engine/install/ubuntu/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).

  > **Support for Windows and MacOS workers is experimental**: kalavai workers run on docker containers that require access to the host network interfaces, thus systems that do not support containers natively (Windows and MacOS) may have difficulties finding each other.

@@ -0,0 +1,2 @@
+
+ __version__ = "0.5.14"
@@ -139,7 +139,7 @@ releases:
    - name: replicas
      value: 2
    - name: image_tag
-     value: "v2025.01.1"
+     value: "v2025.01.9"
    - name: deployment.in_cluster
      value: "True"
    - name: deployment.use_auth_key
@@ -15,10 +15,24 @@ import netifaces as ni
  import arguably
  from rich.console import Console

+ from kalavai_client.env import (
+     USER_COOKIE,
+     USER_LOCAL_SERVER_FILE,
+     TEMPLATE_LABEL,
+     user_path
+ )
+ from kalavai_client.core import (
+     fetch_resources,
+     fetch_job_names,
+     fetch_job_details,
+     fetch_devices,
+     fetch_job_logs,
+     fetch_gpus,
+     load_gpu_models
+ )
  from kalavai_client.utils import (
      check_gpu_drivers,
      run_cmd,
-     user_path,
      decode_dict,
      generate_join_token,
      user_confirm,
@@ -27,10 +41,8 @@ from kalavai_client.utils import (
      generate_table,
      request_to_server,
      resource_path,
-     user_path,
      safe_remove,
      leave_vpn,
-     join_vpn,
      get_vpn_details,
      load_server_info,
      user_login,
@@ -68,7 +80,6 @@ LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
  VERSION = 1
  RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
  CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
- TEMPLATE_LABEL = "kalavai.job.name"
  RAY_LABEL = "kalavai.ray.name"
  PVC_NAME_LABEL = "kalavai.storage.name"
  DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
@@ -98,9 +109,7 @@ USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
  USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
  USER_HELM_APPS_FILE = user_path("apps.yaml")
  USER_KUBECONFIG_FILE = user_path("kubeconfig")
- USER_LOCAL_SERVER_FILE = user_path(".server")
  USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
- USER_COOKIE = user_path(".user_cookie.pkl")


  console = Console()
@@ -118,7 +127,7 @@ CLUSTER = dockerCluster(
  ######################
  ## HELPER FUNCTIONS ##
  ######################
-
+
  def check_seed_compatibility():
      """Check required packages to start pools"""
      logs = []
@@ -288,21 +297,11 @@ def select_ip_address(subnet=None):
          console.log("[red] Input error")
      return ips[option]

- def fetch_gpus():
-     data = request_to_server(
-         method="post",
-         endpoint="/v1/get_node_gpus",
-         data={},
-         server_creds=USER_LOCAL_SERVER_FILE,
-         user_cookie=USER_COOKIE
-     )
-     return data.items()
-
  def select_gpus(message):
      console.log(f"[yellow]{message}")
      gpu_models = ["Any/None"]
      gpu_models_full = ["Any/None"]
-     available_gpus = fetch_gpus()
+     available_gpus = load_gpu_models()
      for _, gpus in available_gpus:
          for gpu in gpus["gpus"]:
              #status = "free" if "ready" in gpu else "busy"
@@ -737,7 +736,7 @@ def pool__check_token(token, *others, public=False):


  @arguably.command
- def pool__join(token, *others, node_name=None, ip_address: str=None):
+ def pool__join(token, *others, node_name=None):
      """
      Join Kalavai pool and start/resume sharing resources.

@@ -860,7 +859,7 @@ def pool__join(token, *others, node_name=None, ip_address: str=None):
          CLUSTER.start_worker_node()
      except Exception as e:
          console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
-         leave_vpn(container_name=DEFAULT_VPN_CONTAINER_NAME)
+         pool__stop()
          exit()

      # ensure we are connected
@@ -869,6 +868,22 @@ def pool__join(token, *others, node_name=None, ip_address: str=None):
          time.sleep(30)
          if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
              break
+
+     # send note to server to let them know the node is coming online
+     if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
+         console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with [yellow]--node-name")
+         pool__stop()
+         return
+
+     # check the node has connected successfully
+     try:
+         while not CLUSTER.is_agent_running():
+             console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
+             time.sleep(30)
+     except KeyboardInterrupt:
+         console.log("[red]Installation aborted. Leaving pool.")
+         pool__stop()
+         return

      init_user_workspace()

@@ -963,29 +978,24 @@ def pool__gpus(*others, available=False):
          console.log(f"[red]Problems with your pool: {str(e)}")
          return

-     try:
-         data = fetch_gpus()
-         columns, rows = [], []
-         for node, gpus in data:
-             row_gpus = []
-             for gpu in gpus["gpus"]:
-                 status = gpu["ready"] if "ready" in gpu else True
-                 if available and not status:
-                     continue
-                 row_gpus.append((f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
-             if len(row_gpus) > 0:
-                 models, statuses = zip(*row_gpus)
-                 rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
-
-         columns = ["Ready", "GPU(s)", "Available", "Total"]
-         columns = ["Node"] + columns
-         console.print(
-             generate_table(columns=columns, rows=rows, end_sections=[n for n in range(len(rows))])
-         )
-
-     except Exception as e:
-         console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-
+     gpus = fetch_gpus(available=available)
+     if "error" in gpus:
+         console.log(f"[red]Error when fetching gpus: {gpus}")
+         return
+
+     columns = ["Node", "Ready", "GPU(s)", "Available", "Total"]
+     rows = []
+     for gpu in gpus:
+         rows.append([
+             gpu.node,
+             str(gpu.ready),
+             gpu.model,
+             str(gpu.available),
+             str(gpu.total)
+         ])
+     console.print(
+         generate_table(columns=columns, rows=rows, end_sections=[n for n in range(len(rows))])
+     )

  @arguably.command
  def pool__resources(*others):
@@ -998,45 +1008,33 @@ def pool__resources(*others):
          console.log(f"[red]Problems with your pool: {str(e)}")
          return

-     try:
-         total = request_to_server(
-             method="get",
-             endpoint="/v1/get_cluster_total_resources",
-             data={},
-             server_creds=USER_LOCAL_SERVER_FILE,
-             user_cookie=USER_COOKIE
-         )
-         available = request_to_server(
-             method="get",
-             endpoint="/v1/get_cluster_available_resources",
-             data={},
-             server_creds=USER_LOCAL_SERVER_FILE,
-             user_cookie=USER_COOKIE
-         )
-         columns = []
-         total_values = []
-         available_values = []
-         for col in total.keys():
-             if col in RESOURCE_EXCLUDE:
-                 continue
-             columns.append(col)
-             total_values.append(str(total[col]))
-             available_values.append(str(available[col]))
-
-         columns = [""] + columns
-         total_values = ["Total"] + total_values
-         available_values = ["Available"] + available_values
-
-         rows = [
-             tuple(available_values),
-             tuple(total_values)
-         ]
-         console.print(
-             generate_table(columns=columns, rows=rows, end_sections=[0, 1])
-         )
-
-     except Exception as e:
-         console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+     data = fetch_resources()
+     if "error" in data:
+         console.log(f"[red]Error when connecting to kalavai service: {data}")
+         return
+
+     columns = []
+     total_values = []
+     available_values = []
+     for col in data["total"].keys():
+         if col in RESOURCE_EXCLUDE:
+             continue
+         columns.append(col)
+         total_values.append(str(data["total"][col]))
+         available_values.append(str(data["available"][col]))
+
+     columns = [""] + columns
+     total_values = ["Total"] + total_values
+     available_values = ["Available"] + available_values
+
+     rows = [
+         tuple(available_values),
+         tuple(total_values)
+     ]
+     console.print(
+         generate_table(columns=columns, rows=rows, end_sections=[0, 1])
+     )


  @arguably.command
  def pool__update(*others):
@@ -1333,22 +1331,18 @@ def node__list(*others):
          return

      try:
-         data = request_to_server(
-             method="get",
-             endpoint="/v1/get_nodes",
-             data={},
-             server_creds=USER_LOCAL_SERVER_FILE,
-             user_cookie=USER_COOKIE
-         )
+         devices = fetch_devices()
          rows = []
-         columns = ["Node name"]
-         for node, status in data.items():
-             row = [node]
-             for key, value in status.items():
-                 if key not in columns:
-                     columns.append(key)
-                 row.append(str(value))
-             rows.append(tuple(row))
+         columns = ["Node name", "Memory Pressure", "Disk pressure", "PID pressure", "Ready", "Unschedulable"]
+         for device in devices:
+             rows.append([
+                 device.name,
+                 str(device.memory_pressure),
+                 str(device.disk_pressure),
+                 str(device.pid_pressure),
+                 str(device.ready),
+                 str(device.unschedulable)
+             ])

          console.log("Nodes with 'unschedulable=True' will not receive workload")
          console.log("To make a node unschedulable (i.e. won't receive workloads) use [yellow]kalavai node cordon <node name>")
@@ -1540,7 +1534,9 @@ def job__test(local_template_dir, *others, values, defaults, force_namespace: st
          console.log(f"[red]--values ({values}) is not a valid local file")
          return
      with open(values, "r") as f:
-         values_dict = yaml.safe_load(f)
+         raw_values = yaml.load(f, Loader=yaml.SafeLoader)
+         values_dict = {variable["name"]: variable["value"] for variable in raw_values}
+
      # load defaults
      if not os.path.isfile(defaults):
          console.log(f"[red]--defaults ({defaults}) is not a valid local file")
@@ -1648,7 +1644,7 @@ def job__estimate(billion_parameters, *others, precision=32):

      average_vram = 8
      required_memory = float(billion_parameters) * (precision / 8) / 1.2
-     available_gpus = fetch_gpus()
+     available_gpus = load_gpu_models()
      vrams = []
      for _, gpus in available_gpus:
          for model in gpus["gpus"]:
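For context, the estimate above is the raw weight size (B billion parameters at p bits is B × p/8 GB) divided by 1.2. A quick check of the arithmetic as a standalone sketch:

```python
def estimate_vram_gb(billion_parameters: float, precision: int = 32) -> float:
    # bytes per parameter = precision / 8; the 1.2 divisor mirrors the CLI's formula
    return billion_parameters * (precision / 8) / 1.2

print(round(estimate_vram_gb(7, precision=16), 1))   # ~11.7 GB for a 7B model at 16-bit
print(round(estimate_vram_gb(70, precision=32), 1))  # ~233.3 GB for a 70B model at 32-bit
```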
@@ -1705,7 +1701,7 @@ def job__status(name, *others):
      return

  @arguably.command
- def job__list(*others, detailed=False):
+ def job__list(*others):
      """
      List jobs in the cluster
      """
@@ -1715,106 +1711,22 @@ def job__list(*others, detailed=False):
          console.log(f"[red]Problems with your pool: {str(e)}")
          return

-     data = {
-         "group": "batch.volcano.sh",
-         "api_version": "v1alpha1",
-         "plural": "jobs"
-     }
-     try:
-         result = request_to_server(
-             method="post",
-             endpoint="/v1/get_objects_of_type",
-             data=data,
-             server_creds=USER_LOCAL_SERVER_FILE,
-             user_cookie=USER_COOKIE
-         )
-         all_deployments = defaultdict(list)
-         for ns, ds in result.items():
-             all_deployments[ns].extend([d["metadata"]["labels"][TEMPLATE_LABEL] for d in ds["items"]])
-         #deployments = {ns: d["metadata"]["labels"][TEMPLATE_LABEL] for ns, ds in result.items() for d in ds["items"]}
-     except Exception as e:
-         console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-         return
-     if len(all_deployments.keys()) == 0:
+     all_deployments = fetch_job_names()
+     if "error" in all_deployments:
+         console.log(f"[red]Error when connecting to kalavai service: {all_deployments}")
+         return
+
+     if len(all_deployments) == 0:
          console.log("[green]No deployments found.")
          return

+     details = fetch_job_details(jobs=all_deployments)
+     if "error" in details:
+         console.log(f"[red]{details}")
+         return
      columns = ["Owner", "Deployment", "Workers", "Endpoint"]
-     if detailed:
-         columns.append("Status")
-     rows = []
-     for namespace, deployments in all_deployments.items():
-         for deployment in deployments:
-             try:
-                 # get status for deployment
-                 if detailed:
-                     data = {
-                         "group": "batch.volcano.sh",
-                         "api_version": "v1alpha1",
-                         "plural": "jobs",
-                         # "group": "leaderworkerset.x-k8s.io",
-                         # "api_version": "v1",
-                         # "plural": "leaderworkersets",
-                         "name": deployment
-                     }
-                     result = request_to_server(
-                         method="post",
-                         endpoint="/v1/get_status_for_object",
-                         data=data,
-                         server_creds=USER_LOCAL_SERVER_FILE,
-                         user_cookie=USER_COOKIE
-                     )
-                     ss = [] # flatten results ({namespace: statuses})
-                     [ss.extend(values) for values in result.values()]
-                     if len(ss) > 0:
-                         last = ss[-1]
-                         statuses = f"[{last['lastTransitionTime']}] {last['status']}"
-                     else:
-                         statuses = "Unknown"
-                 # get pod statuses
-                 data = {
-                     "label": TEMPLATE_LABEL,
-                     "value": deployment
-                 }
-                 result = request_to_server(
-                     method="post",
-                     endpoint="/v1/get_pods_status_for_label",
-                     data=data,
-                     server_creds=USER_LOCAL_SERVER_FILE,
-                     user_cookie=USER_COOKIE
-                 )
-                 workers_status = defaultdict(int)
-                 for ns, ss in result.items():
-                     if ns != namespace: # same job name, different namespace
-                         continue
-                     for _, values in ss.items():
-                         workers_status[values["status"]] += 1
-                 workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
-                 # get URL details
-                 data = {
-                     "label": TEMPLATE_LABEL,
-                     "value": deployment,
-                     "types": ["NodePort"]
-                 }
-                 result = request_to_server(
-                     method="post",
-                     endpoint="/v1/get_ports_for_services",
-                     data=data,
-                     server_creds=USER_LOCAL_SERVER_FILE,
-                     user_cookie=USER_COOKIE
-                 )
-                 node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
-
-                 urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
-                 row = [namespace, deployment, workers, "\n".join(urls)]
-                 if detailed:
-                     row.append(statuses)
-                 rows.append(row)
-
-             except Exception as e:
-                 console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-                 return
-
+     rows = [[job.owner, job.name, job.workers, job.endpoint] for job in details]
+

      console.print(
          generate_table(columns=columns, rows=rows, end_sections=range(len(rows)))
@@ -1836,26 +1748,19 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp

      if force_namespace is not None:
          console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
-
-     data = {
-         "label": TEMPLATE_LABEL,
-         "value": name,
-         "tail": tail
-     }
-     if force_namespace is not None:
-         data["force_namespace"] = force_namespace
+
+     all_logs = fetch_job_logs(
+         job_name=name,
+         pod_name=pod_name,
+         force_namespace=force_namespace,
+         tail=tail)
+     if "error" in all_logs:
+         console.log(f"[red]{all_logs}")
+         return
      while True:
          try:
-             # send tail as parameter (fetch only last _tail_ lines)
-             result = request_to_server(
-                 method="post",
-                 endpoint="/v1/get_logs_for_label",
-                 data=data,
-                 server_creds=USER_LOCAL_SERVER_FILE,
-                 user_cookie=USER_COOKIE
-             )
              if not stream:
-                 for pod, logs in result.items():
+                 for pod, logs in all_logs.items():
                      if pod_name is not None and pod_name != pod:
                          continue
                      console.log(f"[yellow]Pod {pod}")
@@ -1863,7 +1768,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
                  break
              else:
                  os.system("clear")
-                 for pod, logs in result.items():
+                 for pod, logs in all_logs.items():
                      if pod_name is not None and pod_name != pod:
                          continue
                      print(f"Pod {pod}")
@@ -1871,10 +1776,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
              time.sleep(1)
          except KeyboardInterrupt:
              break
-         except Exception as e:
-             console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-             console.log(f"Check if {name} is running with [yellow]kalavai job list")
-             return
+


  @arguably.command
  def job__manifest(*others, name, force_namespace: str=None):
@@ -7,10 +7,11 @@ from kalavai_client.utils import (
      run_cmd,
      check_gpu_drivers,
      validate_poolconfig,
-     user_path,
      populate_template
  )

+ from kalavai_client.env import user_path
+

  class Cluster(ABC):
      @abstractmethod
@@ -133,6 +134,9 @@ class dockerCluster(Cluster):
          if not os.path.isfile(self.compose_file):
              return False
          status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()
+         if not status:
+             return False
+         status = (0 == os.system(f'docker exec {self.container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'))
          return status

      def is_seed_node(self):
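The strengthened `is_agent_running` check now verifies two layers: the compose service is listed as running *and* a `k3s server`/`k3s agent` process is actually alive inside the container. A standalone sketch of the same idea (paths follow the diff's `user_path` layout; the container name is hypothetical):

```python
import os
import subprocess

def agent_running(compose_file: str, container_name: str) -> bool:
    # Layer 1: is the compose service reported as running?
    services = subprocess.check_output(
        f"docker compose -f {compose_file} ps --services --status=running",
        shell=True).decode()
    if container_name not in services:
        return False
    # Layer 2: is a k3s server/agent process alive inside the container?
    # os.system returns 0 when grep finds a match
    return 0 == os.system(
        f'docker exec {container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"')

# Hypothetical usage (container name is an assumption, not from the diff):
# agent_running(os.path.expanduser("~/.cache/kalavai/docker-compose-worker.yaml"), "kalavai-worker")
```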
@@ -0,0 +1,227 @@
+ from collections import defaultdict
+ import math
+
+ from pydantic import BaseModel
+
+ from kalavai_client.utils import (
+     request_to_server,
+     load_server_info
+ )
+ from kalavai_client.env import (
+     USER_COOKIE,
+     USER_LOCAL_SERVER_FILE,
+     TEMPLATE_LABEL,
+     SERVER_IP_KEY
+ )
+
+ class Job(BaseModel):
+     owner: str = None
+     name: str = None
+     workers: str = None
+     endpoint: str = None
+
+ class DeviceStatus(BaseModel):
+     name: str
+     memory_pressure: bool
+     disk_pressure: bool
+     pid_pressure: bool
+     ready: bool
+     unschedulable: bool
+
+ class GPU(BaseModel):
+     node: str
+     available: int
+     total: int
+     ready: bool
+     model: str
+
+
+ def fetch_resources():
+     try:
+         total = request_to_server(
+             method="get",
+             endpoint="/v1/get_cluster_total_resources",
+             data={},
+             server_creds=USER_LOCAL_SERVER_FILE,
+             user_cookie=USER_COOKIE
+         )
+         available = request_to_server(
+             method="get",
+             endpoint="/v1/get_cluster_available_resources",
+             data={},
+             server_creds=USER_LOCAL_SERVER_FILE,
+             user_cookie=USER_COOKIE
+         )
+     except Exception as e:
+         return {"error": str(e)}
+
+     return {"total": total, "available": available}
+
+ def fetch_job_names():
+     data = {
+         "group": "batch.volcano.sh",
+         "api_version": "v1alpha1",
+         "plural": "jobs"
+     }
+     try:
+         jobs = request_to_server(
+             method="post",
+             endpoint="/v1/get_objects_of_type",
+             data=data,
+             server_creds=USER_LOCAL_SERVER_FILE,
+             user_cookie=USER_COOKIE
+         )
+         all_jobs = []
+         for ns, ds in jobs.items():
+             all_jobs.extend([Job(owner=ns, name=d["metadata"]["labels"][TEMPLATE_LABEL]) for d in ds["items"]])
+     except Exception as e:
+         return {"error": str(e)}
+
+     return all_jobs
+
+ def fetch_job_details(jobs: list[Job]):
+     """Get details for a list of Job entries (owner=namespace, name=deployment)"""
+     job_details = []
+     for job in jobs:
+         namespace = job.owner
+         deployment = job.name
+         try:
+             # get pod statuses
+             data = {
+                 "label": TEMPLATE_LABEL,
+                 "value": deployment
+             }
+             result = request_to_server(
+                 method="post",
+                 endpoint="/v1/get_pods_status_for_label",
+                 data=data,
+                 server_creds=USER_LOCAL_SERVER_FILE,
+                 user_cookie=USER_COOKIE
+             )
+             workers_status = defaultdict(int)
+             for ns, ss in result.items():
+                 if ns != namespace: # same job name, different namespace
+                     continue
+                 for _, values in ss.items():
+                     workers_status[values["status"]] += 1
+             workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
+             # get URL details
+             data = {
+                 "label": TEMPLATE_LABEL,
+                 "value": deployment,
+                 "types": ["NodePort"]
+             }
+             result = request_to_server(
+                 method="post",
+                 endpoint="/v1/get_ports_for_services",
+                 data=data,
+                 server_creds=USER_LOCAL_SERVER_FILE,
+                 user_cookie=USER_COOKIE
+             )
+             node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
+
+             urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
+             job_details.append(
+                 Job(owner=namespace,
+                     name=deployment,
+                     workers=workers,
+                     endpoint="\n".join(urls))
+             )
+
+         except Exception as e:
+             return {"error": str(e)}
+
+     return job_details
+
+ def fetch_devices():
+     """Load devices status info for all hosts"""
+     try:
+         data = request_to_server(
+             method="get",
+             endpoint="/v1/get_nodes",
+             data={},
+             server_creds=USER_LOCAL_SERVER_FILE,
+             user_cookie=USER_COOKIE
+         )
+         devices = []
+         for node, status in data.items():
+             devices.append(
+                 DeviceStatus(
+                     name=node,
+                     memory_pressure=status["MemoryPressure"],
+                     disk_pressure=status["DiskPressure"],
+                     pid_pressure=status["PIDPressure"],
+                     ready=status["Ready"],
+                     unschedulable=status["unschedulable"]
+                 )
+             )
+         return devices
+
+     except Exception as e:
+         return {"error": str(e)}
+
+ def fetch_job_logs(job_name, force_namespace=None, pod_name=None, tail=100):
+     data = {
+         "label": TEMPLATE_LABEL,
+         "value": job_name,
+         "tail": tail
+     }
+     if force_namespace is not None:
+         data["force_namespace"] = force_namespace
+     try:
+         # send tail as parameter (fetch only last _tail_ lines)
+         all_logs = request_to_server(
+             method="post",
+             endpoint="/v1/get_logs_for_label",
+             data=data,
+             server_creds=USER_LOCAL_SERVER_FILE,
+             user_cookie=USER_COOKIE
+         )
+         return {pod: logs for pod, logs in all_logs.items() if pod_name is None or pod_name == pod}
+
+     except Exception as e:
+         return {"error": str(e)}
+
+
+ def load_gpu_models():
+     data = request_to_server(
+         method="post",
+         endpoint="/v1/get_node_gpus",
+         data={},
+         server_creds=USER_LOCAL_SERVER_FILE,
+         user_cookie=USER_COOKIE
+     )
+     return data.items()
+
+ def fetch_gpus(available=False):
+     try:
+         data = load_gpu_models()
+         all_gpus = []
+         for node, gpus in data:
+             row_gpus = []
+             for gpu in gpus["gpus"]:
+                 status = gpu["ready"] if "ready" in gpu else True
+                 if available and not status:
+                     continue
+                 row_gpus.append((f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
+             if len(row_gpus) > 0:
+                 models, statuses = zip(*row_gpus)
+                 all_gpus.extend([
+                     GPU(
+                         node=node,
+                         ready=status,
+                         model=model,
+                         available=gpus["available"],
+                         total=gpus["capacity"]
+                     ) for model, status in zip(models, statuses)
+                 ])
+         return all_gpus
+
+     except Exception as e:
+         return {"error": str(e)}
@@ -0,0 +1,19 @@
+ import os
+ from pathlib import Path
+
+
+ def user_path(relative_path, create_path=False):
+     """Transform a relative path into the user's cache folder path"""
+     base = os.path.expanduser("~")
+     kalavai_user_path = os.path.join(base, ".cache/kalavai")
+     full_path = os.path.join(kalavai_user_path, relative_path)
+     if create_path:
+         Path(full_path).mkdir(parents=True, exist_ok=True)
+
+     return full_path
+
+
+ USER_LOCAL_SERVER_FILE = user_path(".server")
+ USER_COOKIE = user_path(".user_cookie.pkl")
+ TEMPLATE_LABEL = "kalavai.job.name"
+ SERVER_IP_KEY = "server_ip"
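The new `env.py` gives shared constants and the cache-path helper a dependency-free home, which lets `cli.py`, `cluster.py`, `core.py` and `utils.py` all import them without circular imports. For reference, `user_path` simply roots relative paths under `~/.cache/kalavai`:

```python
from kalavai_client.env import user_path

print(user_path(".server"))
# e.g. /home/alice/.cache/kalavai/.server (the home directory varies by user)

# create_path=True also creates the directory, as used for the templates folder
templates_dir = user_path("templates", create_path=True)
```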
@@ -14,6 +14,9 @@ import yaml


  from kalavai_client.auth import KalavaiAuthClient
+ from kalavai_client.env import (
+     SERVER_IP_KEY
+ )


  GITHUB_ORG = "kalavai-net"
@@ -22,7 +25,6 @@ GITHUB_TEMPLATE_PATH = "templates"
  USER_NODE_LABEL_KEY = "user_node_label"
  CLUSTER_IP_KEY = "cluster_ip"
  CLUSTER_TOKEN_KEY = "cluster_token"
- SERVER_IP_KEY = "server_ip"
  NODE_NAME_KEY = "node_name"
  PUBLIC_LOCATION_KEY = "public_location"
  CLUSTER_NAME_KEY = "cluster_name"
@@ -397,16 +399,6 @@ def resource_path(relative_path: str):
          return None
      return resource

- def user_path(relative_path, create_path=False):
-     """Transform a relative path into the user's cache folder path"""
-     base = os.path.expanduser("~")
-     kalavai_user_path = os.path.join(base, ".cache/kalavai")
-     full_path = os.path.join(kalavai_user_path, relative_path)
-     if create_path:
-         Path(full_path).mkdir(parents=True, exist_ok=True)
-
-     return full_path
-
  def safe_remove(filepath, force=True):
      if not os.path.exists(filepath):
          return
@@ -1,6 +1,6 @@
  [project]
  name = "kalavai-client"
- version = "0.5.12"
+ version = "0.5.14"
  authors = [
      {name = "Carlos Fernandez Musoles", email = "carlos@kalavai.net"}
  ]
@@ -1,2 +0,0 @@
-
- __version__ = "0.5.12"