kalavai-client 0.5.12.tar.gz → 0.5.13.tar.gz

Files changed (22)
  1. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/PKG-INFO +5 -2
  2. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/README.md +4 -1
  3. kalavai_client-0.5.13/kalavai_client/__init__.py +2 -0
  4. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/assets/apps.yaml +1 -1
  5. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/cli.py +119 -216
  6. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/cluster.py +3 -0
  7. kalavai_client-0.5.13/kalavai_client/core.py +227 -0
  8. kalavai_client-0.5.13/kalavai_client/env.py +19 -0
  9. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/utils.py +3 -11
  10. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/pyproject.toml +1 -1
  11. kalavai_client-0.5.12/kalavai_client/__init__.py +0 -2
  12. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/LICENSE +0 -0
  13. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/__main__.py +0 -0
  14. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/assets/__init__.py +0 -0
  15. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/assets/apps_values.yaml +0 -0
  16. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/assets/docker-compose-template.yaml +0 -0
  17. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/assets/nginx.conf +0 -0
  18. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/assets/pool_config_template.yaml +0 -0
  19. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/assets/pool_config_values.yaml +0 -0
  20. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/assets/user_workspace.yaml +0 -0
  21. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/assets/user_workspace_values.yaml +0 -0
  22. {kalavai_client-0.5.12 → kalavai_client-0.5.13}/kalavai_client/auth.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: kalavai-client
-Version: 0.5.12
+Version: 0.5.13
 Summary: Client app for kalavai platform
 License: Apache-2.0
 Keywords: LLM,platform
@@ -89,6 +89,9 @@ https://github.com/user-attachments/assets/0d2316f3-79ea-46ac-b41e-8ef720f52672
 
 ### News updates
 
+<img src="docs/docs/assets/images/DeepSeek-Emblem.png" width="100">
+
+- 6 February 2025: 🔥🔥🔥 Access **DeepSeek R1 model for free** when you join our [public LLM pool](https://kalavai-net.github.io/kalavai-client/public_llm_pool/)
 - 31 January 2025: `kalavai-client` is now a [PyPI package](https://pypi.org/project/kalavai-client/), easier to install than ever!
 - 27 January 2025: Support for accessing pools from remote computers
 - 9 January 2025: Added support for [Aphrodite Engine](https://github.com/aphrodite-engine/aphrodite-engine) models
@@ -140,7 +143,7 @@ From release **v0.5.0, you can now install `kalavai-client` in non-worker comput
 For workers sharing resources with the pool:
 
 - A laptop, desktop or Virtual Machine
-- Docker engine installed (for [linux](https://docs.docker.com/engine/install/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
+- Docker engine installed (for [linux](https://docs.docker.com/engine/install/ubuntu/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
 
 > **Support for Windows and MacOS workers is experimental**: kalavai workers run on docker containers that require access to the host network interfaces, thus systems that do not support containers natively (Windows and MacOS) may have difficulties finding each other.
 

README.md
@@ -46,6 +46,9 @@ https://github.com/user-attachments/assets/0d2316f3-79ea-46ac-b41e-8ef720f52672
 
 ### News updates
 
+<img src="docs/docs/assets/images/DeepSeek-Emblem.png" width="100">
+
+- 6 February 2025: 🔥🔥🔥 Access **DeepSeek R1 model for free** when you join our [public LLM pool](https://kalavai-net.github.io/kalavai-client/public_llm_pool/)
 - 31 January 2025: `kalavai-client` is now a [PyPI package](https://pypi.org/project/kalavai-client/), easier to install than ever!
 - 27 January 2025: Support for accessing pools from remote computers
 - 9 January 2025: Added support for [Aphrodite Engine](https://github.com/aphrodite-engine/aphrodite-engine) models
@@ -97,7 +100,7 @@ From release **v0.5.0, you can now install `kalavai-client` in non-worker comput
 For workers sharing resources with the pool:
 
 - A laptop, desktop or Virtual Machine
-- Docker engine installed (for [linux](https://docs.docker.com/engine/install/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
+- Docker engine installed (for [linux](https://docs.docker.com/engine/install/ubuntu/), [Windows and MacOS](https://docs.docker.com/desktop/)) with [privilege access](https://docs.docker.com/engine/containers/run/#runtime-privilege-and-linux-capabilities).
 
 > **Support for Windows and MacOS workers is experimental**: kalavai workers run on docker containers that require access to the host network interfaces, thus systems that do not support containers natively (Windows and MacOS) may have difficulties finding each other.
 

kalavai_client-0.5.13/kalavai_client/__init__.py (new file)
@@ -0,0 +1,2 @@
+
+__version__ = "0.5.13"

kalavai_client/assets/apps.yaml
@@ -139,7 +139,7 @@ releases:
       - name: replicas
         value: 2
       - name: image_tag
-        value: "v2025.01.1"
+        value: "v2025.01.9"
       - name: deployment.in_cluster
         value: "True"
       - name: deployment.use_auth_key

kalavai_client/cli.py
@@ -15,6 +15,21 @@ import netifaces as ni
 import arguably
 from rich.console import Console
 
+from kalavai_client.env import (
+    USER_COOKIE,
+    USER_LOCAL_SERVER_FILE,
+    TEMPLATE_LABEL,
+    user_path
+)
+from kalavai_client.core import (
+    fetch_resources,
+    fetch_job_names,
+    fetch_job_details,
+    fetch_devices,
+    fetch_job_logs,
+    fetch_gpus,
+    load_gpu_models
+)
 from kalavai_client.utils import (
     check_gpu_drivers,
     run_cmd,
@@ -27,10 +42,8 @@ from kalavai_client.utils import (
     generate_table,
     request_to_server,
     resource_path,
-    user_path,
     safe_remove,
     leave_vpn,
-    join_vpn,
     get_vpn_details,
     load_server_info,
     user_login,
@@ -68,7 +81,6 @@ LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None)
 VERSION = 1
 RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"]
 CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"]
-TEMPLATE_LABEL = "kalavai.job.name"
 RAY_LABEL = "kalavai.ray.name"
 PVC_NAME_LABEL = "kalavai.storage.name"
 DOCKER_COMPOSE_TEMPLATE = resource_path("kalavai_client/assets/docker-compose-template.yaml")
@@ -98,9 +110,7 @@ USER_COMPOSE_FILE = user_path("docker-compose-worker.yaml")
 USER_VPN_COMPOSE_FILE = user_path("docker-compose-vpn.yaml")
 USER_HELM_APPS_FILE = user_path("apps.yaml")
 USER_KUBECONFIG_FILE = user_path("kubeconfig")
-USER_LOCAL_SERVER_FILE = user_path(".server")
 USER_TEMPLATES_FOLDER = user_path("templates", create_path=True)
-USER_COOKIE = user_path(".user_cookie.pkl")
 
 
 console = Console()
@@ -118,7 +128,7 @@ CLUSTER = dockerCluster(
 ######################
 ## HELPER FUNCTIONS ##
 ######################
-
+
 def check_seed_compatibility():
     """Check required packages to start pools"""
     logs = []
@@ -288,21 +298,11 @@ def select_ip_address(subnet=None):
             console.log("[red] Input error")
     return ips[option]
 
-def fetch_gpus():
-    data = request_to_server(
-        method="post",
-        endpoint="/v1/get_node_gpus",
-        data={},
-        server_creds=USER_LOCAL_SERVER_FILE,
-        user_cookie=USER_COOKIE
-    )
-    return data.items()
-
 def select_gpus(message):
     console.log(f"[yellow]{message}")
     gpu_models = ["Any/None"]
     gpu_models_full = ["Any/None"]
-    available_gpus = fetch_gpus()
+    available_gpus = load_gpu_models()
     for _, gpus in available_gpus:
         for gpu in gpus["gpus"]:
             #status = "free" if "ready" in gpu else "busy"
@@ -737,7 +737,7 @@ def pool__check_token(token, *others, public=False):
 
 
 @arguably.command
-def pool__join(token, *others, node_name=None, ip_address: str=None):
+def pool__join(token, *others, node_name=None):
     """
     Join Kalavai pool and start/resume sharing resources.
 
@@ -860,7 +860,7 @@ def pool__join(token, *others, node_name=None, ip_address: str=None):
         CLUSTER.start_worker_node()
     except Exception as e:
         console.log(f"[red] Error connecting to {cluster_name} @ {kalavai_seed_ip}. Check with the admin if the token is still valid.")
-        leave_vpn(container_name=DEFAULT_VPN_CONTAINER_NAME)
+        pool__stop()
         exit()
 
     # ensure we are connected
@@ -869,6 +869,22 @@
         time.sleep(30)
         if is_watcher_alive(server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE):
             break
+
+    # send note to server to let them know the node is coming online
+    if not pre_join_check(node_name=node_name, server_url=watcher_service, server_key=auth_key):
+        console.log(f"[red] Failed pre join checks. Server offline or node '{node_name}' may already exist. Please specify a different one with [yellow]--node-name'")
+        pool__stop()
+        return
+
+    # check the node has connected successfully
+    try:
+        while not CLUSTER.is_agent_running():
+            console.log("waiting for runner, may take a few minutes... Press <ctrl+c> to stop")
+            time.sleep(30)
+    except KeyboardInterrupt:
+        console.log("[red]Installation aborted. Leaving pool.")
+        pool__stop()
+        return
 
     init_user_workspace()
 
@@ -963,29 +979,24 @@ def pool__gpus(*others, available=False):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-    try:
-        data = fetch_gpus()
-        columns, rows = [], []
-        for node, gpus in data:
-            row_gpus = []
-            for gpu in gpus["gpus"]:
-                status = gpu["ready"] if "ready" in gpu else True
-                if available and not status:
-                    continue
-                row_gpus.append( (f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
-            if len(row_gpus) > 0:
-                models, statuses = zip(*row_gpus)
-                rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
-
-        columns = ["Ready", "GPU(s)", "Available", "Total"]
-        columns = ["Node"] + columns
-        console.print(
-            generate_table(columns=columns, rows=rows,end_sections=[n for n in range(len(rows))])
-        )
-
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-
+    gpus = fetch_gpus(available=available)
+    if "error" in gpus:
+        console.log(f"[red]Error when fetching gpus: {gpus}")
+        return
+
+    columns = ["Node", "Ready", "GPU(s)", "Available", "Total"]
+    rows = []
+    for gpu in gpus:
+        rows.append([
+            gpu.node,
+            str(gpu.ready),
+            gpu.model,
+            str(gpu.available),
+            str(gpu.total)
+        ])
+    console.print(
+        generate_table(columns=columns, rows=rows,end_sections=[n for n in range(len(rows))])
+    )
 
 @arguably.command
 def pool__resources(*others):
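
Since the table-building now sits behind `kalavai_client.core`, the same data can be consumed programmatically. A minimal sketch of calling `fetch_gpus` directly, assuming the client is already joined to a pool:

```python
from kalavai_client.core import fetch_gpus

gpus = fetch_gpus(available=False)  # available=True keeps only free GPUs
if isinstance(gpus, dict) and "error" in gpus:
    raise RuntimeError(f"kalavai service unreachable: {gpus['error']}")

for gpu in gpus:
    # GPU is a pydantic model with node, model, ready, available, total
    print(f"{gpu.node}: {gpu.model} (ready={gpu.ready}, {gpu.available}/{gpu.total} free)")
```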
@@ -998,45 +1009,33 @@ def pool__resources(*others):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-    try:
-        total = request_to_server(
-            method="get",
-            endpoint="/v1/get_cluster_total_resources",
-            data={},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        available = request_to_server(
-            method="get",
-            endpoint="/v1/get_cluster_available_resources",
-            data={},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        columns = []
-        total_values = []
-        available_values = []
-        for col in total.keys():
-            if col in RESOURCE_EXCLUDE:
-                continue
-            columns.append(col)
-            total_values.append(str(total[col]))
-            available_values.append(str(available[col]))
-
-        columns = [""] + columns
-        total_values = ["Total"] + total_values
-        available_values = ["Available"] + available_values
-
-        rows = [
-            tuple(available_values),
-            tuple(total_values)
-        ]
-        console.print(
-            generate_table(columns=columns, rows=rows, end_sections=[0, 1])
-        )
-
-    except Exception as e:
+    data = fetch_resources()
+    if "error" in data:
         console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
+        return
+
+    columns = []
+    total_values = []
+    available_values = []
+    for col in data["total"].keys():
+        if col in RESOURCE_EXCLUDE:
+            continue
+        columns.append(col)
+        total_values.append(str(data["total"][col]))
+        available_values.append(str(data["available"][col]))
+
+    columns = [""] + columns
+    total_values = ["Total"] + total_values
+    available_values = ["Available"] + available_values
+
+    rows = [
+        tuple(available_values),
+        tuple(total_values)
+    ]
+    console.print(
+        generate_table(columns=columns, rows=rows, end_sections=[0, 1])
+    )
+
 
 @arguably.command
 def pool__update(*others):
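
`fetch_resources()` returns either an error dict or a dict with `total` and `available` sub-dicts keyed by Kubernetes resource name. A short sketch (the resource keys shown are illustrative):

```python
from kalavai_client.core import fetch_resources

data = fetch_resources()
if "error" in data:
    raise RuntimeError(data["error"])

# e.g. {"total": {"cpu": ..., "memory": ..., "nvidia.com/gpu": ...},
#       "available": {...}} with the same keys
for resource, total in data["total"].items():
    print(f"{resource}: {data['available'].get(resource)} of {total} available")
```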
@@ -1333,22 +1332,18 @@ def node__list(*others):
         return
 
    try:
-        data = request_to_server(
-            method="get",
-            endpoint="/v1/get_nodes",
-            data={},
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
+        devices = fetch_devices()
         rows = []
-        columns = ["Node name"]
-        for node, status in data.items():
-            row = [node]
-            for key, value in status.items():
-                if key not in columns:
-                    columns.append(key)
-                row.append(str(value))
-            rows.append(tuple(row))
+        columns = ["Node name", "Memory Pressure", "Disk pressure", "PID pressure", "Ready", "Unschedulable"]
+        for device in devices:
+            rows.append([
+                device.name,
+                str(device.memory_pressure),
+                str(device.disk_pressure),
+                str(device.pid_pressure),
+                str(device.ready),
+                str(device.unschedulable)
+            ])
 
         console.log("Nodes with 'unschedulable=True' will not receive workload")
         console.log("To make a node unschedulable (i.e. won't receive workloads) use [yellow]kalavai node cordon <node name>")
@@ -1540,7 +1535,9 @@ def job__test(local_template_dir, *others, values, defaults, force_namespace: st
         console.log(f"[red]--values ({values}) is not a valid local file")
         return
     with open(values, "r") as f:
-        values_dict = yaml.safe_load(f)
+        raw_values = yaml.load(f, Loader=yaml.SafeLoader)
+        values_dict = {variable["name"]: variable['value'] for variable in raw_values}
+
     # load defaults
     if not os.path.isfile(defaults):
         console.log(f"[red]--defaults ({defaults}) is not a valid local file")
@@ -1648,7 +1645,7 @@ def job__estimate(billion_parameters, *others, precision=32):
 
     average_vram = 8
     required_memory = float(billion_parameters) * (precision / 8) / 1.2
-    available_gpus = fetch_gpus()
+    available_gpus = load_gpu_models()
     vrams = []
     for _, gpus in available_gpus:
         for model in gpus["gpus"]:
@@ -1705,7 +1702,7 @@ def job__status(name, *others):
         return
 
 @arguably.command
-def job__list(*others, detailed=False):
+def job__list(*others):
     """
     List jobs in the cluster
     """
@@ -1715,106 +1712,22 @@ def job__list(*others, detailed=False):
         console.log(f"[red]Problems with your pool: {str(e)}")
         return
 
-    data = {
-        "group": "batch.volcano.sh",
-        "api_version": "v1alpha1",
-        "plural": "jobs"
-    }
-    try:
-        result = request_to_server(
-            method="post",
-            endpoint="/v1/get_objects_of_type",
-            data=data,
-            server_creds=USER_LOCAL_SERVER_FILE,
-            user_cookie=USER_COOKIE
-        )
-        all_deployments = defaultdict(list)
-        for ns, ds in result.items():
-            all_deployments[ns].extend([d["metadata"]["labels"][TEMPLATE_LABEL] for d in ds["items"]])
-        #deployments = {ns: d["metadata"]["labels"][TEMPLATE_LABEL] for ns, ds in result.items() for d in ds["items"]}
-    except Exception as e:
-        console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-        return
-    if len(all_deployments.keys()) == 0:
+    all_deployments = fetch_job_names()
+    if "error" in all_deployments:
+        console.log(f"[red]Error when connecting to kalavai service: {all_deployments}")
+        return
+
+    if len(all_deployments) == 0:
         console.log("[green]No deployments found.")
         return
 
+    details = fetch_job_details(jobs=all_deployments)
+    if "error" in details:
+        console.log(f"[red]{details}")
+        return
     columns = ["Owner", "Deployment", "Workers", "Endpoint"]
-    if detailed:
-        columns.append("Status")
-    rows = []
-    for namespace, deployments in all_deployments.items():
-        for deployment in deployments:
-            try:
-                # get status for deployment
-                if detailed:
-                    data = {
-                        "group": "batch.volcano.sh",
-                        "api_version": "v1alpha1",
-                        "plural": "jobs",
-                        # "group": "leaderworkerset.x-k8s.io",
-                        # "api_version": "v1",
-                        # "plural": "leaderworkersets",
-                        "name": deployment
-                    }
-                    result = request_to_server(
-                        method="post",
-                        endpoint="/v1/get_status_for_object",
-                        data=data,
-                        server_creds=USER_LOCAL_SERVER_FILE,
-                        user_cookie=USER_COOKIE
-                    )
-                    ss = [] # flatten results ({namespace: statuses})
-                    [ss.extend(values) for values in result.values()]
-                    if len(ss) > 0:
-                        last = ss[-1]
-                        statuses = f"[{last['lastTransitionTime']}] {last['status']}"
-                    else:
-                        statuses = "Unknown"
-                # get pod statuses
-                data = {
-                    "label": TEMPLATE_LABEL,
-                    "value": deployment
-                }
-                result = request_to_server(
-                    method="post",
-                    endpoint="/v1/get_pods_status_for_label",
-                    data=data,
-                    server_creds=USER_LOCAL_SERVER_FILE,
-                    user_cookie=USER_COOKIE
-                )
-                workers_status = defaultdict(int)
-                for ns, ss in result.items():
-                    if ns != namespace: # same job name, different namespace
-                        continue
-                    for _, values in ss.items():
-                        workers_status[values["status"]] += 1
-                workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
-                # get URL details
-                data = {
-                    "label": TEMPLATE_LABEL,
-                    "value": deployment,
-                    "types": ["NodePort"]
-                }
-                result = request_to_server(
-                    method="post",
-                    endpoint="/v1/get_ports_for_services",
-                    data=data,
-                    server_creds=USER_LOCAL_SERVER_FILE,
-                    user_cookie=USER_COOKIE
-                )
-                node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
-
-                urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
-                row = [namespace, deployment, workers, "\n".join(urls)]
-                if detailed:
-                    row.append(statuses)
-                rows.append(row)
-
-            except Exception as e:
-                console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-                return
-
+    rows = [[job.owner, job.name, job.workers, job.endpoint] for job in details]
+
     console.print(
         generate_table(columns=columns, rows=rows, end_sections=range(len(rows)))
     )
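
The two-step pipeline the CLI delegates to can be reused directly; a minimal sketch, assuming a joined pool:

```python
from kalavai_client.core import fetch_job_names, fetch_job_details

jobs = fetch_job_names()  # list of Job(owner=..., name=...)
if isinstance(jobs, dict) and "error" in jobs:
    raise RuntimeError(jobs["error"])

details = fetch_job_details(jobs=jobs)  # fills in workers and endpoint
if isinstance(details, dict) and "error" in details:
    raise RuntimeError(details["error"])

for job in details:
    print(job.owner, job.name, job.endpoint)
```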
@@ -1836,26 +1749,19 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
 
     if force_namespace is not None:
         console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
-
-    data = {
-        "label": TEMPLATE_LABEL,
-        "value": name,
-        "tail": tail
-    }
-    if force_namespace is not None:
-        data["force_namespace"] = force_namespace
+
+    all_logs = fetch_job_logs(
+        job_name=name,
+        pod_name=pod_name,
+        force_namespace=force_namespace,
+        tail=tail)
+    if "error" in all_logs:
+        console.log(f"[red]{all_logs}")
+        return
     while True:
         try:
-            # send tail as parameter (fetch only last _tail_ lines)
-            result = request_to_server(
-                method="post",
-                endpoint="/v1/get_logs_for_label",
-                data=data,
-                server_creds=USER_LOCAL_SERVER_FILE,
-                user_cookie=USER_COOKIE
-            )
             if not stream:
-                for pod, logs in result.items():
+                for pod, logs in all_logs.items():
                     if pod_name is not None and pod_name != pod:
                         continue
                     console.log(f"[yellow]Pod {pod}")
@@ -1863,7 +1769,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
                     break
             else:
                 os.system("clear")
-                for pod, logs in result.items():
+                for pod, logs in all_logs.items():
                     if pod_name is not None and pod_name != pod:
                         continue
                     print(f"Pod {pod}")
@@ -1871,10 +1777,7 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
                 time.sleep(1)
         except KeyboardInterrupt:
             break
-        except Exception as e:
-            console.log(f"[red]Error when connecting to kalavai service: {str(e)}")
-            console.log(f"Check if {name} is running with [yellow]kalavai job list")
-            return
+
 
 @arguably.command
 def job__manifest(*others, name, force_namespace: str=None):
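
Log fetching follows the same pattern; a sketch with a hypothetical job name:

```python
from kalavai_client.core import fetch_job_logs

# last 50 log lines for every pod of a job named "my-llm" (hypothetical)
logs = fetch_job_logs(job_name="my-llm", tail=50)
if "error" in logs:
    raise RuntimeError(logs["error"])

for pod, pod_logs in logs.items():
    print(f"--- {pod} ---")
    print(pod_logs)
```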

kalavai_client/cluster.py
@@ -133,6 +133,9 @@ class dockerCluster(Cluster):
         if not os.path.isfile(self.compose_file):
             return False
         status = self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()
+        if not status:
+            return False
+        status = (0 == os.system(f'docker exec {self.container_name} ps aux | grep -v grep | grep -E "k3s (server|agent)"'))
         return status
 
     def is_seed_node(self):

kalavai_client-0.5.13/kalavai_client/core.py (new file)
@@ -0,0 +1,227 @@
+from collections import defaultdict
+import math
+
+from pydantic import BaseModel
+
+from kalavai_client.utils import (
+    request_to_server,
+    load_server_info
+)
+from kalavai_client.env import (
+    USER_COOKIE,
+    USER_LOCAL_SERVER_FILE,
+    TEMPLATE_LABEL,
+    SERVER_IP_KEY
+)
+
+class Job(BaseModel):
+    owner: str = None
+    name: str = None
+    workers: str = None
+    endpoint: str = None
+
+class DeviceStatus(BaseModel):
+    name: str
+    memory_pressure: bool
+    disk_pressure: bool
+    pid_pressure: bool
+    ready: bool
+    unschedulable: bool
+
+class GPU(BaseModel):
+    node: str
+    available: int
+    total: int
+    ready: bool
+    model: str
+
+
+def fetch_resources():
+    try:
+        total = request_to_server(
+            method="get",
+            endpoint="/v1/get_cluster_total_resources",
+            data={},
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        available = request_to_server(
+            method="get",
+            endpoint="/v1/get_cluster_available_resources",
+            data={},
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+    except Exception as e:
+        return {"error": str(e)}
+
+    return {"total": total, "available": available}
+
+def fetch_job_names():
+    data = {
+        "group": "batch.volcano.sh",
+        "api_version": "v1alpha1",
+        "plural": "jobs"
+    }
+    try:
+        jobs = request_to_server(
+            method="post",
+            endpoint="/v1/get_objects_of_type",
+            data=data,
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        all_jobs = []
+        for ns, ds in jobs.items():
+            all_jobs.extend([Job(owner=ns, name=d["metadata"]["labels"][TEMPLATE_LABEL]) for d in ds["items"]])
+    except Exception as e:
+        return {"error": str(e)}
+
+    return all_jobs
+
+def fetch_job_details(jobs: list[Job]):
+    """Get job details. A job is a dict:
+    {
+        "namespace": ns,
+        "name": name
+    }
+    """
+    job_details = []
+    for job in jobs:
+        namespace = job.owner
+        deployment = job.name
+        try:
+            # get pod statuses
+            data = {
+                "label": TEMPLATE_LABEL,
+                "value": deployment
+            }
+            result = request_to_server(
+                method="post",
+                endpoint="/v1/get_pods_status_for_label",
+                data=data,
+                server_creds=USER_LOCAL_SERVER_FILE,
+                user_cookie=USER_COOKIE
+            )
+            workers_status = defaultdict(int)
+            for ns, ss in result.items():
+                if ns != namespace: # same job name, different namespace
+                    continue
+                for _, values in ss.items():
+                    workers_status[values["status"]] += 1
+            workers = "\n".join([f"{k}: {v}" for k, v in workers_status.items()])
+            # get URL details
+            data = {
+                "label": TEMPLATE_LABEL,
+                "value": deployment,
+                "types": ["NodePort"]
+            }
+            result = request_to_server(
+                method="post",
+                endpoint="/v1/get_ports_for_services",
+                data=data,
+                server_creds=USER_LOCAL_SERVER_FILE,
+                user_cookie=USER_COOKIE
+            )
+            node_ports = [f"{p['node_port']} (mapped to {p['port']})" for s in result.values() for p in s["ports"]]
+
+            urls = [f"http://{load_server_info(data_key=SERVER_IP_KEY, file=USER_LOCAL_SERVER_FILE)}:{node_port}" for node_port in node_ports]
+            job_details.append(
+                Job(owner=namespace,
+                    name=deployment,
+                    workers=workers,
+                    endpoint="\n".join(urls))
+            )
+
+        except Exception as e:
+            return {"error": str(e)}
+
+    return job_details
+
+def fetch_devices():
+    """Load devices status info for all hosts"""
+    try:
+        data = request_to_server(
+            method="get",
+            endpoint="/v1/get_nodes",
+            data={},
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        devices = []
+        for node, status in data.items():
+            devices.append(
+                DeviceStatus(
+                    name=node,
+                    memory_pressure=status["MemoryPressure"],
+                    disk_pressure=status["DiskPressure"],
+                    pid_pressure=status["PIDPressure"],
+                    ready=status["Ready"],
+                    unschedulable=status["unschedulable"]
+                )
+            )
+        return devices
+
+    except Exception as e:
+        return {"error": str(e)}
+
+def fetch_job_logs(job_name, force_namespace=None, pod_name=None, tail=100):
+    data = {
+        "label": TEMPLATE_LABEL,
+        "value": job_name,
+        "tail": tail
+    }
+    if force_namespace is not None:
+        data["force_namespace"] = force_namespace
+    try:
+        # send tail as parameter (fetch only last _tail_ lines)
+        all_logs = request_to_server(
+            method="post",
+            endpoint="/v1/get_logs_for_label",
+            data=data,
+            server_creds=USER_LOCAL_SERVER_FILE,
+            user_cookie=USER_COOKIE
+        )
+        return {pod: logs for pod, logs in all_logs.items() if pod_name is None or pod_name == pod}
+
+    except Exception as e:
+        return {"error": str(e)}
+
+
+def load_gpu_models():
+    data = request_to_server(
+        method="post",
+        endpoint="/v1/get_node_gpus",
+        data={},
+        server_creds=USER_LOCAL_SERVER_FILE,
+        user_cookie=USER_COOKIE
+    )
+    return data.items()
+
+def fetch_gpus(available=False):
+    try:
+        data = load_gpu_models()
+        all_gpus = []
+        for node, gpus in data:
+            row_gpus = []
+            for gpu in gpus["gpus"]:
+                status = gpu["ready"] if "ready" in gpu else True
+                if available and not status:
+                    continue
+                row_gpus.append( (f"{gpu['model']} ({math.floor(int(gpu['memory'])/1000)} GBs)", str(status)))
+            if len(row_gpus) > 0:
+                models, statuses = zip(*row_gpus)
+                #rows.append([node, "\n".join(statuses), "\n".join(models), str(gpus["available"]), str(gpus["capacity"])])
+                all_gpus.extend([
+                    GPU(
+                        node=node,
+                        ready=status,
+                        model=model,
+                        available=gpus["available"],
+                        total=gpus["capacity"]
+                    ) for model, status in zip(models, statuses)
+                ])
+        return all_gpus
+
+    except Exception as e:
+        return {"error": str(e)}

kalavai_client-0.5.13/kalavai_client/env.py (new file)
@@ -0,0 +1,19 @@
+import os
+from pathlib import Path
+
+
+def user_path(relative_path, create_path=False):
+    """Transform a relative path into the user's cache folder path"""
+    base = os.path.expanduser("~")
+    kalavai_user_path = os.path.join(base, ".cache/kalavai")
+    full_path = os.path.join(kalavai_user_path, relative_path)
+    if create_path:
+        Path(full_path).mkdir(parents=True, exist_ok=True)
+
+    return full_path
+
+
+USER_LOCAL_SERVER_FILE = user_path(".server")
+USER_COOKIE = user_path(".user_cookie.pkl")
+TEMPLATE_LABEL = "kalavai.job.name"
+SERVER_IP_KEY = "server_ip"
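
For illustration, how the relocated helper resolves paths (output shown for a Linux home directory):

```python
from kalavai_client.env import user_path, USER_COOKIE

print(user_path(".server"))
# -> /home/<user>/.cache/kalavai/.server

# create_path=True also creates the directory if missing
templates = user_path("templates", create_path=True)
print(templates)    # -> /home/<user>/.cache/kalavai/templates
print(USER_COOKIE)  # -> /home/<user>/.cache/kalavai/.user_cookie.pkl
```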

kalavai_client/utils.py
@@ -14,6 +14,9 @@ import yaml
 
 
 from kalavai_client.auth import KalavaiAuthClient
+from kalavai_client.env import (
+    SERVER_IP_KEY
+)
 
 
 GITHUB_ORG = "kalavai-net"
@@ -22,7 +25,6 @@ GITHUB_TEMPLATE_PATH = "templates"
 USER_NODE_LABEL_KEY = "user_node_label"
 CLUSTER_IP_KEY = "cluster_ip"
 CLUSTER_TOKEN_KEY = "cluster_token"
-SERVER_IP_KEY = "server_ip"
 NODE_NAME_KEY = "node_name"
 PUBLIC_LOCATION_KEY = "public_location"
 CLUSTER_NAME_KEY = "cluster_name"
@@ -397,16 +399,6 @@ def resource_path(relative_path: str):
         return None
     return resource
 
-def user_path(relative_path, create_path=False):
-    """Transform a relative path into the user's cache folder path"""
-    base = os.path.expanduser("~")
-    kalavai_user_path = os.path.join(base, ".cache/kalavai")
-    full_path = os.path.join(kalavai_user_path, relative_path)
-    if create_path:
-        Path(full_path).mkdir(parents=True, exist_ok=True)
-
-    return full_path
-
 def safe_remove(filepath, force=True):
     if not os.path.exists(filepath):
         return

pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "kalavai-client"
-version = "0.5.12"
+version = "0.5.13"
 authors = [
     {name = "Carlos Fernandez Musoles", email = "carlos@kalavai.net"}
 ]

kalavai_client-0.5.12/kalavai_client/__init__.py (deleted)
@@ -1,2 +0,0 @@
-
-__version__ = "0.5.12"