kalavai-client 0.5.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,308 @@
1
+ import os
2
+ import time
3
+ from pathlib import Path
4
+ from abc import ABC, abstractmethod
5
+
6
+ from kalavai_client.utils import (
7
+ run_cmd,
8
+ check_gpu_drivers,
9
+ validate_poolconfig,
10
+ user_path
11
+ )
12
+
13
+
14
class Cluster(ABC):
    """Abstract interface for managing the lifecycle of a compute-pool node.

    Concrete implementations (a docker-compose deployment or a bare-metal
    k3s install) must provide ways to start seed/worker nodes, sync
    in-cluster dependencies and inspect/control the local agent.
    """

    @abstractmethod
    def start_seed_node(self, ip_address, labels, flannel_iface):
        """Start this machine as the seed (server) node of the pool."""
        raise NotImplementedError()

    @abstractmethod
    def start_worker_node(self, url, token, node_name, auth_key, watcher_service, ip_address, labels, flannel_iface):
        """Join this machine to an existing pool as a worker node."""
        raise NotImplementedError()

    @abstractmethod
    def update_dependencies(self, dependencies_files):
        """Install/sync in-cluster dependencies (e.g. via helmfile)."""
        raise NotImplementedError()

    @abstractmethod
    def remove_agent(self):
        """Uninstall the local agent and remove node state."""
        raise NotImplementedError()

    @abstractmethod
    def is_agent_running(self) -> bool:
        """Return True if the local agent (server or worker) is active."""
        raise NotImplementedError()

    @abstractmethod
    def is_seed_node(self) -> bool:
        """Return True if this machine runs the pool's server process."""
        raise NotImplementedError()

    @abstractmethod
    def is_cluster_init(self) -> bool:
        """Return True if a pool has been initialised on this machine."""
        raise NotImplementedError()

    @abstractmethod
    def pause_agent(self) -> bool:
        """Stop the local agent; return True on success."""
        raise NotImplementedError()

    @abstractmethod
    def restart_agent(self) -> bool:
        """Start the local agent again; return True if it ends up running."""
        raise NotImplementedError()

    @abstractmethod
    def get_cluster_token(self) -> str:
        """Return the pool join token (seed node only), else None."""
        raise NotImplementedError()

    @abstractmethod
    def diagnostics(self) -> str:
        """Return a human-readable cluster status report (seed node only)."""
        raise NotImplementedError()

    @abstractmethod
    def validate_cluster(self) -> bool:
        """Check that local pool state/credentials are consistent."""
        # Fix: instantiate the exception, consistent with every other method
        raise NotImplementedError()
64
+
65
class dockerCluster(Cluster):
    """Cluster implementation backed by a k3s-in-docker compose deployment."""

    def __init__(self, container_name, compose_file, kubeconfig_file, poolconfig_file, dependencies_file, kube_version="v1.31.1+k3s1", flannel_iface=None):
        """Store deployment file locations and node settings.

        Args:
            container_name: name of the k3s node service in the compose file.
            compose_file: path to the docker compose file driving the node.
            kubeconfig_file: destination path for the cluster kubeconfig.
            poolconfig_file: path of the cached pool credentials file.
            dependencies_file: helmfile describing in-cluster dependencies.
            kube_version: k3s version to deploy.
            flannel_iface: optional network interface for flannel traffic.
        """
        self.kube_version = kube_version
        self.container_name = container_name
        self.compose_file = compose_file
        self.kubeconfig_file = kubeconfig_file
        self.poolconfig_file = poolconfig_file
        self.dependencies_file = dependencies_file
        # Empty string means "let flannel pick the default interface"
        self.default_flannel_iface = flannel_iface if flannel_iface is not None else ""

    def start_seed_node(self):
        """Bring up the seed (server) container and fetch its kubeconfig."""
        run_cmd(f"docker compose -f {self.compose_file} up -d")
        # Give the k3s server a moment to write its kubeconfig before copying
        time.sleep(5)
        run_cmd(f"docker cp {self.container_name}:/etc/rancher/k3s/k3s.yaml {self.kubeconfig_file}")

    def start_worker_node(self):
        """Bring up the worker (agent) container."""
        run_cmd(f"docker compose -f {self.compose_file} up -d")

    def update_dependencies(self, dependencies_file=None, debug=False, retries=3):
        """Sync in-cluster dependencies with helmfile (run inside docker).

        Retries up to `retries` times before raising. Pass debug=True to
        surface helmfile output instead of discarding it.

        Raises:
            Exception: when all retries are exhausted.
        """
        if dependencies_file is not None:
            self.dependencies_file = dependencies_file
        output = "" if debug else " >/dev/null 2>&1"
        while True:
            try:
                home = user_path("")
                # Mount the user dir so helmfile can read the dependencies file
                run_cmd(f"docker run --rm --net=host -v {home}:{home} ghcr.io/helmfile/helmfile:v0.169.2 helmfile sync --file {self.dependencies_file} --kubeconfig {self.kubeconfig_file} {output}")
                break
            except Exception as e:
                if retries > 0:
                    retries -= 1
                    print(f"[{retries}] Dependencies failed. Retrying...")
                else:
                    # Chain the original error so the full trace is preserved
                    raise Exception(f"Dependencies failed. Are you connected to the internet?\n\nTrace: {str(e)}") from e

    def remove_agent(self):
        """Tear down the compose deployment; return True on success."""
        try:
            run_cmd(f'docker compose -f {self.compose_file} down')
            return True
        # Narrowed from a bare except: don't swallow KeyboardInterrupt/SystemExit
        except Exception:
            return False

    def is_agent_running(self):
        """Return True if the node service is reported running by compose."""
        if not os.path.isfile(self.compose_file):
            return False
        return self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()

    def is_seed_node(self):
        """Return True if a 'server' service is running in the deployment."""
        if not os.path.isfile(self.compose_file):
            return False
        return "server" in run_cmd(f"docker compose -f {self.compose_file} ps --services --status=running").decode()

    def is_cluster_init(self):
        """Return True if the node service exists at all (running or not)."""
        if not os.path.isfile(self.compose_file):
            return False
        return self.container_name in run_cmd(f"docker compose -f {self.compose_file} ps --services --all").decode()

    def pause_agent(self):
        """Stop the compose services; return True on success."""
        try:
            run_cmd(f'docker compose -f {self.compose_file} stop')
            return True
        except Exception:
            return False

    def restart_agent(self):
        """Start the compose services; report whether the agent came up."""
        try:
            run_cmd(f'docker compose -f {self.compose_file} start')
        except Exception:
            pass
        return self.is_agent_running()

    def get_cluster_token(self):
        """Return the k3s join token from the server container, or None."""
        if self.is_seed_node():
            return run_cmd(f"docker container exec {self.container_name} cat /var/lib/rancher/k3s/server/node-token").decode()
        return None

    def diagnostics(self) -> str:
        """Return pod and node listings from the seed container, or None."""
        # TODO: check cache files are in order
        if self.is_seed_node():
            return run_cmd(f"docker exec {self.container_name} kubectl get pods -A -o wide").decode() + "\n\n" + run_cmd(f"docker exec {self.container_name} kubectl get nodes").decode()
        return None

    def validate_cluster(self) -> bool:
        """Return True if pool credentials are cached locally."""
        return os.path.isfile(self.poolconfig_file)
169
+
170
+
171
class k3sCluster(Cluster):
    """Cluster implementation that installs k3s directly on the host."""

    def __init__(self, kubeconfig_file, poolconfig_file, dependencies_file, kube_version="v1.31.1+k3s1", flannel_iface=None):
        """Store node settings and probe for GPU drivers.

        Args:
            kubeconfig_file: destination path for the cluster kubeconfig.
            poolconfig_file: path of the cached pool credentials file.
            dependencies_file: helmfile describing in-cluster dependencies.
            kube_version: k3s version passed to the installer.
            flannel_iface: optional network interface for flannel traffic.
        """
        self.kube_version = kube_version
        self.kubeconfig_file = kubeconfig_file
        self.poolconfig_file = poolconfig_file
        self.dependencies_file = dependencies_file
        self.default_flannel_iface = flannel_iface if flannel_iface is not None else ""
        # Bug fix: set the default first. The original only assigned
        # node_labels inside the try/except branches, so a check_gpu_drivers()
        # that returned False without raising left the attribute unset and
        # caused an AttributeError later in start_seed_node/start_worker_node.
        self.node_labels = ""
        try:
            if check_gpu_drivers():
                self.node_labels = "--node-label gpu=on"
        except Exception:
            print("[Warning] issues detected with nvidia, GPU has been disabled for this node")

    def start_seed_node(self, ip_address, labels=None, is_public=False):
        """Install and start k3s in server mode on this machine.

        Args:
            ip_address: advertised node IP (internal and external).
            labels: optional dict of extra node labels.
            is_public: when True, pin flannel to the configured interface.
        """
        node_labels = self.node_labels
        if labels is not None:
            for key, value in labels.items():
                node_labels += f" --node-label {key}={value}"
        flannel_iface = f"--flannel-iface {self.default_flannel_iface}" if is_public else ""
        run_cmd(f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="server --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} --flannel-backend wireguard-native {node_labels}" sh - >/dev/null 2>&1')
        # Copy the kubeconfig where the client expects it and make it user-readable
        run_cmd(f"sudo cp /etc/rancher/k3s/k3s.yaml {self.kubeconfig_file}")
        run_cmd(f"sudo chown $USER {self.kubeconfig_file}")

    def start_worker_node(self, url, token, node_name, ip_address, labels=None, is_public=False):
        """Install and start k3s in agent mode, joining the pool at `url`.

        Args:
            url: host/IP of the seed node's k3s API (port 6443 assumed).
            token: k3s join token from the seed node.
            node_name: name to register this node under.
            ip_address: advertised node IP (internal and external).
            labels: optional dict of extra node labels.
            is_public: when True, pin flannel to the configured interface.
        """
        node_labels = self.node_labels
        if labels is not None:
            for key, value in labels.items():
                node_labels += f" --node-label {key}={value}"
        flannel_iface = f"--flannel-iface {self.default_flannel_iface}" if is_public else ""
        command = f'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION="{self.kube_version}" INSTALL_K3S_EXEC="agent --token {token} --server https://{url}:6443 --node-name {node_name} --node-ip {ip_address} --node-external-ip {ip_address} {flannel_iface} {node_labels}" sh - >/dev/null 2>&1'
        run_cmd(command)

    def update_dependencies(self, dependencies_file=None, debug=False, retries=3):
        """Sync in-cluster dependencies with a host-installed helmfile.

        Retries up to `retries` times before raising. Pass debug=True to
        surface helmfile output instead of discarding it.

        Raises:
            Exception: when all retries are exhausted.
        """
        if dependencies_file is not None:
            self.dependencies_file = dependencies_file
        output = "" if debug else " >/dev/null 2>&1"
        while True:
            try:
                run_cmd(f"helmfile sync --file {self.dependencies_file} --kubeconfig {self.kubeconfig_file} {output}")
                break
            except Exception as e:
                if retries > 0:
                    retries -= 1
                    print(f"[{retries}] Dependencies failed. Retrying...")
                else:
                    # Chain the original error so the full trace is preserved
                    raise Exception(f"Dependencies failed. Are you connected to the internet?\n\nTrace: {str(e)}") from e

    def remove_agent(self):
        """Uninstall k3s (server first, then agent); return True on success."""
        try:
            run_cmd('/usr/local/bin/k3s-uninstall.sh >/dev/null 2>&1')
            run_cmd('sudo rm -r /etc/rancher/node/ >/dev/null 2>&1')
            return True
        # Narrowed from a bare except: don't swallow KeyboardInterrupt/SystemExit
        except Exception:
            pass
        try:
            run_cmd('/usr/local/bin/k3s-agent-uninstall.sh >/dev/null 2>&1')
            return True
        except Exception:
            pass
        return False

    def is_agent_running(self):
        """Return True if either the k3s server or agent service is active."""
        return (0 == os.system('sudo systemctl is-active --quiet k3s-agent.service')) or (0 == os.system('sudo systemctl is-active --quiet k3s.service'))

    def is_seed_node(self):
        """Return True if the k3s server service is active on this host."""
        return 0 == os.system('sudo systemctl is-active --quiet k3s.service')

    def is_cluster_init(self):
        """Return True if a k3s install (server or agent) is present."""
        # The uninstall scripts only exist after a successful install
        return Path("/usr/local/bin/k3s-agent-uninstall.sh").is_file() or Path("/usr/local/bin/k3s-uninstall.sh").is_file()

    def pause_agent(self):
        """Stop whichever k3s services exist; return True if any stop ran."""
        status = False
        try:
            run_cmd('sudo systemctl stop k3s >/dev/null 2>&1')
            status = True
        except Exception:
            pass
        try:
            run_cmd('sudo systemctl stop k3s-agent >/dev/null 2>&1')
            status = True
        except Exception:
            pass
        return status

    def restart_agent(self):
        """Start the k3s services; report whether the agent came up."""
        try:
            run_cmd('sudo systemctl start k3s >/dev/null 2>&1')
        except Exception:
            pass
        try:
            run_cmd('sudo systemctl start k3s-agent >/dev/null 2>&1')
        except Exception:
            pass
        return self.is_agent_running()

    def get_cluster_token(self):
        """Return the k3s join token (seed node only), else None."""
        if self.is_seed_node():
            return run_cmd("sudo cat /var/lib/rancher/k3s/server/node-token").decode()
        return None

    def diagnostics(self) -> str:
        """Return pod and node listings via k3s kubectl, or None."""
        if self.is_seed_node():
            return run_cmd(f"k3s kubectl get pods -A -o wide --kubeconfig {self.kubeconfig_file}").decode() + "\n\n" + run_cmd(f"k3s kubectl get nodes --kubeconfig {self.kubeconfig_file}").decode()
        return None

    def validate_cluster(self) -> bool:
        """Validate local pool state; raise ValueError on inconsistency."""
        if not self.is_cluster_init():
            raise ValueError("Pool not initialised")
        if not self.is_agent_running():
            raise ValueError("Pool initialised but agent is not running")
        # Seed nodes additionally need a well-formed cached pool config
        if self.is_seed_node():
            if not validate_poolconfig(self.poolconfig_file):
                raise ValueError("Cache missconfigured. Run 'kalavai pool stop' to clear.")
        return True