krkn-lib 5.1.4__py3-none-any.whl → 5.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,308 @@
+ import os
+ import re
+ from concurrent.futures import Future
+ from concurrent.futures.thread import ThreadPoolExecutor
+ from functools import partial
+
+ from kubernetes import config, watch
+ from kubernetes.client import V1Pod, CoreV1Api
+
+ from krkn_lib.models.pod_monitor.models import (
+     PodsSnapshot,
+     MonitoredPod,
+     PodEvent,
+     PodStatus,
+ )
+
+ config.load_kube_config(os.path.join(os.environ["HOME"], ".kube/config"))
+
+
+ def _select_pods(
+     select_partial: partial,
+     namespace_pattern: str = None,
+     name_pattern: str = None,
+ ):
+     initial_pods = select_partial()
+     snapshot = PodsSnapshot()
+     snapshot.resource_version = initial_pods.metadata.resource_version
+
+     for pod in initial_pods.items:
+         match_name = True
+         match_namespace = True
+         if namespace_pattern:
+             match = re.match(namespace_pattern, pod.metadata.namespace)
+             match_namespace = match is not None
+         if name_pattern:
+             match = re.match(name_pattern, pod.metadata.name)
+             match_name = match is not None
+         if match_name and match_namespace:
+             mon_pod = MonitoredPod()
+             snapshot.initial_pods.append(pod.metadata.name)
+             mon_pod.name = pod.metadata.name
+             mon_pod.namespace = pod.metadata.namespace
+             snapshot.pods[mon_pod.name] = mon_pod
+     return snapshot
+
+
+ def _monitor_pods(
+     monitor_partial: partial,
+     snapshot: PodsSnapshot,
+     max_timeout: int,
+     name_pattern: str = None,
+     namespace_pattern: str = None,
+ ) -> PodsSnapshot:
+     w = watch.Watch(return_type=V1Pod)
+     deleted_parent_pods = []
+     restored_pods = []
+     cluster_restored = False
+     for event in w.stream(monitor_partial, timeout_seconds=max_timeout):
+         match_name = True
+         match_namespace = True
+         event_type = event["type"]
+         pod = event["object"]
+
+         if namespace_pattern:
+             match = re.match(namespace_pattern, pod.metadata.namespace)
+             match_namespace = match is not None
+         if name_pattern:
+             match = re.match(name_pattern, pod.metadata.name)
+             match_name = match is not None
+
+         if match_name and match_namespace:
+             pod_event = PodEvent()
+             if event_type == "MODIFIED":
+                 if pod.metadata.deletion_timestamp is not None:
+                     pod_event.status = PodStatus.DELETION_SCHEDULED
+                     deleted_parent_pods.append(pod.metadata.name)
+                 elif _is_pod_ready(pod):
+                     pod_event.status = PodStatus.READY
+                     # if there are at least as many ready pods
+                     # as in the snapshot.initial_pods set we assume that
+                     # the cluster is restored to the initial condition
+                     restored_pods.append(pod.metadata.name)
+                     if len(restored_pods) >= len(snapshot.initial_pods):
+                         cluster_restored = True
+                 else:
+                     pod_event.status = PodStatus.NOT_READY
+
+             elif event_type == "DELETED":
+                 pod_event.status = PodStatus.DELETED
+             elif event_type == "ADDED":
+                 pod_event.status = PodStatus.ADDED
+
+             if pod_event.status == PodStatus.ADDED:
+                 snapshot.added_pods.append(pod.metadata.name)
+                 # in case a pod is respawned with the same name
+                 # the dictionary must not be reinitialized
+                 if pod.metadata.name not in snapshot.pods:
+                     snapshot.pods[pod.metadata.name] = MonitoredPod()
+                 snapshot.pods[pod.metadata.name].name = pod.metadata.name
+                 snapshot.pods[pod.metadata.name].namespace = (
+                     pod.metadata.namespace
+                 )
+             # skip events outside the snapshot
+             if pod.metadata.name in snapshot.pods:
+                 snapshot.pods[pod.metadata.name].status_changes.append(
+                     pod_event
+                 )
+         # this flag is set when all the pods
+         # that have been deleted or were not ready
+         # have been restored; if True the
+         # monitoring is stopped early
+         if cluster_restored:
+             w.stop()
+
+     return snapshot
+
+
+ def _is_pod_ready(pod: V1Pod) -> bool:
+     if not pod.status.container_statuses:
+         return False
+     for status in pod.status.container_statuses:
+         if not status.ready:
+             return False
+     return True
+
+
+ def _is_pod_terminating(pod: V1Pod) -> bool:
+     if pod.metadata.deletion_timestamp is not None:
+         return True
+     return False
+
+
+ def select_and_monitor_by_label(
+     label_selector: str,
+     max_timeout: int,
+     v1_client: CoreV1Api,
+ ) -> Future:
+     """
+     Monitors all the pods identified
+     by a label selector and collects info about the
+     pods' recovery after a kill scenario while the scenario is running.
+
+     :param label_selector: the label selector used
+     to filter the pods to monitor (must be the
+     same used in `select_pods_by_label`)
+     :param max_timeout: the expected time the pods should take
+     to recover. If the killed pods are replaced in this time frame
+     but do not reach the Ready state, they will be marked as
+     unrecovered. If during the time frame the pods are not replaced
+     at all, the error field of the PodsStatus structure will be
+     populated with an exception.
+     :param v1_client: kubernetes CoreV1Api client
+     :return:
+     a future whose result (PodsSnapshot) must be
+     gathered to obtain the pod info.
+
+     """
+     select_partial = partial(
+         v1_client.list_pod_for_all_namespaces,
+         label_selector=label_selector,
+         field_selector="status.phase=Running",
+     )
+     snapshot = _select_pods(select_partial)
+     monitor_partial = partial(
+         v1_client.list_pod_for_all_namespaces,
+         resource_version=snapshot.resource_version,
+         label_selector=label_selector,
+     )
+     pool = ThreadPoolExecutor(max_workers=1)
+     future = pool.submit(
+         _monitor_pods,
+         monitor_partial,
+         snapshot,
+         max_timeout,
+         name_pattern=None,
+         namespace_pattern=None,
+     )
+     return future
+
+
+ def select_and_monitor_by_name_pattern_and_namespace_pattern(
+     pod_name_pattern: str,
+     namespace_pattern: str,
+     max_timeout: int,
+     v1_client: CoreV1Api,
+ ):
+     """
+     Monitors all the pods identified by a pod name regex pattern
+     and a namespace regex pattern and collects info about the
+     pods' recovery after a kill scenario while the scenario is running.
+     Raises an exception if the regex format is not correct.
+
+     :param pod_name_pattern: a regex representing the
+     pod name pattern used to filter the pods to be monitored
+     (must be the same used in
+     `select_pods_by_name_pattern_and_namespace_pattern`)
+     :param namespace_pattern: a regex representing the namespace
+     pattern used to filter the pods to be monitored
+     (must be the same used in
+     `select_pods_by_name_pattern_and_namespace_pattern`)
+     :param max_timeout: the expected time the pods should take to
+     recover. If the killed pods are replaced in this time frame
+     but do not reach the Ready state, they will be marked as
+     unrecovered. If during the time frame the pods are not replaced
+     at all, the error field of the PodsStatus structure will be
+     populated with an exception.
+     :param v1_client: kubernetes CoreV1Api client
+     :return:
+     a future whose result (PodsSnapshot) must be
+     gathered to obtain the pod info.
+
+     """
+     try:
+         re.compile(pod_name_pattern)
+     except re.error as e:
+         raise Exception(f"invalid pod name pattern regex: {e}")
+
+     try:
+         re.compile(namespace_pattern)
+     except re.error as e:
+         raise Exception(f"invalid pod namespace regex: {e}")
+
+     select_partial = partial(
+         v1_client.list_pod_for_all_namespaces,
+         field_selector="status.phase=Running",
+     )
+     snapshot = _select_pods(
+         select_partial,
+         name_pattern=pod_name_pattern,
+         namespace_pattern=namespace_pattern,
+     )
+     monitor_partial = partial(
+         v1_client.list_pod_for_all_namespaces,
+         resource_version=snapshot.resource_version,
+     )
+     pool = ThreadPoolExecutor(max_workers=1)
+     future = pool.submit(
+         _monitor_pods,
+         monitor_partial,
+         snapshot,
+         max_timeout,
+         name_pattern=pod_name_pattern,
+         namespace_pattern=namespace_pattern,
+     )
+     return future
+
+
+ def select_and_monitor_by_namespace_pattern_and_label(
+     namespace_pattern: str,
+     label_selector: str,
+     v1_client: CoreV1Api,
+     max_timeout=30,
+ ):
+     """
+     Monitors all the pods identified
+     by a namespace regex pattern
+     and a pod label selector and collects info about the
+     pods' recovery after a kill scenario while the scenario is running.
+     Raises an exception if the regex format is not correct.
+
+     :param label_selector: the label selector used to filter
+     the pods to monitor (must be the same used in
+     `select_pods_by_label`)
+     :param v1_client: kubernetes CoreV1Api client
+     :param namespace_pattern: a regex representing the namespace
+     pattern used to filter the pods to be monitored (must be
+     the same used
+     in `select_pods_by_name_pattern_and_namespace_pattern`)
+     :param max_timeout: the expected time the pods should take to recover.
+     If the killed pods are replaced in this time frame but they
+     do not reach the Ready state, they will be marked as unrecovered.
+     If during the time frame the pods are not replaced
+     at all, the error field of the PodsStatus structure will be
+     populated with an exception.
+     :return:
+     a future whose result (PodsSnapshot) must be
+     gathered to obtain the pod info.
+
+     """
+     try:
+         re.compile(namespace_pattern)
+     except re.error as e:
+         raise Exception(f"invalid pod namespace regex: {e}")
+
+     select_partial = partial(
+         v1_client.list_pod_for_all_namespaces,
+         label_selector=label_selector,
+         field_selector="status.phase=Running",
+     )
+     snapshot = _select_pods(
+         select_partial,
+         namespace_pattern=namespace_pattern,
+     )
+     monitor_partial = partial(
+         v1_client.list_pod_for_all_namespaces,
+         resource_version=snapshot.resource_version,
+         label_selector=label_selector,
+     )
+     pool = ThreadPoolExecutor(max_workers=1)
+     future = pool.submit(
+         _monitor_pods,
+         monitor_partial,
+         snapshot,
+         max_timeout,
+         name_pattern=None,
+         namespace_pattern=namespace_pattern,
+     )
+     return future
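
Note: a minimal usage sketch of the monitoring API added above. The module path of this new file is not shown in the diff, so the import location is assumed; get_pods_status() is defined in the pod_monitor models file later in this diff. Hypothetical caller code, not part of the package:

    from kubernetes import client, config

    config.load_kube_config()
    v1 = client.CoreV1Api()

    # start watching pods labelled app=nginx in a background thread
    future = select_and_monitor_by_label(
        label_selector="app=nginx",
        max_timeout=120,
        v1_client=v1,
    )

    # ... run the pod kill scenario here ...

    # gather the result and reduce it to recovered/unrecovered pods
    snapshot = future.result(timeout=150)
    pods_status = snapshot.get_pods_status()
    print(len(pods_status.recovered), len(pods_status.unrecovered))
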
@@ -114,6 +114,7 @@ class ElasticHealthChecks(InnerDoc):
      end_timestamp = Date()
      duration = Float()

+
  class ElasticVirtChecks(InnerDoc):
      vm_name = Text()
      ip_address = Text()
@@ -124,6 +125,7 @@ class ElasticVirtChecks(InnerDoc):
      end_timestamp = Date()
      duration = Float()

+
  class ElasticChaosRunTelemetry(Document):
      scenarios = Nested(ElasticScenarioTelemetry, multi=True)
      node_summary_infos = Nested(ElasticNodeInfo, multi=True)
@@ -141,6 +143,7 @@ class ElasticChaosRunTelemetry(Document):
      run_uuid = Text(fields={"keyword": Keyword()})
      health_checks = Nested(ElasticHealthChecks, multi=True)
      virt_checks = Nested(ElasticVirtChecks, multi=True)
+
      class Index:
          name = "chaos_run_telemetry"

@@ -215,7 +218,7 @@ class ElasticChaosRunTelemetry(Document):
              chaos_run_telemetry.kubernetes_objects_count
          )
          self.network_plugins = chaos_run_telemetry.network_plugins
-
+
          if chaos_run_telemetry.health_checks:
              self.health_checks = [
                  ElasticHealthChecks(
@@ -234,7 +237,7 @@ class ElasticChaosRunTelemetry(Document):
              ]
          else:
              self.health_checks = None
-
+
          if chaos_run_telemetry.virt_checks:
              self.virt_checks = [
                  ElasticVirtChecks(
@@ -1,6 +1,5 @@
- from concurrent.futures import Future, ThreadPoolExecutor
  from dataclasses import dataclass
- from typing import Any, Optional
+ from typing import Any


  @dataclass(frozen=True, order=False)
@@ -186,7 +185,6 @@ class PodsStatus:

      recovered: list[AffectedPod]
      unrecovered: list[AffectedPod]
-     error: Optional[str]

      def __init__(self, json_object: str = None):
          self.recovered = []
@@ -220,28 +218,6 @@ class PodsStatus:
          self.unrecovered.append(unrecovered)


- class PodsMonitorThread:
-     executor: ThreadPoolExecutor
-     future: Future
-
-     def __init__(self, executor: ThreadPoolExecutor, future: Future):
-         self.future = future
-         self.executor = executor
-
-     def join(self, timeout: int = 120) -> PodsStatus:
-         try:
-             result = self.future.result(timeout=timeout)
-             self.executor.shutdown(wait=False, cancel_futures=True)
-             return result
-         except Exception as e:
-             pods_status = PodsStatus()
-             pods_status.error = Exception(
-                 f"Thread pool did not shutdown correctly,"
-                 f"aborting.\nException: {e}"
-             )
-             return pods_status
-
-
  class AffectedNode:
      """
      A node affected by a chaos scenario
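
With PodsMonitorThread removed, callers now hold the plain concurrent.futures.Future returned by the new select_and_monitor_* functions instead of calling join(); a rough before/after sketch (hypothetical caller code, not from the package):

    # before (5.1.4): pods_status = pods_monitor_thread.join(timeout=120)
    # after (5.1.5):
    try:
        snapshot = future.result(timeout=120)
        pods_status = snapshot.get_pods_status()
    except Exception:
        # timeout/error handling is now up to the caller; PodsStatus no
        # longer carries the `error` field removed in this release
        raise
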
File without changes
@@ -0,0 +1,224 @@
+ import json
+ import time
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Optional, Any
+
+ from krkn_lib.models.k8s import PodsStatus, AffectedPod
+
+
+ class PodStatus(Enum):
+     READY = 1
+     NOT_READY = 2
+     DELETION_SCHEDULED = 3
+     DELETED = 4
+     ADDED = 5
+
+
+ @dataclass
+ class PodEvent:
+     status: PodStatus
+
+     def __init__(self, timestamp: float = None):
+         if not timestamp:
+             self._timestamp = time.time()
+         else:
+             self._timestamp = timestamp
+
+     @property
+     def timestamp(self):
+         return self._timestamp
+
+     @timestamp.setter
+     def timestamp(self, value):
+         raise AttributeError("timestamp cannot be set")
+
+
+ @dataclass
+ class MonitoredPod:
+     namespace: str
+     name: str
+     status_changes: list[PodEvent]
+
+     def __init__(self):
+         self.namespace = ""
+         self.name = ""
+         self.status_changes = []
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "namespace": self.namespace,
+             "name": self.name,
+             "status_changes": [
+                 {"status": v.status.name, "timestamp": v.timestamp}
+                 for v in self.status_changes
+             ],
+         }
+
+
+ @dataclass
+ class PodsSnapshot:
+     resource_version: str
+     pods: dict[str, MonitoredPod]
+     added_pods: list[str]
+     initial_pods: list[str]
+     _found_rescheduled_pods: dict[str, str]
+
+     def __init__(self, json_str: str = None):
+         self.resource_version = ""
+         self.pods = {}
+         self.added_pods = []
+         self.initial_pods = []
+         self._found_rescheduled_pods = {}
+         if json_str:
+             json_obj = json.loads(json_str)
+             for _, pod in json_obj["pods"]:
+                 p = MonitoredPod()
+                 p.name = pod["name"]
+                 p.namespace = pod["namespace"]
+                 for status in pod["status_changes"]:
+                     s = PodEvent(timestamp=status["timestamp"])
+                     if status["status"] == "READY":
+                         s.status = PodStatus.READY
+                     elif status["status"] == "NOT_READY":
+                         s.status = PodStatus.NOT_READY
+                     elif status["status"] == "DELETION_SCHEDULED":
+                         s.status = PodStatus.DELETION_SCHEDULED
+                     elif status["status"] == "DELETED":
+                         s.status = PodStatus.DELETED
+                     elif status["status"] == "ADDED":
+                         s.status = PodStatus.ADDED
+                     p.status_changes.append(s)
+                 self.pods[p.name] = p
+             for p in json_obj["added_pods"]:
+                 self.added_pods.append(p)
+             for p in json_obj["initial_pods"]:
+                 self.initial_pods.append(p)
+
+         pass
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "resource_version": self.resource_version,
+             "pods": [[k, v.to_dict()] for k, v in self.pods.items()],
+             "added_pods": self.added_pods,
+             "initial_pods": self.initial_pods,
+         }
+
+     def _find_rescheduled_pod(self, parent: str) -> Optional[MonitoredPod]:
+         for _, v in self.pods.items():
+             found_pod = next(
+                 filter(
+                     lambda p: p.status == PodStatus.ADDED,
+                     v.status_changes,
+                 ),
+                 None,
+             )
+             if found_pod and v.name not in self._found_rescheduled_pods:
+                 # just pick rescheduled pods once,
+                 # keeping the parent for future uses
+                 self._found_rescheduled_pods[v.name] = parent
+                 return v
+         return None
+
+     def get_pods_status(self) -> PodsStatus:
+
+         pods_status = PodsStatus()
+         for pod_name in self.initial_pods:
+             pod = self.pods[pod_name]
+             for status_change in pod.status_changes:
+                 if status_change.status == PodStatus.NOT_READY:
+                     ready_status = next(
+                         filter(
+                             lambda s: s.status == PodStatus.READY,
+                             pod.status_changes,
+                         ),
+                         None,
+                     )
+                     if not ready_status:
+                         pods_status.unrecovered.append(
+                             AffectedPod(
+                                 pod_name=pod.name, namespace=pod.namespace
+                             )
+                         )
+                     else:
+                         pods_status.recovered.append(
+                             AffectedPod(
+                                 pod_name=pod.name,
+                                 namespace=pod.namespace,
+                                 pod_readiness_time=ready_status.timestamp
+                                 - status_change.timestamp,
+                             )
+                         )
+                     break
+
+                 # if there's a DELETION_SCHEDULED event,
+                 # look for the rescheduled pod
+                 # and calculate its scheduling and readiness time
+                 if status_change.status == PodStatus.DELETION_SCHEDULED:
+                     rescheduled_pod = self._find_rescheduled_pod(pod_name)
+                     if not rescheduled_pod:
+                         pods_status.unrecovered.append(
+                             AffectedPod(
+                                 pod_name=pod.name, namespace=pod.namespace
+                             )
+                         )
+                     else:
+                         rescheduled_start_ts = next(
+                             map(
+                                 lambda e: e.timestamp,
+                                 filter(
+                                     lambda s: s.status == PodStatus.ADDED,
+                                     rescheduled_pod.status_changes,
+                                 ),
+                             ),
+                             None,
+                         )
+                         rescheduled_ready_ts = next(
+                             map(
+                                 lambda e: e.timestamp,
+                                 filter(
+                                     lambda s: s.status == PodStatus.READY,
+                                     rescheduled_pod.status_changes,
+                                 ),
+                             ),
+                             None,
+                         )
+                         # the pod might be rescheduled correctly
+                         # but not become ready in the expected time,
+                         # so it must be marked as `unrecovered` in that
+                         # case
+                         if not rescheduled_ready_ts:
+                             pods_status.unrecovered.append(
+                                 AffectedPod(
+                                     pod_name=rescheduled_pod.name,
+                                     namespace=pod.namespace,
+                                 )
+                             )
+                         else:
+                             rescheduling_time = (
+                                 rescheduled_start_ts - status_change.timestamp
+                                 if rescheduled_start_ts
+                                 else None
+                             )
+                             readiness_time = (
+                                 rescheduled_ready_ts - status_change.timestamp
+                                 if rescheduled_ready_ts
+                                 else None
+                             )
+                             pods_status.recovered.append(
+                                 AffectedPod(
+                                     pod_name=rescheduled_pod.name,
+                                     namespace=rescheduled_pod.namespace,
+                                     pod_rescheduling_time=rescheduling_time,
+                                     pod_readiness_time=readiness_time,
+                                     total_recovery_time=(
+                                         rescheduling_time + readiness_time
+                                         if rescheduling_time and readiness_time
+                                         else None
+                                     ),
+                                 )
+                             )
+                     break
+
+         return pods_status
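
A short sketch of how the snapshot defined above serializes and round-trips; the json.dumps wrapping of to_dict() is an assumption about intended usage (hypothetical caller code):

    import json

    # serialize a populated snapshot ...
    payload = json.dumps(snapshot.to_dict())

    # ... and rebuild it elsewhere; pods, added_pods and initial_pods are
    # restored, while resource_version is reset to "" by __init__
    restored = PodsSnapshot(payload)

    # reduce the event history to recovered/unrecovered AffectedPod lists,
    # with rescheduling/readiness timings where a replacement pod was seen
    status = restored.get_pods_status()
    for pod in status.recovered:
        print(pod.pod_name, pod.pod_readiness_time)
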
@@ -421,6 +421,7 @@ class HealthCheck:
          self.end_timestamp = json_dict["end_timestamp"]
          self.duration = json_dict["duration"]

+
  @dataclass(order=False)
  class VirtCheck:
      """
@@ -466,10 +467,11 @@ class VirtCheck:
          self.ip_address = json_dict["ip_address"]
          self.namespace = json_dict["namespace"]
          self.vm_name = json_dict["vm_name"]
-         self.status = json_dict.get("status",True)
-         self.start_timestamp = json_dict.get("start_timestamp","")
-         self.end_timestamp = json_dict.get("end_timestamp","")
-         self.duration = json_dict.get("duration","")
+         self.status = json_dict.get("status", True)
+         self.start_timestamp = json_dict.get("start_timestamp", "")
+         self.end_timestamp = json_dict.get("end_timestamp", "")
+         self.duration = json_dict.get("duration", "")
+


  @dataclass(order=False)
  class ChaosRunTelemetry: