krkn-lib 5.1.4__py3-none-any.whl → 5.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,304 @@
+import re
+from concurrent.futures import Future
+from concurrent.futures.thread import ThreadPoolExecutor
+from functools import partial
+
+from kubernetes import watch
+from kubernetes.client import V1Pod, CoreV1Api
+
+from krkn_lib.models.pod_monitor.models import (
+    PodsSnapshot,
+    MonitoredPod,
+    PodEvent,
+    PodStatus,
+)
+
+def _select_pods(
+    select_partial: partial,
+    namespace_pattern: str = None,
+    name_pattern: str = None,
+):
+    initial_pods = select_partial()
+    snapshot = PodsSnapshot()
+    snapshot.resource_version = initial_pods.metadata.resource_version
+
+    for pod in initial_pods.items:
+        match_name = True
+        match_namespace = True
+        if namespace_pattern:
+            match = re.match(namespace_pattern, pod.metadata.namespace)
+            match_namespace = match is not None
+        if name_pattern:
+            match = re.match(name_pattern, pod.metadata.name)
+            match_name = match is not None
+        if match_name and match_namespace:
+            mon_pod = MonitoredPod()
+            snapshot.initial_pods.append(pod.metadata.name)
+            mon_pod.name = pod.metadata.name
+            mon_pod.namespace = pod.metadata.namespace
+            snapshot.pods[mon_pod.name] = mon_pod
+    return snapshot
+
+
+def _monitor_pods(
+    monitor_partial: partial,
+    snapshot: PodsSnapshot,
+    max_timeout: int,
+    name_pattern: str = None,
+    namespace_pattern: str = None,
+) -> PodsSnapshot:
+    w = watch.Watch(return_type=V1Pod)
+    deleted_parent_pods = []
+    restored_pods = []
+    cluster_restored = False
+    for event in w.stream(monitor_partial, timeout_seconds=max_timeout):
+        match_name = True
+        match_namespace = True
+        event_type = event["type"]
+        pod = event["object"]
+
+        if namespace_pattern:
+            match = re.match(namespace_pattern, pod.metadata.namespace)
+            match_namespace = match is not None
+        if name_pattern:
+            match = re.match(name_pattern, pod.metadata.name)
+            match_name = match is not None
+
+        if match_name and match_namespace:
+            pod_event = PodEvent()
+            if event_type == "MODIFIED":
+                if pod.metadata.deletion_timestamp is not None:
+                    pod_event.status = PodStatus.DELETION_SCHEDULED
+                    deleted_parent_pods.append(pod.metadata.name)
+                elif _is_pod_ready(pod):
+                    pod_event.status = PodStatus.READY
+                    # if there are at least as many ready pods
+                    # as in the snapshot.initial_pods set, we assume that
+                    # the cluster is restored to the initial condition
+                    restored_pods.append(pod.metadata.name)
+                    if len(restored_pods) >= len(snapshot.initial_pods):
+                        cluster_restored = True
+                else:
+                    pod_event.status = PodStatus.NOT_READY
+
+            elif event_type == "DELETED":
+                pod_event.status = PodStatus.DELETED
+            elif event_type == "ADDED":
+                pod_event.status = PodStatus.ADDED
+
+            if pod_event.status == PodStatus.ADDED:
+                snapshot.added_pods.append(pod.metadata.name)
+                # in case a pod is respawned with the same name
+                # the dictionary must not be reinitialized
+                if pod.metadata.name not in snapshot.pods:
+                    snapshot.pods[pod.metadata.name] = MonitoredPod()
+                snapshot.pods[pod.metadata.name].name = pod.metadata.name
+                snapshot.pods[pod.metadata.name].namespace = (
+                    pod.metadata.namespace
+                )
+            # skips events outside the snapshot
+            if pod.metadata.name in snapshot.pods:
+                snapshot.pods[pod.metadata.name].status_changes.append(
+                    pod_event
+                )
+        # this flag is set when all the pods that
+        # have been deleted or were not ready
+        # have been restored; if True, the
+        # monitoring is stopped early
+        if cluster_restored:
+            w.stop()
+
+    return snapshot
+
+
+def _is_pod_ready(pod: V1Pod) -> bool:
+    if not pod.status.container_statuses:
+        return False
+    for status in pod.status.container_statuses:
+        if not status.ready:
+            return False
+    return True
+
+
+def _is_pod_terminating(pod: V1Pod) -> bool:
+    if pod.metadata.deletion_timestamp is not None:
+        return True
+    return False
+
+
+def select_and_monitor_by_label(
+    label_selector: str,
+    max_timeout: int,
+    v1_client: CoreV1Api,
+) -> Future:
+    """
+    Monitors all the pods identified
+    by a label selector and collects info about the
+    pods' recovery after a kill scenario while the scenario is running.
+
+    :param label_selector: the label selector used
+    to filter the pods to monitor (must be the
+    same used in `select_pods_by_label`)
+    :param max_timeout: the expected time the pods should take
+    to recover. If the killed pods are replaced within this time frame
+    but they did not reach the Ready state, they will be marked as
+    unrecovered. If the pods are not replaced at all within the
+    time frame, the error field of the PodsStatus structure will be
+    populated with an exception.
+    :param v1_client: kubernetes V1Api client
+    :return:
+    a future whose result (PodsSnapshot) must be
+    gathered to obtain the pod info.
+
+    """
+    select_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        label_selector=label_selector,
+        field_selector="status.phase=Running",
+    )
+    snapshot = _select_pods(select_partial)
+    monitor_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        resource_version=snapshot.resource_version,
+        label_selector=label_selector,
+    )
+    pool = ThreadPoolExecutor(max_workers=1)
+    future = pool.submit(
+        _monitor_pods,
+        monitor_partial,
+        snapshot,
+        max_timeout,
+        name_pattern=None,
+        namespace_pattern=None,
+    )
+    return future
+
+
+def select_and_monitor_by_name_pattern_and_namespace_pattern(
+    pod_name_pattern: str,
+    namespace_pattern: str,
+    max_timeout: int,
+    v1_client: CoreV1Api,
+):
+    """
+    Monitors all the pods identified by a pod name regex pattern
+    and a namespace regex pattern and collects info about the
+    pods' recovery after a kill scenario while the scenario is running.
+    Raises an exception if the regex format is not correct.
+
+    :param pod_name_pattern: a regex representing the
+    pod name pattern used to filter the pods to be monitored
+    (must be the same used in
+    `select_pods_by_name_pattern_and_namespace_pattern`)
+    :param namespace_pattern: a regex representing the namespace
+    pattern used to filter the pods to be monitored
+    (must be the same used in
+    `select_pods_by_name_pattern_and_namespace_pattern`)
+    :param max_timeout: the expected time the pods should take to
+    recover. If the killed pods are replaced within this time frame
+    but they did not reach the Ready state, they will be marked as
+    unrecovered. If the pods are not replaced at all within the
+    time frame, the error field of the PodsStatus structure will be
+    populated with an exception.
+    :param v1_client: kubernetes V1Api client
+    :return:
+    a future whose result (PodsSnapshot) must be
+    gathered to obtain the pod info.
+
+    """
+    try:
+        re.compile(pod_name_pattern)
+    except re.error as e:
+        raise Exception(f"invalid pod name pattern regex: {e}")
+
+    try:
+        re.compile(namespace_pattern)
+    except re.error as e:
+        raise Exception(f"invalid pod namespace regex: {e}")
+
+    select_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        field_selector="status.phase=Running",
+    )
+    snapshot = _select_pods(
+        select_partial,
+        name_pattern=pod_name_pattern,
+        namespace_pattern=namespace_pattern,
+    )
+    monitor_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        resource_version=snapshot.resource_version,
+    )
+    pool = ThreadPoolExecutor(max_workers=1)
+    future = pool.submit(
+        _monitor_pods,
+        monitor_partial,
+        snapshot,
+        max_timeout,
+        name_pattern=pod_name_pattern,
+        namespace_pattern=namespace_pattern,
+    )
+    return future
+
+
+def select_and_monitor_by_namespace_pattern_and_label(
+    namespace_pattern: str,
+    label_selector: str,
+    v1_client: CoreV1Api,
+    max_timeout=30,
+):
+    """
+    Monitors all the pods identified
+    by a namespace regex pattern
+    and a pod label selector and collects info about the
+    pods' recovery after a kill scenario while the scenario is running.
+    Raises an exception if the regex format is not correct.
+
+    :param label_selector: the label selector used to filter
+    the pods to monitor (must be the same used in
+    `select_pods_by_label`)
+    :param v1_client: kubernetes V1Api client
+    :param namespace_pattern: a regex representing the namespace
+    pattern used to filter the pods to be monitored (must be
+    the same used
+    in `select_pods_by_name_pattern_and_namespace_pattern`)
+    :param max_timeout: the expected time the pods should take to recover.
+    If the killed pods are replaced within this time frame but they
+    did not reach the Ready state, they will be marked as unrecovered.
+    If the pods are not replaced at all within the time frame,
+    the error field of the PodsStatus structure will be
+    populated with an exception.
+    :return:
+    a future whose result (PodsSnapshot) must be
+    gathered to obtain the pod info.
+
+    """
+    try:
+        re.compile(namespace_pattern)
+    except re.error as e:
+        raise Exception(f"invalid pod namespace regex: {e}")
+
+    select_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        label_selector=label_selector,
+        field_selector="status.phase=Running",
+    )
+    snapshot = _select_pods(
+        select_partial,
+        namespace_pattern=namespace_pattern,
+    )
+    monitor_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        resource_version=snapshot.resource_version,
+        label_selector=label_selector,
+    )
+    pool = ThreadPoolExecutor(max_workers=1)
+    future = pool.submit(
+        _monitor_pods,
+        monitor_partial,
+        snapshot,
+        max_timeout,
+        name_pattern=None,
+        namespace_pattern=namespace_pattern,
+    )
+    return future
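
The three `select_and_monitor_*` helpers above follow the same pattern: take an initial snapshot of the matching pods, then watch pod events from a single-worker `ThreadPoolExecutor` and return the `Future` immediately, so the chaos scenario can run while the monitor collects events in the background. A minimal usage sketch is shown below; the kubeconfig loading, the `app=nginx` selector, the timeouts, and the import path are illustrative assumptions, since the diff does not show where the new module lives in the package tree.

from kubernetes import client, config

# module path assumed for illustration; check the package layout for the
# actual location of the new pod_monitor module
from krkn_lib.pod_monitor import select_and_monitor_by_label

config.load_kube_config()  # assumes a local kubeconfig
v1 = client.CoreV1Api()

# start monitoring before the kill scenario; the watch runs in a worker thread
future = select_and_monitor_by_label(
    label_selector="app=nginx",  # example selector
    max_timeout=120,
    v1_client=v1,
)

# ... run the pod kill scenario here ...

# gather the snapshot once the scenario is done and derive recovery info
snapshot = future.result(timeout=180)
pods_status = snapshot.get_pods_status()
print([p.pod_name for p in pods_status.recovered])
print([p.pod_name for p in pods_status.unrecovered])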
@@ -114,6 +114,7 @@ class ElasticHealthChecks(InnerDoc):
     end_timestamp = Date()
     duration = Float()
 
+
 class ElasticVirtChecks(InnerDoc):
     vm_name = Text()
     ip_address = Text()
@@ -124,6 +125,7 @@ class ElasticVirtChecks(InnerDoc):
     end_timestamp = Date()
     duration = Float()
 
+
 class ElasticChaosRunTelemetry(Document):
     scenarios = Nested(ElasticScenarioTelemetry, multi=True)
     node_summary_infos = Nested(ElasticNodeInfo, multi=True)
@@ -141,6 +143,7 @@ class ElasticChaosRunTelemetry(Document):
     run_uuid = Text(fields={"keyword": Keyword()})
     health_checks = Nested(ElasticHealthChecks, multi=True)
     virt_checks = Nested(ElasticVirtChecks, multi=True)
+
     class Index:
         name = "chaos_run_telemetry"
 
@@ -215,7 +218,7 @@ class ElasticChaosRunTelemetry(Document):
             chaos_run_telemetry.kubernetes_objects_count
         )
         self.network_plugins = chaos_run_telemetry.network_plugins
-
+
         if chaos_run_telemetry.health_checks:
             self.health_checks = [
                 ElasticHealthChecks(
@@ -234,7 +237,7 @@ class ElasticChaosRunTelemetry(Document):
             ]
         else:
             self.health_checks = None
-
+
         if chaos_run_telemetry.virt_checks:
             self.virt_checks = [
                 ElasticVirtChecks(
@@ -1,6 +1,5 @@
-from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 
 @dataclass(frozen=True, order=False)
@@ -186,7 +185,6 @@ class PodsStatus:
 
     recovered: list[AffectedPod]
     unrecovered: list[AffectedPod]
-    error: Optional[str]
 
     def __init__(self, json_object: str = None):
         self.recovered = []
@@ -220,28 +218,6 @@ class PodsStatus:
                 self.unrecovered.append(unrecovered)
 
 
-class PodsMonitorThread:
-    executor: ThreadPoolExecutor
-    future: Future
-
-    def __init__(self, executor: ThreadPoolExecutor, future: Future):
-        self.future = future
-        self.executor = executor
-
-    def join(self, timeout: int = 120) -> PodsStatus:
-        try:
-            result = self.future.result(timeout=timeout)
-            self.executor.shutdown(wait=False, cancel_futures=True)
-            return result
-        except Exception as e:
-            pods_status = PodsStatus()
-            pods_status.error = Exception(
-                f"Thread pool did not shutdown correctly,"
-                f"aborting.\nException: {e}"
-            )
-            return pods_status
-
-
 class AffectedNode:
     """
     A node affected by a chaos scenario
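
With `PodsMonitorThread` and the `PodsStatus.error` field removed, callers no longer get a `join()` wrapper around the executor; with the new `pod_monitor` helpers they presumably gather the returned `Future` themselves and decide how to handle timeouts. A hedged sketch of that calling pattern (the timeout value and the error-handling strategy are assumptions, not something this diff prescribes):

from concurrent.futures import TimeoutError as FutureTimeoutError

# `future` is the Future returned by one of the select_and_monitor_* helpers
try:
    snapshot = future.result(timeout=120)
    pods_status = snapshot.get_pods_status()
except FutureTimeoutError:
    # PodsStatus.error no longer exists, so a timeout has to be handled
    # by the caller instead of being reported through the model
    pods_status = None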
File without changes
@@ -0,0 +1,224 @@
+import json
+import time
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional, Any
+
+from krkn_lib.models.k8s import PodsStatus, AffectedPod
+
+
+class PodStatus(Enum):
+    READY = 1
+    NOT_READY = 2
+    DELETION_SCHEDULED = 3
+    DELETED = 4
+    ADDED = 5
+
+
+@dataclass
+class PodEvent:
+    status: PodStatus
+
+    def __init__(self, timestamp: float = None):
+        if not timestamp:
+            self._timestamp = time.time()
+        else:
+            self._timestamp = timestamp
+
+    @property
+    def timestamp(self):
+        return self._timestamp
+
+    @timestamp.setter
+    def timestamp(self, value):
+        raise AttributeError("timestamp cannot be set")
+
+
+@dataclass
+class MonitoredPod:
+    namespace: str
+    name: str
+    status_changes: list[PodEvent]
+
+    def __init__(self):
+        self.namespace = ""
+        self.name = ""
+        self.status_changes = []
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "namespace": self.namespace,
+            "name": self.name,
+            "status_changes": [
+                {"status": v.status.name, "timestamp": v.timestamp}
+                for v in self.status_changes
+            ],
+        }
+
+
+@dataclass
+class PodsSnapshot:
+    resource_version: str
+    pods: dict[str, MonitoredPod]
+    added_pods: list[str]
+    initial_pods: list[str]
+    _found_rescheduled_pods: dict[str, str]
+
+    def __init__(self, json_str: str = None):
+        self.resource_version = ""
+        self.pods = {}
+        self.added_pods = []
+        self.initial_pods = []
+        self._found_rescheduled_pods = {}
+        if json_str:
+            json_obj = json.loads(json_str)
+            for _, pod in json_obj["pods"]:
+                p = MonitoredPod()
+                p.name = pod["name"]
+                p.namespace = pod["namespace"]
+                for status in pod["status_changes"]:
+                    s = PodEvent(timestamp=status["timestamp"])
+                    if status["status"] == "READY":
+                        s.status = PodStatus.READY
+                    elif status["status"] == "NOT_READY":
+                        s.status = PodStatus.NOT_READY
+                    elif status["status"] == "DELETION_SCHEDULED":
+                        s.status = PodStatus.DELETION_SCHEDULED
+                    elif status["status"] == "DELETED":
+                        s.status = PodStatus.DELETED
+                    elif status["status"] == "ADDED":
+                        s.status = PodStatus.ADDED
+                    p.status_changes.append(s)
+                self.pods[p.name] = p
+            for p in json_obj["added_pods"]:
+                self.added_pods.append(p)
+            for p in json_obj["initial_pods"]:
+                self.initial_pods.append(p)
+
+        pass
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "resource_version": self.resource_version,
+            "pods": [[k, v.to_dict()] for k, v in self.pods.items()],
+            "added_pods": self.added_pods,
+            "initial_pods": self.initial_pods,
+        }
+
+    def _find_rescheduled_pod(self, parent: str) -> Optional[MonitoredPod]:
+        for _, v in self.pods.items():
+            found_pod = next(
+                filter(
+                    lambda p: p.status == PodStatus.ADDED,
+                    v.status_changes,
+                ),
+                None,
+            )
+            if found_pod and v.name not in self._found_rescheduled_pods:
+                # only pick each rescheduled pod once,
+                # keeping the parent for future uses
+                self._found_rescheduled_pods[v.name] = parent
+                return v
+        return None
+
+    def get_pods_status(self) -> PodsStatus:
+
+        pods_status = PodsStatus()
+        for pod_name in self.initial_pods:
+            pod = self.pods[pod_name]
+            for status_change in pod.status_changes:
+                if status_change.status == PodStatus.NOT_READY:
+                    ready_status = next(
+                        filter(
+                            lambda s: s.status == PodStatus.READY,
+                            pod.status_changes,
+                        ),
+                        None,
+                    )
+                    if not ready_status:
+                        pods_status.unrecovered.append(
+                            AffectedPod(
+                                pod_name=pod.name, namespace=pod.namespace
+                            )
+                        )
+                    else:
+                        pods_status.recovered.append(
+                            AffectedPod(
+                                pod_name=pod.name,
+                                namespace=pod.namespace,
+                                pod_readiness_time=ready_status.timestamp
+                                - status_change.timestamp,
+                            )
+                        )
+                    break
+
+                # if there is a DELETION_SCHEDULED event,
+                # look for the rescheduled pod
+                # and calculate its scheduling and readiness time
+                if status_change.status == PodStatus.DELETION_SCHEDULED:
+                    rescheduled_pod = self._find_rescheduled_pod(pod_name)
+                    if not rescheduled_pod:
+                        pods_status.unrecovered.append(
+                            AffectedPod(
+                                pod_name=pod.name, namespace=pod.namespace
+                            )
+                        )
+                    else:
+                        rescheduled_start_ts = next(
+                            map(
+                                lambda e: e.timestamp,
+                                filter(
+                                    lambda s: s.status == PodStatus.ADDED,
+                                    rescheduled_pod.status_changes,
+                                ),
+                            ),
+                            None,
+                        )
+                        rescheduled_ready_ts = next(
+                            map(
+                                lambda e: e.timestamp,
+                                filter(
+                                    lambda s: s.status == PodStatus.READY,
+                                    rescheduled_pod.status_changes,
+                                ),
+                            ),
+                            None,
+                        )
+                        # the pod might be rescheduled correctly
+                        # but not become ready in the expected time;
+                        # in that case it must be marked as
+                        # `unrecovered`
+                        if not rescheduled_ready_ts:
+                            pods_status.unrecovered.append(
+                                AffectedPod(
+                                    pod_name=rescheduled_pod.name,
+                                    namespace=pod.namespace,
+                                )
+                            )
+                        else:
+                            rescheduling_time = (
+                                rescheduled_start_ts - status_change.timestamp
+                                if rescheduled_start_ts
+                                else None
+                            )
+                            readiness_time = (
+                                rescheduled_ready_ts - status_change.timestamp
+                                if rescheduled_ready_ts
+                                else None
+                            )
+                            pods_status.recovered.append(
+                                AffectedPod(
+                                    pod_name=rescheduled_pod.name,
+                                    namespace=rescheduled_pod.namespace,
+                                    pod_rescheduling_time=rescheduling_time,
+                                    pod_readiness_time=readiness_time,
+                                    total_recovery_time=(
+                                        rescheduling_time + readiness_time
+                                        if rescheduling_time and readiness_time
+                                        else None
+                                    ),
+                                )
+                            )
+                    break
+
+        return pods_status
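
A short sketch of how these models compose: a `MonitoredPod` accumulates `PodEvent` entries, `PodsSnapshot.to_dict()` produces a JSON-serializable structure, and the `json_str` constructor argument rebuilds a snapshot from that JSON. The pod name, namespace, and timestamp below are made-up values used only to illustrate the round trip.

import json

snap = PodsSnapshot()
snap.resource_version = "12345"

pod = MonitoredPod()
pod.name = "nginx-7c5d"  # hypothetical pod
pod.namespace = "default"

event = PodEvent(timestamp=1000.0)
event.status = PodStatus.DELETION_SCHEDULED
pod.status_changes.append(event)

snap.pods[pod.name] = pod
snap.initial_pods.append(pod.name)

# serialize, then rebuild an equivalent snapshot from the JSON string
restored = PodsSnapshot(json_str=json.dumps(snap.to_dict()))
rebuilt_event = restored.pods["nginx-7c5d"].status_changes[0]
assert rebuilt_event.status is PodStatus.DELETION_SCHEDULED
assert rebuilt_event.timestamp == 1000.0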
@@ -421,6 +421,7 @@ class HealthCheck:
         self.end_timestamp = json_dict["end_timestamp"]
         self.duration = json_dict["duration"]
 
+
 @dataclass(order=False)
 class VirtCheck:
     """
@@ -466,10 +467,11 @@ class VirtCheck:
         self.ip_address = json_dict["ip_address"]
         self.namespace = json_dict["namespace"]
         self.vm_name = json_dict["vm_name"]
-        self.status = json_dict.get("status",True)
-        self.start_timestamp = json_dict.get("start_timestamp","")
-        self.end_timestamp = json_dict.get("end_timestamp","")
-        self.duration = json_dict.get("duration","")
+        self.status = json_dict.get("status", True)
+        self.start_timestamp = json_dict.get("start_timestamp", "")
+        self.end_timestamp = json_dict.get("end_timestamp", "")
+        self.duration = json_dict.get("duration", "")
+
 
 @dataclass(order=False)
 class ChaosRunTelemetry: