krkn-lib 5.1.4__py3-none-any.whl → 5.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krkn_lib/__init__.py +0 -0
- krkn_lib/k8s/krkn_kubernetes.py +12 -380
- krkn_lib/k8s/pod_monitor/__init__.py +12 -0
- krkn_lib/k8s/pod_monitor/pod_monitor.py +304 -0
- krkn_lib/models/elastic/models.py +5 -2
- krkn_lib/models/k8s/models.py +1 -25
- krkn_lib/models/pod_monitor/__init__.py +0 -0
- krkn_lib/models/pod_monitor/models.py +224 -0
- krkn_lib/models/telemetry/models.py +6 -4
- krkn_lib/tests/base_test.py +32 -31
- krkn_lib/tests/test_krkn_elastic_models.py +5 -4
- krkn_lib/tests/test_krkn_kubernetes_pods_monitor.py +513 -0
- krkn_lib/tests/test_krkn_kubernetes_pods_monitor_models.py +405 -0
- krkn_lib/tests/test_utils.py +12 -8
- {krkn_lib-5.1.4.dist-info → krkn_lib-5.1.6.dist-info}/METADATA +1 -2
- {krkn_lib-5.1.4.dist-info → krkn_lib-5.1.6.dist-info}/RECORD +18 -14
- krkn_lib/k8s/pods_monitor_pool.py +0 -202
- krkn_lib/tests/test_krkn_kubernetes_monitor.py +0 -367
- krkn_lib/tests/test_krkn_kubernetes_pods_monitor_pool.py +0 -128
- {krkn_lib-5.1.4.dist-info → krkn_lib-5.1.6.dist-info}/LICENSE +0 -0
- {krkn_lib-5.1.4.dist-info → krkn_lib-5.1.6.dist-info}/WHEEL +0 -0
krkn_lib/k8s/pod_monitor/pod_monitor.py
ADDED

@@ -0,0 +1,304 @@
+import re
+from concurrent.futures import Future
+from concurrent.futures.thread import ThreadPoolExecutor
+from functools import partial
+
+from kubernetes import watch
+from kubernetes.client import V1Pod, CoreV1Api
+
+from krkn_lib.models.pod_monitor.models import (
+    PodsSnapshot,
+    MonitoredPod,
+    PodEvent,
+    PodStatus,
+)
+
+def _select_pods(
+    select_partial: partial,
+    namespace_pattern: str = None,
+    name_pattern: str = None,
+):
+    initial_pods = select_partial()
+    snapshot = PodsSnapshot()
+    snapshot.resource_version = initial_pods.metadata.resource_version
+
+    for pod in initial_pods.items:
+        match_name = True
+        match_namespace = True
+        if namespace_pattern:
+            match = re.match(namespace_pattern, pod.metadata.namespace)
+            match_namespace = match is not None
+        if name_pattern:
+            match = re.match(name_pattern, pod.metadata.name)
+            match_name = match is not None
+        if match_name and match_namespace:
+            mon_pod = MonitoredPod()
+            snapshot.initial_pods.append(pod.metadata.name)
+            mon_pod.name = pod.metadata.name
+            mon_pod.namespace = pod.metadata.namespace
+            snapshot.pods[mon_pod.name] = mon_pod
+    return snapshot
+
+
+def _monitor_pods(
+    monitor_partial: partial,
+    snapshot: PodsSnapshot,
+    max_timeout: int,
+    name_pattern: str = None,
+    namespace_pattern: str = None,
+) -> PodsSnapshot:
+    w = watch.Watch(return_type=V1Pod)
+    deleted_parent_pods = []
+    restored_pods = []
+    cluster_restored = False
+    for event in w.stream(monitor_partial, timeout_seconds=max_timeout):
+        match_name = True
+        match_namespace = True
+        event_type = event["type"]
+        pod = event["object"]
+
+        if namespace_pattern:
+            match = re.match(namespace_pattern, pod.metadata.namespace)
+            match_namespace = match is not None
+        if name_pattern:
+            match = re.match(name_pattern, pod.metadata.name)
+            match_name = match is not None
+
+        if match_name and match_namespace:
+            pod_event = PodEvent()
+            if event_type == "MODIFIED":
+                if pod.metadata.deletion_timestamp is not None:
+                    pod_event.status = PodStatus.DELETION_SCHEDULED
+                    deleted_parent_pods.append(pod.metadata.name)
+                elif _is_pod_ready(pod):
+                    pod_event.status = PodStatus.READY
+                    # if there are at least the same number of ready
+                    # pods as the snapshot.initial_pods set we assume that
+                    # the cluster is restored to the initial condition
+                    restored_pods.append(pod.metadata.name)
+                    if len(restored_pods) >= len(snapshot.initial_pods):
+                        cluster_restored = True
+                else:
+                    pod_event.status = PodStatus.NOT_READY
+
+            elif event_type == "DELETED":
+                pod_event.status = PodStatus.DELETED
+            elif event_type == "ADDED":
+                pod_event.status = PodStatus.ADDED
+
+            if pod_event.status == PodStatus.ADDED:
+                snapshot.added_pods.append(pod.metadata.name)
+                # in case a pod is respawn with the same name
+                # the dictionary must not be reinitialized
+                if pod.metadata.name not in snapshot.pods:
+                    snapshot.pods[pod.metadata.name] = MonitoredPod()
+                snapshot.pods[pod.metadata.name].name = pod.metadata.name
+                snapshot.pods[pod.metadata.name].namespace = (
+                    pod.metadata.namespace
+                )
+            # skips events out of the snapshot
+            if pod.metadata.name in snapshot.pods:
+                snapshot.pods[pod.metadata.name].status_changes.append(
+                    pod_event
+                )
+            # this flag is set when all the pods
+            # that has been deleted or not ready
+            # have been restored, if True the
+            # monitoring is stopeed earlier
+            if cluster_restored:
+                w.stop()
+
+    return snapshot
+
+
+def _is_pod_ready(pod: V1Pod) -> bool:
+    if not pod.status.container_statuses:
+        return False
+    for status in pod.status.container_statuses:
+        if not status.ready:
+            return False
+    return True
+
+
+def _is_pod_terminating(pod: V1Pod) -> bool:
+    if pod.metadata.deletion_timestamp is not None:
+        return True
+    return False
+
+
+def select_and_monitor_by_label(
+    label_selector: str,
+    max_timeout: int,
+    v1_client: CoreV1Api,
+) -> Future:
+    """
+    Monitors all the pods identified
+    by a label selector and collects infos about the
+    pods recovery after a kill scenario while the scenario is running.
+
+    :param label_selector: the label selector used
+    to filter the pods to monitor (must be the
+    same used in `select_pods_by_label`)
+    :param max_timeout: the expected time the pods should take
+    to recover. If the killed pods are replaced in this time frame,
+    but they didn't reach the Ready State, they will be marked as
+    unrecovered. If during the time frame the pods are not replaced
+    at all the error field of the PodsStatus structure will be
+    valorized with an exception.
+    :param v1_client: kubernetes V1Api client
+    :return:
+    a future which result (PodsSnapshot) must be
+    gathered to obtain the pod infos.
+
+    """
+    select_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        label_selector=label_selector,
+        field_selector="status.phase=Running",
+    )
+    snapshot = _select_pods(select_partial)
+    monitor_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        resource_version=snapshot.resource_version,
+        label_selector=label_selector,
+    )
+    pool = ThreadPoolExecutor(max_workers=1)
+    future = pool.submit(
+        _monitor_pods,
+        monitor_partial,
+        snapshot,
+        max_timeout,
+        name_pattern=None,
+        namespace_pattern=None,
+    )
+    return future
+
+
+def select_and_monitor_by_name_pattern_and_namespace_pattern(
+    pod_name_pattern: str,
+    namespace_pattern: str,
+    max_timeout: int,
+    v1_client: CoreV1Api,
+):
+    """
+    Monitors all the pods identified by a pod name regex pattern
+    and a namespace regex pattern, that collects infos about the
+    pods recovery after a kill scenario while the scenario is running.
+    Raises an exception if the regex format is not correct.
+
+    :param pod_name_pattern: a regex representing the
+    pod name pattern used to filter the pods to be monitored
+    (must be the same used in
+    `select_pods_by_name_pattern_and_namespace_pattern`)
+    :param namespace_pattern: a regex representing the namespace
+    pattern used to filter the pods to be monitored
+    (must be the same used in
+    `select_pods_by_name_pattern_and_namespace_pattern`)
+    :param max_timeout: the expected time the pods should take to
+    recover. If the killed pods are replaced in this time frame,
+    but they didn't reach the Ready State, they will be marked as
+    unrecovered. If during the time frame the pods are not replaced
+    at all the error field of the PodsStatus structure will be
+    valorized with an exception.
+    :param v1_client: kubernetes V1Api client
+    :return:
+    a future which result (PodsSnapshot) must be
+    gathered to obtain the pod infos.
+
+    """
+    try:
+        re.compile(pod_name_pattern)
+    except re.error as e:
+        raise Exception(f"invalid pod name pattern regex: {e}")
+
+    try:
+        re.compile(namespace_pattern)
+    except re.error as e:
+        raise Exception(f"invalid pod namespace regex: {e}")
+
+    select_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        field_selector="status.phase=Running",
+    )
+    snapshot = _select_pods(
+        select_partial,
+        name_pattern=pod_name_pattern,
+        namespace_pattern=namespace_pattern,
+    )
+    monitor_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        resource_version=snapshot.resource_version,
+    )
+    pool = ThreadPoolExecutor(max_workers=1)
+    future = pool.submit(
+        _monitor_pods,
+        monitor_partial,
+        snapshot,
+        max_timeout,
+        name_pattern=pod_name_pattern,
+        namespace_pattern=namespace_pattern,
+    )
+    return future
+
+
+def select_and_monitor_by_namespace_pattern_and_label(
+    namespace_pattern: str,
+    label_selector: str,
+    v1_client: CoreV1Api,
+    max_timeout=30,
+):
+    """
+    Monitors all the pods identified
+    by a namespace regex pattern
+    and a pod label selector, that collects infos about the
+    pods recovery after a kill scenario while the scenario is running.
+    Raises an exception if the regex format is not correct.
+
+    :param label_selector: the label selector used to filter
+    the pods to monitor (must be the same used in
+    `select_pods_by_label`)
+    :param v1_client: kubernetes V1Api client
+    :param namespace_pattern: a regex representing the namespace
+    pattern used to filter the pods to be monitored (must be
+    the same used
+    in `select_pods_by_name_pattern_and_namespace_pattern`)
+    :param max_timeout: the expected time the pods should take to recover.
+    If the killed pods are replaced in this time frame, but they
+    didn't reach the Ready State, they will be marked as unrecovered.
+    If during the time frame the pods are not replaced
+    at all the error field of the PodsStatus structure will be
+    valorized with an exception.
+    :return:
+    a future which result (PodsSnapshot) must be
+    gathered to obtain the pod infos.
+
+    """
+    try:
+        re.compile(namespace_pattern)
+    except re.error as e:
+        raise Exception(f"invalid pod namespace regex: {e}")
+
+    select_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        label_selector=label_selector,
+        field_selector="status.phase=Running",
+    )
+    snapshot = _select_pods(
+        select_partial,
+        namespace_pattern=namespace_pattern,
+    )
+    monitor_partial = partial(
+        v1_client.list_pod_for_all_namespaces,
+        resource_version=snapshot.resource_version,
+        label_selector=label_selector,
+    )
+    pool = ThreadPoolExecutor(max_workers=1)
+    future = pool.submit(
+        _monitor_pods,
+        monitor_partial,
+        snapshot,
+        max_timeout,
+        name_pattern=None,
+        namespace_pattern=namespace_pattern,
+    )
+    return future
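
The functions above return a bare concurrent.futures.Future (run on a single-worker ThreadPoolExecutor) rather than the PodsMonitorThread wrapper removed from krkn_lib/models/k8s/models.py further down. A minimal usage sketch, not part of the diff, assuming an already-configured kubernetes client; the "app=nginx" selector and the 120-second timeout are illustrative values:

    from kubernetes import client, config

    from krkn_lib.k8s.pod_monitor.pod_monitor import select_and_monitor_by_label

    config.load_kube_config()
    v1 = client.CoreV1Api()

    # start the watch before running the kill scenario; it stops early once
    # the number of ready pods matches the initial snapshot, or at max_timeout
    future = select_and_monitor_by_label(
        label_selector="app=nginx",  # illustrative selector
        max_timeout=120,             # illustrative timeout, in seconds
        v1_client=v1,
    )

    # ... run the chaos scenario here ...

    snapshot = future.result()                # PodsSnapshot, blocks until the watch ends
    pods_status = snapshot.get_pods_status()  # recovered / unrecovered AffectedPod lists
    for pod in pods_status.recovered:
        print(pod.pod_name, pod.pod_readiness_time)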
krkn_lib/models/elastic/models.py
CHANGED

@@ -114,6 +114,7 @@ class ElasticHealthChecks(InnerDoc):
     end_timestamp = Date()
     duration = Float()

+
 class ElasticVirtChecks(InnerDoc):
     vm_name = Text()
     ip_address = Text()
@@ -124,6 +125,7 @@ class ElasticVirtChecks(InnerDoc):
     end_timestamp = Date()
     duration = Float()

+
 class ElasticChaosRunTelemetry(Document):
     scenarios = Nested(ElasticScenarioTelemetry, multi=True)
     node_summary_infos = Nested(ElasticNodeInfo, multi=True)
@@ -141,6 +143,7 @@ class ElasticChaosRunTelemetry(Document):
     run_uuid = Text(fields={"keyword": Keyword()})
     health_checks = Nested(ElasticHealthChecks, multi=True)
     virt_checks = Nested(ElasticVirtChecks, multi=True)
+
     class Index:
         name = "chaos_run_telemetry"

@@ -215,7 +218,7 @@ class ElasticChaosRunTelemetry(Document):
             chaos_run_telemetry.kubernetes_objects_count
         )
         self.network_plugins = chaos_run_telemetry.network_plugins
-
+
         if chaos_run_telemetry.health_checks:
             self.health_checks = [
                 ElasticHealthChecks(
@@ -234,7 +237,7 @@ class ElasticChaosRunTelemetry(Document):
             ]
         else:
             self.health_checks = None
-
+
         if chaos_run_telemetry.virt_checks:
             self.virt_checks = [
                 ElasticVirtChecks(
krkn_lib/models/k8s/models.py
CHANGED

@@ -1,6 +1,5 @@
-from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import Any
+from typing import Any


 @dataclass(frozen=True, order=False)
@@ -186,7 +185,6 @@ class PodsStatus:

     recovered: list[AffectedPod]
     unrecovered: list[AffectedPod]
-    error: Optional[str]

     def __init__(self, json_object: str = None):
         self.recovered = []
@@ -220,28 +218,6 @@ class PodsStatus:
         self.unrecovered.append(unrecovered)


-class PodsMonitorThread:
-    executor: ThreadPoolExecutor
-    future: Future
-
-    def __init__(self, executor: ThreadPoolExecutor, future: Future):
-        self.future = future
-        self.executor = executor
-
-    def join(self, timeout: int = 120) -> PodsStatus:
-        try:
-            result = self.future.result(timeout=timeout)
-            self.executor.shutdown(wait=False, cancel_futures=True)
-            return result
-        except Exception as e:
-            pods_status = PodsStatus()
-            pods_status.error = Exception(
-                f"Thread pool did not shutdown correctly,"
-                f"aborting.\nException: {e}"
-            )
-            return pods_status
-
-
 class AffectedNode:
     """
     A node affected by a chaos scenario
krkn_lib/models/pod_monitor/__init__.py
File without changes
krkn_lib/models/pod_monitor/models.py
ADDED

@@ -0,0 +1,224 @@
+import json
+import time
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional, Any
+
+from krkn_lib.models.k8s import PodsStatus, AffectedPod
+
+
+class PodStatus(Enum):
+    READY = 1
+    NOT_READY = 2
+    DELETION_SCHEDULED = 3
+    DELETED = 4
+    ADDED = 5
+
+
+@dataclass
+class PodEvent:
+    status: PodStatus
+
+    def __init__(self, timestamp: float = None):
+        if not timestamp:
+            self._timestamp = time.time()
+        else:
+            self._timestamp = timestamp
+
+    @property
+    def timestamp(self):
+        return self._timestamp
+
+    @timestamp.setter
+    def timestamp(self, value):
+        raise AttributeError("timestamp cannot be set")
+
+
+@dataclass
+class MonitoredPod:
+    namespace: str
+    name: str
+    status_changes: list[PodEvent]
+
+    def __init__(self):
+        self.namespace = ""
+        self.name = ""
+        self.status_changes = []
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "namespace": self.namespace,
+            "name": self.name,
+            "status_changes": [
+                {"status": v.status.name, "timestamp": v.timestamp}
+                for v in self.status_changes
+            ],
+        }
+
+
+@dataclass
+class PodsSnapshot:
+    resource_version: str
+    pods: dict[str, MonitoredPod]
+    added_pods: list[str]
+    initial_pods: list[str]
+    _found_rescheduled_pods: dict[str, str]
+
+    def __init__(self, json_str: str = None):
+        self.resource_version = ""
+        self.pods = {}
+        self.added_pods = []
+        self.initial_pods = []
+        self._found_rescheduled_pods = {}
+        if json_str:
+            json_obj = json.loads(json_str)
+            for _, pod in json_obj["pods"]:
+                p = MonitoredPod()
+                p.name = pod["name"]
+                p.namespace = pod["namespace"]
+                for status in pod["status_changes"]:
+                    s = PodEvent(timestamp=status["timestamp"])
+                    if status["status"] == "READY":
+                        s.status = PodStatus.READY
+                    elif status["status"] == "NOT_READY":
+                        s.status = PodStatus.NOT_READY
+                    elif status["status"] == "DELETION_SCHEDULED":
+                        s.status = PodStatus.DELETION_SCHEDULED
+                    elif status["status"] == "DELETED":
+                        s.status = PodStatus.DELETED
+                    elif status["status"] == "ADDED":
+                        s.status = PodStatus.ADDED
+                    p.status_changes.append(s)
+                self.pods[p.name] = p
+            for p in json_obj["added_pods"]:
+                self.added_pods.append(p)
+            for p in json_obj["initial_pods"]:
+                self.initial_pods.append(p)
+
+        pass
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "resource_version": self.resource_version,
+            "pods": [[k, v.to_dict()] for k, v in self.pods.items()],
+            "added_pods": self.added_pods,
+            "initial_pods": self.initial_pods,
+        }
+
+    def _find_rescheduled_pod(self, parent: str) -> Optional[MonitoredPod]:
+        for _, v in self.pods.items():
+            found_pod = next(
+                filter(
+                    lambda p: p.status == PodStatus.ADDED,
+                    v.status_changes,
+                ),
+                None,
+            )
+            if found_pod and v.name not in self._found_rescheduled_pods:
+                # just pick rescheduled pods once
+                # keeping the parent for future uses
+                self._found_rescheduled_pods[v.name] = parent
+                return v
+        return None
+
+    def get_pods_status(self) -> PodsStatus:
+
+        pods_status = PodsStatus()
+        for pod_name in self.initial_pods:
+            pod = self.pods[pod_name]
+            for status_change in pod.status_changes:
+                if status_change.status == PodStatus.NOT_READY:
+                    ready_status = next(
+                        filter(
+                            lambda s: s.status == PodStatus.READY,
+                            pod.status_changes,
+                        ),
+                        None,
+                    )
+                    if not ready_status:
+                        pods_status.unrecovered.append(
+                            AffectedPod(
+                                pod_name=pod.name, namespace=pod.namespace
+                            )
+                        )
+                    else:
+                        pods_status.recovered.append(
+                            AffectedPod(
+                                pod_name=pod.name,
+                                namespace=pod.namespace,
+                                pod_readiness_time=ready_status.timestamp
+                                - status_change.timestamp,
+                            )
+                        )
+                    break
+
+                # if there's a DELETION_SCHEDULED events
+                # looks for the rescheduled pod
+                # and calculates its scheduling and readiness time
+                if status_change.status == PodStatus.DELETION_SCHEDULED:
+                    rescheduled_pod = self._find_rescheduled_pod(pod_name)
+                    if not rescheduled_pod:
+                        pods_status.unrecovered.append(
+                            AffectedPod(
+                                pod_name=pod.name, namespace=pod.namespace
+                            )
+                        )
+                    else:
+                        rescheduled_start_ts = next(
+                            map(
+                                lambda e: e.timestamp,
+                                filter(
+                                    lambda s: s.status == PodStatus.ADDED,
+                                    rescheduled_pod.status_changes,
+                                ),
+                            ),
+                            None,
+                        )
+                        rescheduled_ready_ts = next(
+                            map(
+                                lambda e: e.timestamp,
+                                filter(
+                                    lambda s: s.status == PodStatus.READY,
+                                    rescheduled_pod.status_changes,
+                                ),
+                            ),
+                            None,
+                        )
+                        # the pod might be rescheduled correctly
+                        # but do not become ready in the expected time
+                        # so it must be marked as `unrecovered` in that
+                        # case
+                        if not rescheduled_ready_ts:
+                            pods_status.unrecovered.append(
+                                AffectedPod(
+                                    pod_name=rescheduled_pod.name,
+                                    namespace=pod.namespace,
+                                )
+                            )
+                        else:
+                            rescheduling_time = (
+                                rescheduled_start_ts - status_change.timestamp
+                                if rescheduled_start_ts
+                                else None
+                            )
+                            readiness_time = (
+                                rescheduled_ready_ts - status_change.timestamp
+                                if rescheduled_ready_ts
+                                else None
+                            )
+                            pods_status.recovered.append(
+                                AffectedPod(
+                                    pod_name=rescheduled_pod.name,
+                                    namespace=rescheduled_pod.namespace,
+                                    pod_rescheduling_time=rescheduling_time,
+                                    pod_readiness_time=readiness_time,
+                                    total_recovery_time=(
+                                        rescheduling_time + readiness_time
+                                        if rescheduling_time and readiness_time
+                                        else None
+                                    ),
+                                )
+                            )
+                    break
+
+        return pods_status
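
PodsSnapshot.to_dict() emits each event as a status name plus timestamp, and the constructor accepts the same structure as a JSON string, so a snapshot survives a JSON round trip. A small sketch of that round trip, not part of the diff; the pod and namespace names are made up for illustration:

    import json
    import time

    from krkn_lib.models.pod_monitor.models import (
        MonitoredPod,
        PodEvent,
        PodsSnapshot,
        PodStatus,
    )

    # build a tiny snapshot by hand (names are illustrative)
    snapshot = PodsSnapshot()
    snapshot.resource_version = "12345"
    pod = MonitoredPod()
    pod.name = "example-pod"
    pod.namespace = "example-ns"
    event = PodEvent(timestamp=time.time())
    event.status = PodStatus.READY
    pod.status_changes.append(event)
    snapshot.pods[pod.name] = pod
    snapshot.initial_pods.append(pod.name)

    # serialize with to_dict() and rebuild the snapshot from the JSON payload
    restored = PodsSnapshot(json_str=json.dumps(snapshot.to_dict()))
    assert restored.pods["example-pod"].status_changes[0].status == PodStatus.READY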
krkn_lib/models/telemetry/models.py
CHANGED

@@ -421,6 +421,7 @@ class HealthCheck:
         self.end_timestamp = json_dict["end_timestamp"]
         self.duration = json_dict["duration"]

+
 @dataclass(order=False)
 class VirtCheck:
     """
@@ -466,10 +467,11 @@ class VirtCheck:
         self.ip_address = json_dict["ip_address"]
         self.namespace = json_dict["namespace"]
         self.vm_name = json_dict["vm_name"]
-        self.status = json_dict.get("status",True)
-        self.start_timestamp = json_dict.get("start_timestamp","")
-        self.end_timestamp = json_dict.get("end_timestamp","")
-        self.duration = json_dict.get("duration","")
+        self.status = json_dict.get("status", True)
+        self.start_timestamp = json_dict.get("start_timestamp", "")
+        self.end_timestamp = json_dict.get("end_timestamp", "")
+        self.duration = json_dict.get("duration", "")
+

 @dataclass(order=False)
 class ChaosRunTelemetry: