wnm 0.0.8-py3-none-any.whl → 0.0.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wnm might be problematic.
- wnm/__init__.py +1 -1
- wnm/__main__.py +206 -953
- wnm/actions.py +45 -0
- wnm/common.py +21 -0
- wnm/config.py +653 -1
- wnm/decision_engine.py +388 -0
- wnm/executor.py +1292 -0
- wnm/firewall/__init__.py +13 -0
- wnm/firewall/base.py +71 -0
- wnm/firewall/factory.py +95 -0
- wnm/firewall/null_firewall.py +71 -0
- wnm/firewall/ufw_manager.py +118 -0
- wnm/migration.py +42 -0
- wnm/models.py +389 -122
- wnm/process_managers/__init__.py +23 -0
- wnm/process_managers/base.py +203 -0
- wnm/process_managers/docker_manager.py +371 -0
- wnm/process_managers/factory.py +83 -0
- wnm/process_managers/launchd_manager.py +592 -0
- wnm/process_managers/setsid_manager.py +340 -0
- wnm/process_managers/systemd_manager.py +443 -0
- wnm/reports.py +286 -0
- wnm/utils.py +403 -0
- wnm-0.0.10.dist-info/METADATA +316 -0
- wnm-0.0.10.dist-info/RECORD +28 -0
- {wnm-0.0.8.dist-info → wnm-0.0.10.dist-info}/WHEEL +1 -1
- wnm-0.0.8.dist-info/METADATA +0 -93
- wnm-0.0.8.dist-info/RECORD +0 -9
- {wnm-0.0.8.dist-info → wnm-0.0.10.dist-info}/entry_points.txt +0 -0
- {wnm-0.0.8.dist-info → wnm-0.0.10.dist-info}/top_level.txt +0 -0
wnm/decision_engine.py
ADDED
@@ -0,0 +1,388 @@
"""Decision engine for planning node lifecycle actions.

This module contains the DecisionEngine class which analyzes machine metrics,
resource thresholds, and node status to determine what actions should be taken.
It replaces the monolithic choose_action() function with a more modular approach.
"""

import logging
from typing import Any, Dict, List, Optional

from packaging.version import Version

from wnm.actions import Action, ActionType
from wnm.common import DEAD, DISABLED, REMOVING, RESTARTING, RUNNING, STOPPED, UPGRADING


class DecisionEngine:
    """Analyzes system state and plans node lifecycle actions.

    The DecisionEngine separates decision-making from execution. It takes machine
    configuration and current metrics, then returns a prioritized list of actions
    to perform.
    """

    def __init__(self, machine_config: Dict[str, Any], metrics: Dict[str, Any]):
        """Initialize the decision engine.

        Args:
            machine_config: Machine configuration dictionary with thresholds
            metrics: Current system metrics and node status
        """
        self.config = machine_config
        self.metrics = metrics
        self.features = self._compute_features()

    def _compute_features(self) -> Dict[str, bool]:
        """Compute decision features from metrics and config.

        Returns:
            Dictionary of boolean features used for decision-making
        """
        features = {}

        # Resource availability checks
        features["allow_cpu"] = (
            self.metrics["used_cpu_percent"] < self.config["cpu_less_than"]
        )
        features["allow_mem"] = (
            self.metrics["used_mem_percent"] < self.config["mem_less_than"]
        )
        features["allow_hd"] = (
            self.metrics["used_hd_percent"] < self.config["hd_less_than"]
        )

        # Resource pressure checks
        features["remove_cpu"] = (
            self.metrics["used_cpu_percent"] > self.config["cpu_remove"]
        )
        features["remove_mem"] = (
            self.metrics["used_mem_percent"] > self.config["mem_remove"]
        )
        features["remove_hd"] = (
            self.metrics["used_hd_percent"] > self.config["hd_remove"]
        )

        features["allow_node_cap"] = (
            self.metrics["running_nodes"] < self.config["node_cap"]
        )

        # Network I/O checks (if configured)
        if self._is_netio_configured():
            features["allow_netio"] = (
                self.metrics["netio_read_bytes"]
                < self.config["netio_read_less_than"]
                and self.metrics["netio_write_bytes"]
                < self.config["netio_write_less_than"]
            )
            features["remove_netio"] = (
                self.metrics["netio_read_bytes"] > self.config["netio_read_remove"]
                or self.metrics["netio_write_bytes"] > self.config["netio_write_remove"]
            )
        else:
            features["allow_netio"] = True
            features["remove_netio"] = False

        # Disk I/O checks (if configured)
        if self._is_hdio_configured():
            features["allow_hdio"] = (
                self.metrics["hdio_read_bytes"] < self.config["hdio_read_less_than"]
                and self.metrics["hdio_write_bytes"]
                < self.config["hdio_write_less_than"]
            )
            features["remove_hdio"] = (
                self.metrics["hdio_read_bytes"] > self.config["hdio_read_remove"]
                or self.metrics["hdio_write_bytes"] > self.config["hdio_write_remove"]
            )
        else:
            features["allow_hdio"] = True
            features["remove_hdio"] = False

        # Load average checks
        features["load_allow"] = (
            self.metrics["load_average_1"] < self.config["desired_load_average"]
            and self.metrics["load_average_5"] < self.config["desired_load_average"]
            and self.metrics["load_average_15"] < self.config["desired_load_average"]
        )
        features["load_not_allow"] = (
            self.metrics["load_average_1"] > self.config["max_load_average_allowed"]
            or self.metrics["load_average_5"] > self.config["max_load_average_allowed"]
            or self.metrics["load_average_15"] > self.config["max_load_average_allowed"]
        )

        # Can we add a new node?
        features["add_new_node"] = (
            sum(
                [
                    self.metrics.get(m, 0)
                    for m in [
                        "upgrading_nodes",
                        "restarting_nodes",
                        "migrating_nodes",
                        "removing_nodes",
                    ]
                ]
            )
            == 0
            and features["allow_cpu"]
            and features["allow_hd"]
            and features["allow_mem"]
            and features["allow_node_cap"]
            and features["allow_hdio"]
            and features["allow_netio"]
            and features["load_allow"]
            and self.metrics["total_nodes"] < self.config["node_cap"]
        )

        # Do we need to remove nodes?
        features["remove"] = (
            features["load_not_allow"]
            or features["remove_cpu"]
            or features["remove_hd"]
            or features["remove_mem"]
            or features["remove_hdio"]
            or features["remove_netio"]
            or self.metrics["total_nodes"] > self.config["node_cap"]
        )

        # Can we upgrade nodes?
        if self.metrics["nodes_to_upgrade"] >= 1:
            # Make sure current version is equal or newer than version on first node
            if Version(self.metrics["antnode_version"]) < Version(
                self.metrics["queen_node_version"]
            ):
                logging.warning("node upgrade cancelled due to lower version")
                features["upgrade"] = False
            else:
                if features["remove"]:
                    logging.info("Can't upgrade while removing is required")
                    features["upgrade"] = False
                else:
                    features["upgrade"] = True
        else:
            features["upgrade"] = False

        return features

    def _is_netio_configured(self) -> bool:
        """Check if network I/O thresholds are configured."""
        return (
            self.config["netio_read_less_than"]
            + self.config["netio_read_remove"]
            + self.config["netio_write_less_than"]
            + self.config["netio_write_remove"]
            > 1
        )

    def _is_hdio_configured(self) -> bool:
        """Check if disk I/O thresholds are configured."""
        return (
            self.config["hdio_read_less_than"]
            + self.config["hdio_read_remove"]
            + self.config["hdio_write_less_than"]
            + self.config["hdio_write_remove"]
            > 1
        )

    def plan_actions(self) -> List[Action]:
        """Plan the actions to take based on current state.

        Returns prioritized list of actions, respecting concurrency thresholds.

        Returns:
            List of Action objects in priority order
        """
        actions = []

        # Priority 1: System reboot detection
        if int(self.metrics["system_start"]) > int(self.config["last_stopped_at"]):
            return [
                Action(
                    type=ActionType.RESURVEY_NODES,
                    priority=100,
                    reason="system rebooted",
                )
            ]

        # Priority 2: Remove dead nodes
        if self.metrics["dead_nodes"] > 0:
            actions.extend(self._plan_dead_node_removals())
            return actions  # Dead nodes take absolute priority

        # Priority 3: Update nodes with missing version numbers
        if self.metrics.get("nodes_no_version", 0) > 0:
            # This is handled by update_counters, not as an action
            # We'll include it as informational but not block other actions
            pass

        # Priority 4: Wait for in-progress operations
        if self.metrics["restarting_nodes"]:
            logging.info("Still waiting for RestartDelay")
            return [
                Action(
                    type=ActionType.SURVEY_NODES,
                    priority=0,
                    reason="waiting for restart delay",
                )
            ]

        if self.metrics["upgrading_nodes"]:
            logging.info("Still waiting for UpgradeDelay")
            return [
                Action(
                    type=ActionType.SURVEY_NODES,
                    priority=0,
                    reason="waiting for upgrade delay",
                )
            ]

        # Priority 5: Resource pressure - remove nodes
        if self.features["remove"]:
            actions.extend(self._plan_resource_removal())
            if actions:
                return actions

        # Priority 6: Upgrades (only if not removing)
        if self.features["upgrade"]:
            actions.extend(self._plan_upgrades())
            if actions:
                return actions

        # Priority 7: Add nodes (if resources allow)
        if self.features["add_new_node"]:
            actions.extend(self._plan_node_additions())
            if actions:
                return actions

        # Default: Survey nodes
        return [
            Action(type=ActionType.SURVEY_NODES, priority=0, reason="idle monitoring")
        ]

    def _plan_dead_node_removals(self) -> List[Action]:
        """Plan removal of dead nodes (highest priority).

        Returns:
            List of removal actions for all dead nodes
        """
        # Dead nodes should be removed immediately, all at once
        # The executor will need to query for dead nodes
        return [
            Action(
                type=ActionType.REMOVE_NODE,
                node_id=None,  # Executor will query for dead nodes
                priority=90,
                reason="dead node cleanup",
            )
        ]

    def _plan_resource_removal(self) -> List[Action]:
        """Plan node removals due to resource pressure.

        Returns:
            List of removal or stop actions
        """
        actions = []

        # If under HD pressure, over node cap, or upgrades need resources
        if (
            self.features["remove_hd"]
            or self.metrics["total_nodes"] > self.config["node_cap"]
            or (
                self.metrics["nodes_to_upgrade"] > 0
                and self.metrics["removing_nodes"] == 0
            )
        ):
            # Priority: Remove stopped nodes first
            if self.metrics["stopped_nodes"] > 0:
                actions.append(
                    Action(
                        type=ActionType.REMOVE_NODE,
                        node_id=None,  # Executor will query for youngest stopped
                        priority=80,
                        reason="remove stopped node (resource pressure)",
                    )
                )
            else:
                # Remove youngest running node
                actions.append(
                    Action(
                        type=ActionType.REMOVE_NODE,
                        node_id=None,  # Executor will query for youngest running
                        priority=75,
                        reason="remove running node (resource pressure)",
                    )
                )
        else:
            # Just stop a node to reduce resource usage
            if self.metrics["removing_nodes"]:
                logging.info("Still waiting for RemoveDelay")
                return []

            # Stop the youngest running node
            actions.append(
                Action(
                    type=ActionType.STOP_NODE,
                    node_id=None,  # Executor will query for youngest running
                    priority=70,
                    reason="stop node (reduce resource usage)",
                )
            )

        return actions

    def _plan_upgrades(self) -> List[Action]:
        """Plan node upgrades.

        Returns:
            List of upgrade actions (currently limited to 1)
        """
        # Upgrade oldest running node with outdated version
        return [
            Action(
                type=ActionType.UPGRADE_NODE,
                node_id=None,  # Executor will query for oldest outdated node
                priority=60,
                reason="upgrade outdated node",
            )
        ]

    def _plan_node_additions(self) -> List[Action]:
        """Plan adding new nodes or starting stopped nodes.

        Returns:
            List of start or add actions
        """
        actions = []

        # Priority: Start stopped nodes first
        if self.metrics["stopped_nodes"] > 0:
            # Check if the stopped node needs upgrading
            # The executor will handle the version check and upgrade if needed
            actions.append(
                Action(
                    type=ActionType.START_NODE,
                    node_id=None,  # Executor will query for oldest stopped
                    priority=50,
                    reason="start stopped node",
                )
            )
        elif self.metrics["total_nodes"] < self.config["node_cap"]:
            # Add a new node
            actions.append(
                Action(
                    type=ActionType.ADD_NODE,
                    priority=40,
                    reason="add new node (under capacity)",
                )
            )

        return actions

    def get_features(self) -> Dict[str, bool]:
        """Get the computed decision features.

        Returns:
            Dictionary of boolean features used for decisions
        """
        return self.features
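
For orientation, the sketch below is not part of the package diff; it only illustrates how this engine might be driven. Every config and metrics key is taken from the checks in the file above, but the threshold values, the attribute access on Action (type, priority, reason), and the wnm.decision_engine import path are illustrative assumptions.

# Illustrative only -- sample values, not shipped with wnm.
from wnm.decision_engine import DecisionEngine

machine_config = {
    "cpu_less_than": 70, "mem_less_than": 70, "hd_less_than": 80,
    "cpu_remove": 90, "mem_remove": 90, "hd_remove": 95,
    "node_cap": 20,
    # Zeroed I/O thresholds: _is_netio_configured()/_is_hdio_configured()
    # return False, so the I/O features default to permissive values.
    "netio_read_less_than": 0, "netio_read_remove": 0,
    "netio_write_less_than": 0, "netio_write_remove": 0,
    "hdio_read_less_than": 0, "hdio_read_remove": 0,
    "hdio_write_less_than": 0, "hdio_write_remove": 0,
    "desired_load_average": 4.0, "max_load_average_allowed": 8.0,
    "last_stopped_at": 1700000000,
}

metrics = {
    "used_cpu_percent": 35, "used_mem_percent": 40, "used_hd_percent": 50,
    "running_nodes": 5, "total_nodes": 5,
    "load_average_1": 1.0, "load_average_5": 1.2, "load_average_15": 1.1,
    "upgrading_nodes": 0, "restarting_nodes": 0,
    "migrating_nodes": 0, "removing_nodes": 0,
    "stopped_nodes": 0, "dead_nodes": 0,
    "nodes_to_upgrade": 0,
    "system_start": 1690000000,  # earlier than last_stopped_at: no reboot detected
}

engine = DecisionEngine(machine_config, metrics)
for action in engine.plan_actions():
    print(action.type, action.priority, action.reason)

# With these sample numbers, every "allow_*" feature is True and nothing is
# in progress, so the plan is a single ADD_NODE action with priority 40
# ("add new node (under capacity)"). Raising used_hd_percent above hd_remove
# would instead trigger the resource-removal branch.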