wnm 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wnm might be problematic. Click here for more details.

wnm/decision_engine.py ADDED
@@ -0,0 +1,388 @@
1
+ """Decision engine for planning node lifecycle actions.
2
+
3
+ This module contains the DecisionEngine class which analyzes machine metrics,
4
+ resource thresholds, and node status to determine what actions should be taken.
5
+ It replaces the monolithic choose_action() function with a more modular approach.
6
+ """
7
+
8
+ import logging
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from packaging.version import Version
12
+
13
+ from wnm.actions import Action, ActionType
14
+ from wnm.common import DEAD, DISABLED, REMOVING, RESTARTING, RUNNING, STOPPED, UPGRADING
15
+
16
+
17
class DecisionEngine:
    """Analyzes system state and plans node lifecycle actions.

    The DecisionEngine separates decision-making from execution. It takes machine
    configuration and current metrics, derives a dictionary of boolean features
    once at construction time, and then returns a prioritized list of actions
    from :meth:`plan_actions`.

    Attributes are plain instance attributes:
      * ``config``   - the machine configuration dictionary passed in
      * ``metrics``  - the metrics dictionary passed in
      * ``features`` - the feature flags computed by ``_compute_features``
    """

    # Metric keys counting nodes that are in the middle of a transition. A new
    # node is only considered when these sum to zero; keys absent from the
    # metrics dictionary count as 0.
    _IN_FLIGHT_METRICS = (
        "upgrading_nodes",
        "restarting_nodes",
        "migrating_nodes",
        "removing_nodes",
    )

    def __init__(self, machine_config: Dict[str, Any], metrics: Dict[str, Any]):
        """Initialize the decision engine and compute its features.

        Args:
            machine_config: Machine configuration dictionary with thresholds
            metrics: Current system metrics and node status

        Raises:
            KeyError: If a threshold or metric read by the feature computation
                is missing from the supplied dictionaries.
        """
        self.config = machine_config
        self.metrics = metrics
        self.features = self._compute_features()

    # ------------------------------------------------------------------
    # Feature computation
    # ------------------------------------------------------------------

    def _compute_features(self) -> Dict[str, bool]:
        """Compute decision features from metrics and config.

        The features are assembled group by group; the insertion order of the
        resulting dictionary is: resource flags, network I/O flags, disk I/O
        flags, load flags, then ``add_new_node``, ``remove`` and ``upgrade``.

        Returns:
            Dictionary of boolean features used for decision-making
        """
        features: Dict[str, bool] = {}
        features.update(self._resource_features())
        features.update(self._io_features("netio", self._is_netio_configured()))
        features.update(self._io_features("hdio", self._is_hdio_configured()))
        features.update(self._load_features())

        # The composite flags depend on the primitive flags gathered above.
        features["add_new_node"] = self._can_add_node(features)
        features["remove"] = self._must_remove(features)
        features["upgrade"] = self._can_upgrade(features)
        return features

    def _resource_features(self) -> Dict[str, bool]:
        """Return CPU/memory/disk availability and pressure flags plus node cap."""
        metrics, config = self.metrics, self.config
        return {
            # Resource availability checks
            "allow_cpu": metrics["used_cpu_percent"] < config["cpu_less_than"],
            "allow_mem": metrics["used_mem_percent"] < config["mem_less_than"],
            "allow_hd": metrics["used_hd_percent"] < config["hd_less_than"],
            # Resource pressure checks
            "remove_cpu": metrics["used_cpu_percent"] > config["cpu_remove"],
            "remove_mem": metrics["used_mem_percent"] > config["mem_remove"],
            "remove_hd": metrics["used_hd_percent"] > config["hd_remove"],
            "allow_node_cap": metrics["running_nodes"] < config["node_cap"],
        }

    def _io_features(self, prefix: str, configured: bool) -> Dict[str, bool]:
        """Return the ``allow_<prefix>`` / ``remove_<prefix>`` flags for one I/O kind.

        Args:
            prefix: Key prefix of the I/O family, ``"netio"`` or ``"hdio"``
            configured: Whether thresholds for this family are configured

        Returns:
            Two-entry dictionary. When ``configured`` is false the family never
            blocks additions and never forces removals, and no metric is read.
        """
        if not configured:
            return {f"allow_{prefix}": True, f"remove_{prefix}": False}

        metrics, config = self.metrics, self.config
        return {
            # Both directions must be under their "less than" threshold.
            f"allow_{prefix}": (
                metrics[f"{prefix}_read_bytes"] < config[f"{prefix}_read_less_than"]
                and metrics[f"{prefix}_write_bytes"]
                < config[f"{prefix}_write_less_than"]
            ),
            # Either direction over its "remove" threshold signals pressure.
            f"remove_{prefix}": (
                metrics[f"{prefix}_read_bytes"] > config[f"{prefix}_read_remove"]
                or metrics[f"{prefix}_write_bytes"]
                > config[f"{prefix}_write_remove"]
            ),
        }

    def _load_features(self) -> Dict[str, bool]:
        """Return load-average flags computed over the 1/5/15 minute averages."""
        metrics, config = self.metrics, self.config
        return {
            # All three averages must be below the desired load.
            "load_allow": (
                metrics["load_average_1"] < config["desired_load_average"]
                and metrics["load_average_5"] < config["desired_load_average"]
                and metrics["load_average_15"] < config["desired_load_average"]
            ),
            # Any single average above the maximum counts as overload.
            "load_not_allow": (
                metrics["load_average_1"] > config["max_load_average_allowed"]
                or metrics["load_average_5"] > config["max_load_average_allowed"]
                or metrics["load_average_15"] > config["max_load_average_allowed"]
            ),
        }

    def _can_add_node(self, features: Dict[str, bool]) -> bool:
        """Return whether a node may be added or started right now.

        Requires that no node is mid-transition, that every ``allow_*`` and
        ``load_allow`` flag is set, and that ``total_nodes`` is below
        ``node_cap``.
        """
        in_flight = sum(self.metrics.get(key, 0) for key in self._IN_FLIGHT_METRICS)
        return (
            in_flight == 0
            and features["allow_cpu"]
            and features["allow_hd"]
            and features["allow_mem"]
            and features["allow_node_cap"]
            and features["allow_hdio"]
            and features["allow_netio"]
            and features["load_allow"]
            and self.metrics["total_nodes"] < self.config["node_cap"]
        )

    def _must_remove(self, features: Dict[str, bool]) -> bool:
        """Return whether any pressure signal calls for removing/stopping a node."""
        return (
            features["load_not_allow"]
            or features["remove_cpu"]
            or features["remove_hd"]
            or features["remove_mem"]
            or features["remove_hdio"]
            or features["remove_netio"]
            or self.metrics["total_nodes"] > self.config["node_cap"]
        )

    def _can_upgrade(self, features: Dict[str, bool]) -> bool:
        """Return whether a node upgrade may be planned.

        Upgrades require at least one node to upgrade, an ``antnode_version``
        that is not lower than ``queen_node_version``, and no pending removal.
        The versions are only parsed when there is something to upgrade.
        """
        if not (self.metrics["nodes_to_upgrade"] >= 1):
            return False

        # Make sure current version is equal or newer than version on first node
        if Version(self.metrics["antnode_version"]) < Version(
            self.metrics["queen_node_version"]
        ):
            logging.warning("node upgrade cancelled due to lower version")
            return False

        if features["remove"]:
            logging.info("Can't upgrade while removing is required")
            return False

        return True

    def _is_io_configured(self, prefix: str) -> bool:
        """Return whether the four thresholds of an I/O family sum to more than 1.

        NOTE(review): the cut-off is ``> 1`` rather than ``> 0``; presumably
        unset thresholds default to 0 -- confirm against the config defaults.
        """
        return (
            self.config[f"{prefix}_read_less_than"]
            + self.config[f"{prefix}_read_remove"]
            + self.config[f"{prefix}_write_less_than"]
            + self.config[f"{prefix}_write_remove"]
            > 1
        )

    def _is_netio_configured(self) -> bool:
        """Check if network I/O thresholds are configured."""
        return self._is_io_configured("netio")

    def _is_hdio_configured(self) -> bool:
        """Check if disk I/O thresholds are configured."""
        return self._is_io_configured("hdio")

    # ------------------------------------------------------------------
    # Planning
    # ------------------------------------------------------------------

    def plan_actions(self) -> List[Action]:
        """Plan the actions to take based on current state.

        The checks run in strict priority order and the first one that yields
        actions wins:

        1. system reboot detected -> resurvey nodes
        2. dead nodes present -> remove them
        3. a restart or upgrade is still in progress -> survey only
        4. resource pressure -> remove or stop a node
        5. upgrade possible -> upgrade a node
        6. capacity available -> start or add a node
        7. otherwise -> idle survey

        Returns:
            List of Action objects in priority order
        """
        # Priority 1: System reboot detection
        if int(self.metrics["system_start"]) > int(self.config["last_stopped_at"]):
            return [
                Action(
                    type=ActionType.RESURVEY_NODES,
                    priority=100,
                    reason="system rebooted",
                )
            ]

        # Priority 2: Remove dead nodes (takes absolute priority)
        if self.metrics["dead_nodes"] > 0:
            return self._plan_dead_node_removals()

        # Priority 3: Nodes with missing version numbers ("nodes_no_version")
        # are handled by update_counters, not as an action, so nothing is
        # planned for them here and they do not block other actions.

        # Priority 4: Wait for in-progress operations
        if self.metrics["restarting_nodes"]:
            logging.info("Still waiting for RestartDelay")
            return [self._survey_action("waiting for restart delay")]

        if self.metrics["upgrading_nodes"]:
            logging.info("Still waiting for UpgradeDelay")
            return [self._survey_action("waiting for upgrade delay")]

        # Priority 5: Resource pressure - remove nodes. An empty plan (still
        # waiting on a previous removal) falls through to the later steps.
        if self.features["remove"]:
            planned = self._plan_resource_removal()
            if planned:
                return planned

        # Priority 6: Upgrades (only if not removing)
        if self.features["upgrade"]:
            planned = self._plan_upgrades()
            if planned:
                return planned

        # Priority 7: Add nodes (if resources allow)
        if self.features["add_new_node"]:
            planned = self._plan_node_additions()
            if planned:
                return planned

        # Default: Survey nodes
        return [self._survey_action("idle monitoring")]

    def _survey_action(self, reason: str) -> Action:
        """Build a priority-0 ``SURVEY_NODES`` action with the given reason."""
        return Action(type=ActionType.SURVEY_NODES, priority=0, reason=reason)

    def _plan_dead_node_removals(self) -> List[Action]:
        """Plan removal of dead nodes (highest priority).

        Returns:
            Single-element list with a removal action; no node id is set, the
            executor is expected to look up the dead nodes itself.
        """
        return [
            Action(
                type=ActionType.REMOVE_NODE,
                node_id=None,  # Executor will query for dead nodes
                priority=90,
                reason="dead node cleanup",
            )
        ]

    def _plan_resource_removal(self) -> List[Action]:
        """Plan node removals due to resource pressure.

        Returns:
            List with one removal or stop action, or an empty list when a stop
            would be planned but a removal is still in progress.
        """
        # Removal (rather than a mere stop) is chosen under HD pressure, when
        # over the node cap, or when upgrades are pending and nothing is being
        # removed yet.
        # NOTE(review): the rationale for the nodes_to_upgrade clause is not
        # visible in this module -- confirm with the original choose_action().
        needs_removal = (
            self.features["remove_hd"]
            or self.metrics["total_nodes"] > self.config["node_cap"]
            or (
                self.metrics["nodes_to_upgrade"] > 0
                and self.metrics["removing_nodes"] == 0
            )
        )

        if needs_removal:
            # Priority: Remove stopped nodes first
            if self.metrics["stopped_nodes"] > 0:
                return [
                    Action(
                        type=ActionType.REMOVE_NODE,
                        node_id=None,  # Executor will query for youngest stopped
                        priority=80,
                        reason="remove stopped node (resource pressure)",
                    )
                ]
            # Otherwise remove the youngest running node
            return [
                Action(
                    type=ActionType.REMOVE_NODE,
                    node_id=None,  # Executor will query for youngest running
                    priority=75,
                    reason="remove running node (resource pressure)",
                )
            ]

        # Just stop a node to reduce resource usage, unless a removal is
        # already under way.
        if self.metrics["removing_nodes"]:
            logging.info("Still waiting for RemoveDelay")
            return []

        # Stop the youngest running node
        return [
            Action(
                type=ActionType.STOP_NODE,
                node_id=None,  # Executor will query for youngest running
                priority=70,
                reason="stop node (reduce resource usage)",
            )
        ]

    def _plan_upgrades(self) -> List[Action]:
        """Plan node upgrades.

        Returns:
            List of upgrade actions (currently limited to 1)
        """
        return [
            Action(
                type=ActionType.UPGRADE_NODE,
                node_id=None,  # Executor will query for oldest outdated node
                priority=60,
                reason="upgrade outdated node",
            )
        ]

    def _plan_node_additions(self) -> List[Action]:
        """Plan adding new nodes or starting stopped nodes.

        Returns:
            List with a start action when stopped nodes exist, else an add
            action when below the node cap, else an empty list.
        """
        # Priority: Start stopped nodes first. The executor will handle the
        # version check and upgrade the stopped node if needed.
        if self.metrics["stopped_nodes"] > 0:
            return [
                Action(
                    type=ActionType.START_NODE,
                    node_id=None,  # Executor will query for oldest stopped
                    priority=50,
                    reason="start stopped node",
                )
            ]

        if self.metrics["total_nodes"] < self.config["node_cap"]:
            # Add a new node
            return [
                Action(
                    type=ActionType.ADD_NODE,
                    priority=40,
                    reason="add new node (under capacity)",
                )
            ]

        return []

    def get_features(self) -> Dict[str, bool]:
        """Get the computed decision features.

        Returns:
            The same dictionary object stored on ``self.features`` (not a copy)
        """
        return self.features